gman 5.0.9 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,14 +1,26 @@
1
1
  class Gman
2
+ class Locality
3
+ AFFINITY_NAMESPACES = %w(state dst cog).freeze
2
4
 
3
- LOCALITY_REGEX = %r{
4
- (
5
- (state|dst|cog)
6
- |
7
- (ci|co|borough|boro|city|county|parish|town|twp|vi|vil|village)\.[a-z-]+
8
- )
9
- \.(ak|al|ar|az|ca|co|ct|dc|de|fl|ga|hi|ia|id|il|in|ks|ky|la|ma|md|me|mi|mn|mo|ms|mt|nc|nd|ne|nh|nj|nm|nv|ny|oh|ok|or|pa|ri|sc|sd|tn|tx|um|ut|va|vt|wa|wi|wv|wy)
10
- \.us
11
- }x
5
+ STATES = %w(
6
+ ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
7
+ la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
8
+ ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
9
+ ).freeze
10
+
11
+ LOCALITY_DOMAINS = %w(
12
+ ci co borough boro city county
13
+ parish town twp vi vil village
14
+ ).freeze
15
+
16
+ REGEX = /
17
+ (
18
+ (#{Regexp.union(AFFINITY_NAMESPACES)})
19
+ |
20
+ (#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
21
+ )\.(#{Regexp.union(STATES)})\.us
22
+ /x
23
+ end
12
24
 
13
25
  # Second level .us domains for states and locality
14
26
  # See http://en.wikipedia.org/wiki/.us
@@ -23,6 +35,6 @@ class Gman
23
35
  # * k12.il.us
24
36
  # * ci.foo.zx.us
25
37
  def locality?
26
- !!(domain.to_s =~ LOCALITY_REGEX)
38
+ !domain.to_s.match(Locality::REGEX).nil?
27
39
  end
28
40
  end
@@ -1,3 +1,3 @@
1
1
  class Gman
2
- VERSION = '5.0.9'
2
+ VERSION = '6.0.0'.freeze
3
3
  end
data/script/add CHANGED
@@ -7,11 +7,11 @@
7
7
  require './lib/gman/importer'
8
8
 
9
9
  if ARGV.length < 2
10
- puts "Usage: script/add [GROUP] [DOMAIN(S)]"
10
+ puts 'Usage: script/add [GROUP] [DOMAIN(S)]'
11
11
  exit 1
12
12
  end
13
13
 
14
14
  group = ARGV[0]
15
15
  domains = ARGV.drop(1)
16
16
 
17
- Gman.import({ group => domains })
17
+ Gman.import(group => domains)
@@ -4,8 +4,8 @@
4
4
  #
5
5
  # usage: script/alphabetize
6
6
 
7
- require_relative "../lib/gman"
8
- require_relative "../lib/gman/importer"
7
+ require_relative '../lib/gman'
8
+ require_relative '../lib/gman/importer'
9
9
 
10
10
  current = Gman::DomainList.current
11
11
  current.alphabetize
@@ -3,4 +3,6 @@
3
3
  set -ex
4
4
 
5
5
  bundle exec rake test
6
+ bundle exec rubocop -D -S -a
6
7
  bundle exec script/dedupe
8
+ bundle exec gem build gman.gemspec
@@ -7,7 +7,7 @@ require './lib/gman/importer'
7
7
 
8
8
  current = Gman::DomainList.current
9
9
 
10
- puts "Checking for duplicate domains in the domain list..."
10
+ puts 'Checking for duplicate domains in the domain list...'
11
11
  puts "Current list contains #{current.count} domains..."
12
12
 
13
13
  dupe = current.count - current.domains.uniq.count
@@ -16,6 +16,6 @@ exit 0 if dupe == 0
16
16
 
17
17
  dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
18
18
 
19
- puts "Duplicate domains:"
19
+ puts 'Duplicate domains:'
20
20
  puts dupes
21
21
  exit 1
@@ -3,9 +3,12 @@
3
3
  require 'ruby-prof'
4
4
  require './lib/gman'
5
5
 
6
- # Pick N random domains directly, without pre-loading the Gman list for an accurate benchmark
6
+ # Pick N random domains directly,
7
+ # without pre-loading the Gman list for an accurate benchmark
7
8
  count = (ARGV[0] || 100).to_i
8
- domains = File.readlines("./config/domains.txt").select { |l| l =~ /^[a-z0-9]/i }.sample(count)
9
+ domains = File.readlines('./config/domains.txt')
10
+ domains = domains.select { |l| l =~ /^[a-z0-9]/i }
11
+ domains = domains.sample(count)
9
12
 
10
13
  RubyProf.start
11
14
  domains.each do |domain|
@@ -1,20 +1,20 @@
1
- #! /usr/bin/env ruby
1
+ #!/usr/bin/env ruby
2
2
  # Given an array of domains, removes them from the list
3
3
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
4
 
5
- require_relative "../lib/gman"
6
- require_relative "../lib/gman/domain_list"
5
+ require_relative '../lib/gman'
6
+ require_relative '../lib/gman/domain_list'
7
7
 
8
8
  domains = ARGV
9
- domains = domains.clone.map { |d| d.gsub ",", "" }
9
+ domains = domains.clone.map { |d| d.delete ',' }
10
10
 
11
- list = File.open("./config/domains.txt").read
11
+ list = File.open('./config/domains.txt').read
12
12
  puts "Starting list: #{Gman::DomainList.current.count} domains"
13
13
 
14
14
  domains.each do |domain|
15
- list.gsub! /^#{domain}$\n/, ""
15
+ list.gsub!(/^#{domain}$\n/, '')
16
16
  end
17
17
 
18
18
  puts "Ending list: #{Gman::DomainList.current.count} domains"
19
19
 
20
- File.write "./config/domains.txt", list
20
+ File.write './config/domains.txt', list
@@ -8,57 +8,62 @@
8
8
  require './lib/gman/importer'
9
9
  require 'yaml'
10
10
 
11
- ENV["RECONCILING"] = "true"
12
- blacklist = ["usagovQUASI"]
13
- source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
11
+ ENV['RECONCILING'] = 'true'
12
+ blacklist = ['usagovQUASI']
13
+ source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
14
14
 
15
15
  data = open(source).read
16
- data = data.split("__________________________________________________________________________")
16
+ data = data.split('_' * 74)
17
17
  data = data.last.strip
18
- data = data.split(/\r?\n/).reject { |r| r.empty? }
18
+ data = data.split(/\r?\n/).reject(&:empty?)
19
19
 
20
20
  domains = {}
21
- group = ""
21
+ group = ''
22
22
  data.each do |row|
23
23
  if row =~ /^\w/
24
24
  group = row
25
25
  domains[group] = []
26
26
  else
27
- domains[group].push row.sub("\.\t", "").strip
27
+ domains[group].push row.sub("\.\t", '').strip
28
28
  end
29
29
  end
30
30
 
31
- domains.reject! { |group,domain| blacklist.include?(group) }
31
+ domains.reject! { |g, _domain| blacklist.include?(g) }
32
32
  importer = Gman::Importer.new(domains)
33
33
 
34
34
  importer.logger.info "Starting with #{importer.domains.count} domains"
35
35
 
36
- importer.domains.list.each do |group, domains|
37
- domains.map! { |domain| Gman.new(domain).to_s }
38
- domains.map! { |domain| importer.normalize_domain(domain) }
36
+ importer.domains.list.each do |_group, d|
37
+ d.map! { |domain| Gman.new(domain).to_s }
38
+ d.map! { |domain| importer.normalize_domain(domain) }
39
39
  end
40
40
 
41
- importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"
41
+ count = importer.domains.domains.count
42
+ importer.logger.info "Filtered down to #{count} normalized domains"
42
43
 
43
44
  missing = {}
44
- importer.domains.list.each do |group, usagovdomains|
45
- next unless importer.current.list[group]
46
- missing[group] = importer.current.list[group] - usagovdomains
45
+ importer.domains.list.each do |g, usagovdomains|
46
+ next unless importer.current.list[g]
47
+ missing[g] = importer.current.list[g] - usagovdomains
47
48
  end
48
49
 
49
- missing.reject! { |key, value| value.empty? }
50
+ missing.reject! { |_key, value| value.empty? }
50
51
 
51
- importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list"
52
+ count = missing.values.count
53
+ importer.logger.info "Found #{count} domains not on the USA.gov list"
52
54
  puts "Here's the list of missing domains:"
53
55
  puts YAML.dump(missing)
54
56
 
55
57
  domains = importer.domains.domains
56
- domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
58
+ domains = domains.group_by do |domain|
59
+ importer.valid_domain?(domain, skip_dupe: true)
60
+ end
57
61
  domains.delete(true)
58
62
  domains.delete(false)
59
- domains.delete("locality")
63
+ domains.delete('locality')
60
64
 
61
- importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains"
65
+ count = domains.values.flatten.count
66
+ importer.logger.info "Calling out #{count} rejected domains"
62
67
 
63
- puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
68
+ puts 'Here are the rejected domains and why they were rejected:'
64
69
  puts YAML.dump(domains)
@@ -5,10 +5,10 @@ require 'open-uri'
5
5
  require './lib/gman'
6
6
  require './lib/gman/importer'
7
7
 
8
- url = "https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv"
8
+ url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
9
9
 
10
- domains = open(url).read.encode("UTF-8")
11
- domains = CSV.parse(domains, :headers => true)
12
- domains = domains.map { |row| row["Domain Name"] }
10
+ domains = open(url).read.encode('UTF-8')
11
+ domains = CSV.parse(domains, headers: true)
12
+ domains = domains.map { |row| row['Domain Name'] }
13
13
 
14
- Gman.import("German Federal" => domains)
14
+ Gman.import('German Federal' => domains)
@@ -5,9 +5,9 @@ require 'open-uri'
5
5
  require './lib/gman'
6
6
  require './lib/gman/importer'
7
7
 
8
- url = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"
8
+ url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
9
9
 
10
- csv = open(url).read.force_encoding("iso-8859-1").encode("UTF-8")
10
+ csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
11
11
 
12
12
  # For some reason, the header row is actually the last row
13
13
  # Pop the last line off the file and prepend it at the begining
@@ -17,7 +17,7 @@ lines.unshift lines.pop
17
17
  csv = lines.join("\n")
18
18
 
19
19
  # Load municipal domains
20
- data = CSV.parse(csv, :headers => true, :col_sep => ";")
21
- domains = data.map { |row| row["Internet"] }
20
+ data = CSV.parse(csv, headers: true, col_sep: ';')
21
+ domains = data.map { |row| row['Internet'] }
22
22
 
23
- Gman.import("German Municipalities" => domains)
23
+ Gman.import('German Municipalities' => domains)
@@ -3,8 +3,16 @@
3
3
 
4
4
  require 'fileutils'
5
5
 
6
- FileUtils.rm_rf("almanak.overheid.nl")
7
- domains = `wget -q -r -nc -np https://almanak.overheid.nl/
8
- grep @ -rI almanak.overheid.nl/|cut -f 2 -d @|cut -f 1 -d '"'|grep \\.nl$|sort|uniq`
6
+ FileUtils.rm_rf('almanak.overheid.nl')
7
+ commands = [
8
+ "wget -q -r -nc -np https://almanak.overheid.nl/
9
+ grep @ -rI almanak.overheid.nl/",
10
+ 'cut -f 2 -d @',
11
+ "cut -f 1 -d '\"'",
12
+ 'grep \\.nl$',
13
+ 'sort',
14
+ 'uniq'
15
+ ]
16
+ domains = system commands.join('|')
9
17
 
10
- Gman.import("Netherlands" => domains.split("\n"))
18
+ Gman.import('Netherlands' => domains.split("\n"))
@@ -1,10 +1,10 @@
1
1
  #!/usr/bin/env ruby
2
2
  # Propagates an initial list of best-guess government domains
3
3
 
4
- require "public_suffix"
5
- require "yaml"
6
- require_relative "../lib/gman"
7
- require_relative "../lib/gman/importer"
4
+ require 'public_suffix'
5
+ require 'yaml'
6
+ require_relative '../lib/gman'
7
+ require_relative '../lib/gman/importer'
8
8
 
9
9
  # https://gist.github.com/benbalter/6147066
10
10
  REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
@@ -15,13 +15,13 @@ PublicSuffix::List.default.each do |rule|
15
15
 
16
16
  if rule.parts.length == 1
17
17
  domain = rule.parts.first if ".#{rule.value}" =~ REGEX
18
- else
19
- domain = rule.parts.pop(2).join(".") if ".#{rule.value}" =~ REGEX
18
+ elsif ".#{rule.value}" =~ REGEX
19
+ domain = rule.parts.pop(2).join('.')
20
20
  end
21
21
 
22
- domains.push domain unless domain.nil? or domains.include? domain
22
+ domains.push domain unless domain.nil? || domains.include?(domain)
23
23
  end
24
24
 
25
25
  # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
26
  # a valid TLD, not have any top-level sites, and we'd still want it listed
27
- Gman.import({"non-us gov" => domains}, :skip_resolve => true)
27
+ Gman.import({ 'non-us gov' => domains }, skip_resolve: true)
@@ -5,15 +5,17 @@ require 'csv'
5
5
  require './lib/gman'
6
6
  require './lib/gman/importer'
7
7
 
8
- url = "http://www.myndighetsregistret.scb.se/Myndighet.aspx"
8
+ url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
9
9
  agent = Mechanize.new
10
10
  page = agent.get(url)
11
11
  form = page.forms.first
12
- form.radiobuttons.find { |r| r.value = "Textfil" }.check
13
- submit_button = form.buttons.find { |b| b.type == "submit" }
12
+ form.radiobuttons.find { |r| r.value = 'Textfil' }.check
13
+ submit_button = form.buttons.find { |b| b.type == 'submit' }
14
14
  response = agent.submit(form, submit_button)
15
15
 
16
- rows = CSV.parse(response.content, :headers => true, :col_sep => "\t")
17
- domains = rows.map { |row| row["Webbadress"] unless row["Namn"] =~ /UNIVERSITET/}
16
+ rows = CSV.parse(response.content, headers: true, col_sep: "\t")
17
+ domains = rows.map do |row|
18
+ row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
19
+ end
18
20
 
19
- Gman.import("Swedish Administrative Authorities" => domains)
21
+ Gman.import('Swedish Administrative Authorities' => domains)
@@ -12,24 +12,24 @@
12
12
 
13
13
  require './lib/gman/importer'
14
14
 
15
- blacklist = ["usagovQUASI"]
16
- source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
15
+ blacklist = %w(usagovQUASI usagovFEDgov)
16
+ source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
17
17
 
18
18
  data = open(source).read
19
- data = data.split("__________________________________________________________________________")
19
+ data = data.split('_' * 74)
20
20
  data = data.last.strip
21
- data = data.split(/\r?\n/).reject { |r| r.empty? }
21
+ data = data.split(/\r?\n/).reject(&:empty?)
22
22
 
23
23
  domains = {}
24
- group = ""
24
+ group = ''
25
25
  data.each do |row|
26
26
  if row =~ /^\w/
27
27
  group = row
28
28
  domains[group] = []
29
29
  else
30
- domains[group].push row.sub("\.\t", "").strip
30
+ domains[group].push row.sub("\.\t", '').strip
31
31
  end
32
32
  end
33
33
 
34
- domains.reject! { |group,domain| blacklist.include?(group) }
34
+ domains.reject! { |g, _| blacklist.include?(g) }
35
35
  Gman.import(domains)
@@ -0,0 +1,2 @@
1
+ // test
2
+ gov
File without changes
@@ -8,7 +8,7 @@ begin
8
8
  Bundler.setup(:default, :development)
9
9
  rescue Bundler::BundlerError => e
10
10
  $stderr.puts e.message
11
- $stderr.puts "Run `bundle install` to install missing gems"
11
+ $stderr.puts 'Run `bundle install` to install missing gems'
12
12
  exit e.status_code
13
13
  end
14
14
 
@@ -16,11 +16,25 @@ require 'shoulda'
16
16
 
17
17
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
18
18
  $LOAD_PATH.unshift(File.dirname(__FILE__))
19
- require_relative "../lib/gman"
20
- require_relative "../lib/gman/domain_list"
19
+ require_relative '../lib/gman'
20
+ require_relative '../lib/gman/domain_list'
21
+ require_relative '../lib/gman/importer'
21
22
 
22
- require './lib/gman/importer'
23
+ def bin_path(cmd = 'gman')
24
+ File.expand_path "../bin/#{cmd}", File.dirname(__FILE__)
25
+ end
23
26
 
24
27
  def test_bin(*args)
25
- output, status = Open3.capture2e("bundle", "exec", "gman", *args)
28
+ Open3.capture2e('bundle', 'exec', bin_path, *args)
29
+ end
30
+
31
+ def fixture_path(fixture)
32
+ File.expand_path "./fixtures/#{fixture}", File.dirname(__FILE__)
33
+ end
34
+
35
+ def with_env(key, value)
36
+ old_env = ENV[key]
37
+ ENV[key] = value
38
+ yield
39
+ ENV[key] = old_env
26
40
  end
@@ -1,57 +1,62 @@
1
1
  require File.join(File.dirname(__FILE__), 'helper')
2
2
 
3
- VALID = [ "foo.gov",
4
- "http://foo.mil",
5
- "foo@bar.gc.ca",
6
- "foo.gov.au",
7
- "https://www.foo.gouv.fr",
8
- "foo@ci.champaign.il.us",
9
- "foo.bar.baz.gov.au",
10
- "foo@bar.gov.uk",
11
- "foo.gov",
12
- "foo.fed.us",
13
- "foo.state.il.us",
14
- "state.il.us",
15
- "foo@af.mil",
16
- "foo.gov.in"
17
- ]
3
+ VALID = ['foo.gov',
4
+ 'http://foo.mil',
5
+ 'foo@bar.gc.ca',
6
+ 'foo.gov.au',
7
+ 'https://www.foo.gouv.fr',
8
+ 'foo@ci.champaign.il.us',
9
+ 'foo.bar.baz.gov.au',
10
+ 'foo@bar.gov.uk',
11
+ 'foo.gov',
12
+ 'foo.fed.us',
13
+ 'foo.state.il.us',
14
+ 'state.il.us',
15
+ 'foo@af.mil',
16
+ 'foo.gov.in'
17
+ ].freeze
18
18
 
19
- INVALID = [ "foo.bar.com",
20
- "bar@foo.biz",
21
- "http://www.foo.biz",
22
- "foo.uk",
23
- "gov",
24
- "foo@k12.champaign.il.us",
25
- "foo@kii.gov.by",
26
- "foo",
27
- "",
28
- nil,
29
- " ",
30
- "foo.city.il.us",
31
- "foo.ci.il.us",
32
- "foo.zx.us",
33
- "foo@mail.gov.ua"
34
- ]
19
+ INVALID = ['foo.bar.com',
20
+ 'bar@foo.biz',
21
+ 'http://www.foo.biz',
22
+ 'foo.uk',
23
+ 'gov',
24
+ 'foo@k12.champaign.il.us',
25
+ 'foo@kii.gov.by',
26
+ 'foo',
27
+ '',
28
+ nil,
29
+ ' ',
30
+ 'foo.city.il.us',
31
+ 'foo.ci.il.us',
32
+ 'foo.zx.us',
33
+ 'foo@mail.gov.ua'
34
+ ].freeze
35
35
 
36
36
  class TestGman < Minitest::Test
37
-
38
37
  VALID.each do |domain|
39
38
  should "recognize #{domain} as a government domain" do
40
- assert Gman::valid?(domain)
39
+ assert Gman.valid?(domain)
41
40
  end
42
41
  end
43
42
 
44
43
  INVALID.each do |domain|
45
44
  should "recognize #{domain} as a non-government domain" do
46
- refute Gman::valid?(domain)
45
+ refute Gman.valid?(domain)
47
46
  end
48
47
  end
49
48
 
50
- should "not allow educational domains" do
51
- assert_equal false, Gman::valid?("foo@gwu.edu")
49
+ should 'not allow educational domains' do
50
+ assert_equal false, Gman.valid?('foo@gwu.edu')
51
+ end
52
+
53
+ should 'returns the path to domains.txt' do
54
+ assert_equal true, File.exist?(Gman.list_path)
52
55
  end
53
56
 
54
- should "returns the path to domains.txt" do
55
- assert_equal true, File.exists?(Gman.list_path)
57
+ should 'stub domains when asked' do
58
+ with_env 'GMAN_STUB_DOMAINS', 'true' do
59
+ assert_equal fixture_path('domains.txt'), Gman.list_path
60
+ end
56
61
  end
57
62
  end