gman 5.0.9 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,26 @@
1
1
  class Gman
2
+ class Locality
3
+ AFFINITY_NAMESPACES = %w(state dst cog).freeze
2
4
 
3
- LOCALITY_REGEX = %r{
4
- (
5
- (state|dst|cog)
6
- |
7
- (ci|co|borough|boro|city|county|parish|town|twp|vi|vil|village)\.[a-z-]+
8
- )
9
- \.(ak|al|ar|az|ca|co|ct|dc|de|fl|ga|hi|ia|id|il|in|ks|ky|la|ma|md|me|mi|mn|mo|ms|mt|nc|nd|ne|nh|nj|nm|nv|ny|oh|ok|or|pa|ri|sc|sd|tn|tx|um|ut|va|vt|wa|wi|wv|wy)
10
- \.us
11
- }x
5
+ STATES = %w(
6
+ ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
7
+ la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
8
+ ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
9
+ ).freeze
10
+
11
+ LOCALITY_DOMAINS = %w(
12
+ ci co borough boro city county
13
+ parish town twp vi vil village
14
+ ).freeze
15
+
16
+ REGEX = /
17
+ (
18
+ (#{Regexp.union(AFFINITY_NAMESPACES)})
19
+ |
20
+ (#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
21
+ )\.(#{Regexp.union(STATES)})\.us
22
+ /x
23
+ end
12
24
 
13
25
  # Second level .us domains for states and locality
14
26
  # See http://en.wikipedia.org/wiki/.us
@@ -23,6 +35,6 @@ class Gman
23
35
  # * k12.il.us
24
36
  # * ci.foo.zx.us
25
37
  def locality?
26
- !!(domain.to_s =~ LOCALITY_REGEX)
38
+ !domain.to_s.match(Locality::REGEX).nil?
27
39
  end
28
40
  end
@@ -1,3 +1,3 @@
1
1
  class Gman
2
- VERSION = '5.0.9'
2
+ VERSION = '6.0.0'.freeze
3
3
  end
data/script/add CHANGED
@@ -7,11 +7,11 @@
7
7
  require './lib/gman/importer'
8
8
 
9
9
  if ARGV.length < 2
10
- puts "Usage: script/add [GROUP] [DOMAIN(S)]"
10
+ puts 'Usage: script/add [GROUP] [DOMAIN(S)]'
11
11
  exit 1
12
12
  end
13
13
 
14
14
  group = ARGV[0]
15
15
  domains = ARGV.drop(1)
16
16
 
17
- Gman.import({ group => domains })
17
+ Gman.import(group => domains)
@@ -4,8 +4,8 @@
4
4
  #
5
5
  # usage: script/alphabetize
6
6
 
7
- require_relative "../lib/gman"
8
- require_relative "../lib/gman/importer"
7
+ require_relative '../lib/gman'
8
+ require_relative '../lib/gman/importer'
9
9
 
10
10
  current = Gman::DomainList.current
11
11
  current.alphabetize
@@ -3,4 +3,6 @@
3
3
  set -ex
4
4
 
5
5
  bundle exec rake test
6
+ bundle exec rubocop -D -S -a
6
7
  bundle exec script/dedupe
8
+ bundle exec gem build gman.gemspec
@@ -7,7 +7,7 @@ require './lib/gman/importer'
7
7
 
8
8
  current = Gman::DomainList.current
9
9
 
10
- puts "Checking for duplicate domains in the domain list..."
10
+ puts 'Checking for duplicate domains in the domain list...'
11
11
  puts "Current list contains #{current.count} domains..."
12
12
 
13
13
  dupe = current.count - current.domains.uniq.count
@@ -16,6 +16,6 @@ exit 0 if dupe == 0
16
16
 
17
17
  dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
18
18
 
19
- puts "Duplicate domains:"
19
+ puts 'Duplicate domains:'
20
20
  puts dupes
21
21
  exit 1
@@ -3,9 +3,12 @@
3
3
  require 'ruby-prof'
4
4
  require './lib/gman'
5
5
 
6
- # Pick N random domains directly, without pre-loading the Gman list for an accurate benchmark
6
+ # Pick N random domains directly,
7
+ # without pre-loading the Gman list for an accurate benchmark
7
8
  count = (ARGV[0] || 100).to_i
8
- domains = File.readlines("./config/domains.txt").select { |l| l =~ /^[a-z0-9]/i }.sample(count)
9
+ domains = File.readlines('./config/domains.txt')
10
+ domains = domains.select { |l| l =~ /^[a-z0-9]/i }
11
+ domains = domains.sample(count)
9
12
 
10
13
  RubyProf.start
11
14
  domains.each do |domain|
@@ -1,20 +1,20 @@
1
- #! /usr/bin/env ruby
1
+ #!/usr/bin/env ruby
2
2
  # Given an array of domains, removes them from the list
3
3
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
4
 
5
- require_relative "../lib/gman"
6
- require_relative "../lib/gman/domain_list"
5
+ require_relative '../lib/gman'
6
+ require_relative '../lib/gman/domain_list'
7
7
 
8
8
  domains = ARGV
9
- domains = domains.clone.map { |d| d.gsub ",", "" }
9
+ domains = domains.clone.map { |d| d.delete ',' }
10
10
 
11
- list = File.open("./config/domains.txt").read
11
+ list = File.open('./config/domains.txt').read
12
12
  puts "Starting list: #{Gman::DomainList.current.count} domains"
13
13
 
14
14
  domains.each do |domain|
15
- list.gsub! /^#{domain}$\n/, ""
15
+ list.gsub!(/^#{domain}$\n/, '')
16
16
  end
17
17
 
18
18
  puts "Ending list: #{Gman::DomainList.current.count} domains"
19
19
 
20
- File.write "./config/domains.txt", list
20
+ File.write './config/domains.txt', list
@@ -8,57 +8,62 @@
8
8
  require './lib/gman/importer'
9
9
  require 'yaml'
10
10
 
11
- ENV["RECONCILING"] = "true"
12
- blacklist = ["usagovQUASI"]
13
- source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
11
+ ENV['RECONCILING'] = 'true'
12
+ blacklist = ['usagovQUASI']
13
+ source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
14
14
 
15
15
  data = open(source).read
16
- data = data.split("__________________________________________________________________________")
16
+ data = data.split('_' * 74)
17
17
  data = data.last.strip
18
- data = data.split(/\r?\n/).reject { |r| r.empty? }
18
+ data = data.split(/\r?\n/).reject(&:empty?)
19
19
 
20
20
  domains = {}
21
- group = ""
21
+ group = ''
22
22
  data.each do |row|
23
23
  if row =~ /^\w/
24
24
  group = row
25
25
  domains[group] = []
26
26
  else
27
- domains[group].push row.sub("\.\t", "").strip
27
+ domains[group].push row.sub("\.\t", '').strip
28
28
  end
29
29
  end
30
30
 
31
- domains.reject! { |group,domain| blacklist.include?(group) }
31
+ domains.reject! { |g, _domain| blacklist.include?(g) }
32
32
  importer = Gman::Importer.new(domains)
33
33
 
34
34
  importer.logger.info "Starting with #{importer.domains.count} domains"
35
35
 
36
- importer.domains.list.each do |group, domains|
37
- domains.map! { |domain| Gman.new(domain).to_s }
38
- domains.map! { |domain| importer.normalize_domain(domain) }
36
+ importer.domains.list.each do |_group, d|
37
+ d.map! { |domain| Gman.new(domain).to_s }
38
+ d.map! { |domain| importer.normalize_domain(domain) }
39
39
  end
40
40
 
41
- importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"
41
+ count = importer.domains.domains.count
42
+ importer.logger.info "Filtered down to #{count} normalized domains"
42
43
 
43
44
  missing = {}
44
- importer.domains.list.each do |group, usagovdomains|
45
- next unless importer.current.list[group]
46
- missing[group] = importer.current.list[group] - usagovdomains
45
+ importer.domains.list.each do |g, usagovdomains|
46
+ next unless importer.current.list[g]
47
+ missing[g] = importer.current.list[g] - usagovdomains
47
48
  end
48
49
 
49
- missing.reject! { |key, value| value.empty? }
50
+ missing.reject! { |_key, value| value.empty? }
50
51
 
51
- importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list"
52
+ count = missing.values.count
53
+ importer.logger.info "Found #{count} domains not on the USA.gov list"
52
54
  puts "Here's the list of missing domains:"
53
55
  puts YAML.dump(missing)
54
56
 
55
57
  domains = importer.domains.domains
56
- domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
58
+ domains = domains.group_by do |domain|
59
+ importer.valid_domain?(domain, skip_dupe: true)
60
+ end
57
61
  domains.delete(true)
58
62
  domains.delete(false)
59
- domains.delete("locality")
63
+ domains.delete('locality')
60
64
 
61
- importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains"
65
+ count = domains.values.flatten.count
66
+ importer.logger.info "Calling out #{count} rejected domains"
62
67
 
63
- puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
68
+ puts 'Here are the rejected domains and why they were rejected:'
64
69
  puts YAML.dump(domains)
@@ -5,10 +5,10 @@ require 'open-uri'
5
5
  require './lib/gman'
6
6
  require './lib/gman/importer'
7
7
 
8
- url = "https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv"
8
+ url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
9
9
 
10
- domains = open(url).read.encode("UTF-8")
11
- domains = CSV.parse(domains, :headers => true)
12
- domains = domains.map { |row| row["Domain Name"] }
10
+ domains = open(url).read.encode('UTF-8')
11
+ domains = CSV.parse(domains, headers: true)
12
+ domains = domains.map { |row| row['Domain Name'] }
13
13
 
14
- Gman.import("German Federal" => domains)
14
+ Gman.import('German Federal' => domains)
@@ -5,9 +5,9 @@ require 'open-uri'
5
5
  require './lib/gman'
6
6
  require './lib/gman/importer'
7
7
 
8
- url = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"
8
+ url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
9
9
 
10
- csv = open(url).read.force_encoding("iso-8859-1").encode("UTF-8")
10
+ csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
11
11
 
12
12
  # For some reason, the header row is actually the last row
13
13
  # Pop the last line off the file and prepend it at the beginning
@@ -17,7 +17,7 @@ lines.unshift lines.pop
17
17
  csv = lines.join("\n")
18
18
 
19
19
  # Load municipal domains
20
- data = CSV.parse(csv, :headers => true, :col_sep => ";")
21
- domains = data.map { |row| row["Internet"] }
20
+ data = CSV.parse(csv, headers: true, col_sep: ';')
21
+ domains = data.map { |row| row['Internet'] }
22
22
 
23
- Gman.import("German Municipalities" => domains)
23
+ Gman.import('German Municipalities' => domains)
@@ -3,8 +3,16 @@
3
3
 
4
4
  require 'fileutils'
5
5
 
6
- FileUtils.rm_rf("almanak.overheid.nl")
7
- domains = `wget -q -r -nc -np https://almanak.overheid.nl/
8
- grep @ -rI almanak.overheid.nl/|cut -f 2 -d @|cut -f 1 -d '"'|grep \\.nl$|sort|uniq`
6
+ FileUtils.rm_rf('almanak.overheid.nl')
7
+ commands = [
8
+ "wget -q -r -nc -np https://almanak.overheid.nl/
9
+ grep @ -rI almanak.overheid.nl/",
10
+ 'cut -f 2 -d @',
11
+ "cut -f 1 -d '\"'",
12
+ 'grep \\.nl$',
13
+ 'sort',
14
+ 'uniq'
15
+ ]
16
+ domains = system commands.join('|')
9
17
 
10
- Gman.import("Netherlands" => domains.split("\n"))
18
+ Gman.import('Netherlands' => domains.split("\n"))
@@ -1,10 +1,10 @@
1
1
  #!/usr/bin/env ruby
2
2
  # Propagates an initial list of best-guess government domains
3
3
 
4
- require "public_suffix"
5
- require "yaml"
6
- require_relative "../lib/gman"
7
- require_relative "../lib/gman/importer"
4
+ require 'public_suffix'
5
+ require 'yaml'
6
+ require_relative '../lib/gman'
7
+ require_relative '../lib/gman/importer'
8
8
 
9
9
  # https://gist.github.com/benbalter/6147066
10
10
  REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
@@ -15,13 +15,13 @@ PublicSuffix::List.default.each do |rule|
15
15
 
16
16
  if rule.parts.length == 1
17
17
  domain = rule.parts.first if ".#{rule.value}" =~ REGEX
18
- else
19
- domain = rule.parts.pop(2).join(".") if ".#{rule.value}" =~ REGEX
18
+ elsif ".#{rule.value}" =~ REGEX
19
+ domain = rule.parts.pop(2).join('.')
20
20
  end
21
21
 
22
- domains.push domain unless domain.nil? or domains.include? domain
22
+ domains.push domain unless domain.nil? || domains.include?(domain)
23
23
  end
24
24
 
25
25
  # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
26
  # a valid TLD, not have any top-level sites, and we'd still want it listed
27
- Gman.import({"non-us gov" => domains}, :skip_resolve => true)
27
+ Gman.import({ 'non-us gov' => domains }, skip_resolve: true)
@@ -5,15 +5,17 @@ require 'csv'
5
5
  require './lib/gman'
6
6
  require './lib/gman/importer'
7
7
 
8
- url = "http://www.myndighetsregistret.scb.se/Myndighet.aspx"
8
+ url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
9
9
  agent = Mechanize.new
10
10
  page = agent.get(url)
11
11
  form = page.forms.first
12
- form.radiobuttons.find { |r| r.value = "Textfil" }.check
13
- submit_button = form.buttons.find { |b| b.type == "submit" }
12
+ form.radiobuttons.find { |r| r.value = 'Textfil' }.check
13
+ submit_button = form.buttons.find { |b| b.type == 'submit' }
14
14
  response = agent.submit(form, submit_button)
15
15
 
16
- rows = CSV.parse(response.content, :headers => true, :col_sep => "\t")
17
- domains = rows.map { |row| row["Webbadress"] unless row["Namn"] =~ /UNIVERSITET/}
16
+ rows = CSV.parse(response.content, headers: true, col_sep: "\t")
17
+ domains = rows.map do |row|
18
+ row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
19
+ end
18
20
 
19
- Gman.import("Swedish Administrative Authorities" => domains)
21
+ Gman.import('Swedish Administrative Authorities' => domains)
@@ -12,24 +12,24 @@
12
12
 
13
13
  require './lib/gman/importer'
14
14
 
15
- blacklist = ["usagovQUASI"]
16
- source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
15
+ blacklist = %w(usagovQUASI usagovFEDgov)
16
+ source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
17
17
 
18
18
  data = open(source).read
19
- data = data.split("__________________________________________________________________________")
19
+ data = data.split('_' * 74)
20
20
  data = data.last.strip
21
- data = data.split(/\r?\n/).reject { |r| r.empty? }
21
+ data = data.split(/\r?\n/).reject(&:empty?)
22
22
 
23
23
  domains = {}
24
- group = ""
24
+ group = ''
25
25
  data.each do |row|
26
26
  if row =~ /^\w/
27
27
  group = row
28
28
  domains[group] = []
29
29
  else
30
- domains[group].push row.sub("\.\t", "").strip
30
+ domains[group].push row.sub("\.\t", '').strip
31
31
  end
32
32
  end
33
33
 
34
- domains.reject! { |group,domain| blacklist.include?(group) }
34
+ domains.reject! { |g, _| blacklist.include?(g) }
35
35
  Gman.import(domains)
@@ -0,0 +1,2 @@
1
+ // test
2
+ gov
File without changes
@@ -8,7 +8,7 @@ begin
8
8
  Bundler.setup(:default, :development)
9
9
  rescue Bundler::BundlerError => e
10
10
  $stderr.puts e.message
11
- $stderr.puts "Run `bundle install` to install missing gems"
11
+ $stderr.puts 'Run `bundle install` to install missing gems'
12
12
  exit e.status_code
13
13
  end
14
14
 
@@ -16,11 +16,25 @@ require 'shoulda'
16
16
 
17
17
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
18
18
  $LOAD_PATH.unshift(File.dirname(__FILE__))
19
- require_relative "../lib/gman"
20
- require_relative "../lib/gman/domain_list"
19
+ require_relative '../lib/gman'
20
+ require_relative '../lib/gman/domain_list'
21
+ require_relative '../lib/gman/importer'
21
22
 
22
- require './lib/gman/importer'
23
+ def bin_path(cmd = 'gman')
24
+ File.expand_path "../bin/#{cmd}", File.dirname(__FILE__)
25
+ end
23
26
 
24
27
  def test_bin(*args)
25
- output, status = Open3.capture2e("bundle", "exec", "gman", *args)
28
+ Open3.capture2e('bundle', 'exec', bin_path, *args)
29
+ end
30
+
31
+ def fixture_path(fixture)
32
+ File.expand_path "./fixtures/#{fixture}", File.dirname(__FILE__)
33
+ end
34
+
35
+ def with_env(key, value)
36
+ old_env = ENV[key]
37
+ ENV[key] = value
38
+ yield
39
+ ENV[key] = old_env
26
40
  end
@@ -1,57 +1,62 @@
1
1
  require File.join(File.dirname(__FILE__), 'helper')
2
2
 
3
- VALID = [ "foo.gov",
4
- "http://foo.mil",
5
- "foo@bar.gc.ca",
6
- "foo.gov.au",
7
- "https://www.foo.gouv.fr",
8
- "foo@ci.champaign.il.us",
9
- "foo.bar.baz.gov.au",
10
- "foo@bar.gov.uk",
11
- "foo.gov",
12
- "foo.fed.us",
13
- "foo.state.il.us",
14
- "state.il.us",
15
- "foo@af.mil",
16
- "foo.gov.in"
17
- ]
3
+ VALID = ['foo.gov',
4
+ 'http://foo.mil',
5
+ 'foo@bar.gc.ca',
6
+ 'foo.gov.au',
7
+ 'https://www.foo.gouv.fr',
8
+ 'foo@ci.champaign.il.us',
9
+ 'foo.bar.baz.gov.au',
10
+ 'foo@bar.gov.uk',
11
+ 'foo.gov',
12
+ 'foo.fed.us',
13
+ 'foo.state.il.us',
14
+ 'state.il.us',
15
+ 'foo@af.mil',
16
+ 'foo.gov.in'
17
+ ].freeze
18
18
 
19
- INVALID = [ "foo.bar.com",
20
- "bar@foo.biz",
21
- "http://www.foo.biz",
22
- "foo.uk",
23
- "gov",
24
- "foo@k12.champaign.il.us",
25
- "foo@kii.gov.by",
26
- "foo",
27
- "",
28
- nil,
29
- " ",
30
- "foo.city.il.us",
31
- "foo.ci.il.us",
32
- "foo.zx.us",
33
- "foo@mail.gov.ua"
34
- ]
19
+ INVALID = ['foo.bar.com',
20
+ 'bar@foo.biz',
21
+ 'http://www.foo.biz',
22
+ 'foo.uk',
23
+ 'gov',
24
+ 'foo@k12.champaign.il.us',
25
+ 'foo@kii.gov.by',
26
+ 'foo',
27
+ '',
28
+ nil,
29
+ ' ',
30
+ 'foo.city.il.us',
31
+ 'foo.ci.il.us',
32
+ 'foo.zx.us',
33
+ 'foo@mail.gov.ua'
34
+ ].freeze
35
35
 
36
36
  class TestGman < Minitest::Test
37
-
38
37
  VALID.each do |domain|
39
38
  should "recognize #{domain} as a government domain" do
40
- assert Gman::valid?(domain)
39
+ assert Gman.valid?(domain)
41
40
  end
42
41
  end
43
42
 
44
43
  INVALID.each do |domain|
45
44
  should "recognize #{domain} as a non-government domain" do
46
- refute Gman::valid?(domain)
45
+ refute Gman.valid?(domain)
47
46
  end
48
47
  end
49
48
 
50
- should "not allow educational domains" do
51
- assert_equal false, Gman::valid?("foo@gwu.edu")
49
+ should 'not allow educational domains' do
50
+ assert_equal false, Gman.valid?('foo@gwu.edu')
51
+ end
52
+
53
+ should 'returns the path to domains.txt' do
54
+ assert_equal true, File.exist?(Gman.list_path)
52
55
  end
53
56
 
54
- should "returns the path to domains.txt" do
55
- assert_equal true, File.exists?(Gman.list_path)
57
+ should 'stub domains when asked' do
58
+ with_env 'GMAN_STUB_DOMAINS', 'true' do
59
+ assert_equal fixture_path('domains.txt'), Gman.list_path
60
+ end
56
61
  end
57
62
  end