RubyGems - gman - Versions diffs - 5.0.9 → 6.0.0 - Mend

gman 5.0.9 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

checksums.yaml +4 -4
data/.rubocop.yml +21 -0
data/.ruby-version +1 -1
data/Gemfile +1 -0
data/README.md +16 -22
data/Rakefile +3 -3
data/bin/gman +10 -11
data/bin/gman_filter +7 -7
data/config/domains.txt +19 -19
data/config/vendor/dotgovs.csv +398 -355
data/gman.gemspec +34 -27
data/lib/gman.rb +29 -23
data/lib/gman/country_codes.rb +14 -15
data/lib/gman/domain_list.rb +34 -25
data/lib/gman/identifier.rb +39 -43
data/lib/gman/importer.rb +111 -61
data/lib/gman/locality.rb +22 -10
data/lib/gman/version.rb +1 -1
data/script/add +2 -2
data/script/alphabetize +2 -2
data/script/cibuild +2 -0
data/script/dedupe +2 -2
data/script/profile +5 -2
data/script/prune +7 -7
data/script/reconcile-us +26 -21
data/script/vendor-federal-de +5 -5
data/script/vendor-municipal-de +5 -5
data/script/vendor-nl +12 -4
data/script/vendor-public-suffix +8 -8
data/script/vendor-se +8 -6
data/script/vendor-us +7 -7
data/test/fixtures/domains.txt +2 -0
data/test/{obama.txt → fixtures/obama.txt} +0 -0
data/test/helper.rb +19 -5
data/test/test_gman.rb +43 -38
data/test/test_gman_bin.rb +37 -43
data/test/test_gman_country_codes.rb +10 -6
data/test/test_gman_domains.rb +15 -10
data/test/test_gman_filter.rb +5 -7
data/test/test_gman_identifier.rb +36 -35
data/test/test_gman_importer.rb +250 -0
data/test/test_gman_locality.rb +5 -5
metadata +28 -10
data/lib/gman/sanctions.rb +0 -29
data/test/test_gman_sanctions.rb +0 -20

data/lib/gman/locality.rb CHANGED

@@ -1,14 +1,26 @@
 class Gman
+  class Locality
+    AFFINITY_NAMESPACES = %w(state dst cog).freeze
-  LOCALITY_REGEX = %r{
-    (
-      (state|dst|cog)
-    |
-      (ci|co|borough|boro|city|county|parish|town|twp|vi|vil|village)\.[a-z-]+
-    )
-    \.(ak|al|ar|az|ca|co|ct|dc|de|fl|ga|hi|ia|id|il|in|ks|ky|la|ma|md|me|mi|mn|mo|ms|mt|nc|nd|ne|nh|nj|nm|nv|ny|oh|ok|or|pa|ri|sc|sd|tn|tx|um|ut|va|vt|wa|wi|wv|wy)
-    \.us
-     }x
+    STATES = %w(
+      ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
+      la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
+      ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
+    ).freeze
+    LOCALITY_DOMAINS = %w(
+      ci co borough boro city county
+      parish town twp vi vil village
+    ).freeze
+    REGEX = /
+      (
+        (#{Regexp.union(AFFINITY_NAMESPACES)})
+      |
+        (#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
+      )\.(#{Regexp.union(STATES)})\.us
+    /x
+  end
   # Second level .us domains for states and locality
   # See http://en.wikipedia.org/wiki/.us
@@ -23,6 +35,6 @@ class Gman
   #  * k12.il.us
   #  * ci.foo.zx.us
   def locality?
-    !!(domain.to_s =~ LOCALITY_REGEX)
+    !domain.to_s.match(Locality::REGEX).nil?
   end
 end

data/lib/gman/version.rb CHANGED

@@ -1,3 +1,3 @@
 class Gman
-  VERSION = '5.0.9'
+  VERSION = '6.0.0'.freeze
 end

data/script/add CHANGED

@@ -7,11 +7,11 @@
 require './lib/gman/importer'
 if ARGV.length < 2
-  puts "Usage: script/add [GROUP] [DOMAIN(S)]"
+  puts 'Usage: script/add [GROUP] [DOMAIN(S)]'
   exit 1
 end
 group = ARGV[0]
 domains = ARGV.drop(1)
-Gman.import({ group => domains })
+Gman.import(group => domains)

data/script/alphabetize CHANGED

@@ -4,8 +4,8 @@
 #
 # usage: script/alphabetize
-require_relative "../lib/gman"
-require_relative "../lib/gman/importer"
+require_relative '../lib/gman'
+require_relative '../lib/gman/importer'
 current = Gman::DomainList.current
 current.alphabetize

data/script/cibuild CHANGED

@@ -3,4 +3,6 @@
 set -ex
 bundle exec rake test
+bundle exec rubocop -D -S -a
 bundle exec script/dedupe
+bundle exec gem build gman.gemspec

data/script/dedupe CHANGED

@@ -7,7 +7,7 @@ require './lib/gman/importer'
 current = Gman::DomainList.current
-puts "Checking for duplicate domains in the domain list..."
+puts 'Checking for duplicate domains in the domain list...'
 puts "Current list contains #{current.count} domains..."
 dupe = current.count - current.domains.uniq.count
@@ -16,6 +16,6 @@ exit 0 if dupe == 0
 dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
-puts "Duplicate domains:"
+puts 'Duplicate domains:'
 puts dupes
 exit 1

data/script/profile CHANGED

@@ -3,9 +3,12 @@
 require 'ruby-prof'
 require './lib/gman'
-# Pick N random domains directly, without pre-loading the Gman list for an accurate benchmark
+# Pick N random domains directly,
+# without pre-loading the Gman list for an accurate benchmark
 count = (ARGV[0] || 100).to_i
-domains = File.readlines("./config/domains.txt").select { |l| l =~ /^[a-z0-9]/i }.sample(count)
+domains = File.readlines('./config/domains.txt')
+domains = domains.select { |l| l =~ /^[a-z0-9]/i }
+domains = domains.sample(count)
 RubyProf.start
 domains.each do |domain|

data/script/prune CHANGED

@@ -1,20 +1,20 @@
-#! /usr/bin/env ruby
+#!/usr/bin/env ruby
 # Given an array of domains, removes them from the list
 # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
-require_relative "../lib/gman"
-require_relative "../lib/gman/domain_list"
+require_relative '../lib/gman'
+require_relative '../lib/gman/domain_list'
 domains = ARGV
-domains = domains.clone.map { |d| d.gsub ",", "" }
+domains = domains.clone.map { |d| d.delete ',' }
-list = File.open("./config/domains.txt").read
+list = File.open('./config/domains.txt').read
 puts "Starting list: #{Gman::DomainList.current.count} domains"
 domains.each do |domain|
-  list.gsub! /^#{domain}$\n/, ""
+  list.gsub!(/^#{domain}$\n/, '')
 end
 puts "Ending list: #{Gman::DomainList.current.count} domains"
-File.write "./config/domains.txt", list
+File.write './config/domains.txt', list

data/script/reconcile-us CHANGED

@@ -8,57 +8,62 @@
 require './lib/gman/importer'
 require 'yaml'
-ENV["RECONCILING"] = "true"
-blacklist = ["usagovQUASI"]
-source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
+ENV['RECONCILING'] = 'true'
+blacklist = ['usagovQUASI']
+source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
 data = open(source).read
-data = data.split("__________________________________________________________________________")
+data = data.split('_' * 74)
 data = data.last.strip
-data = data.split(/\r?\n/).reject { |r| r.empty? }
+data = data.split(/\r?\n/).reject(&:empty?)
 domains = {}
-group = ""
+group = ''
 data.each do |row|
   if row =~ /^\w/
     group = row
     domains[group] = []
   else
-    domains[group].push row.sub("\.\t", "").strip
+    domains[group].push row.sub("\.\t", '').strip
   end
 end
-domains.reject! { |group,domain| blacklist.include?(group) }
+domains.reject! { |g, _domain| blacklist.include?(g) }
 importer = Gman::Importer.new(domains)
 importer.logger.info "Starting with #{importer.domains.count} domains"
-importer.domains.list.each do |group, domains|
-  domains.map! { |domain| Gman.new(domain).to_s }
-  domains.map! { |domain| importer.normalize_domain(domain) }
+importer.domains.list.each do |_group, d|
+  d.map! { |domain| Gman.new(domain).to_s }
+  d.map! { |domain| importer.normalize_domain(domain) }
 end
-importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"
+count = importer.domains.domains.count
+importer.logger.info "Filtered down to #{count} normalized domains"
 missing = {}
-importer.domains.list.each do |group, usagovdomains|
-  next unless importer.current.list[group]
-  missing[group] = importer.current.list[group] - usagovdomains
+importer.domains.list.each do |g, usagovdomains|
+  next unless importer.current.list[g]
+  missing[g] = importer.current.list[g] - usagovdomains
 end
-missing.reject! { |key, value| value.empty? }
+missing.reject! { |_key, value| value.empty? }
-importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list"
+count = missing.values.count
+importer.logger.info "Found #{count} domains not on the USA.gov list"
 puts "Here's the list of missing domains:"
 puts YAML.dump(missing)
 domains = importer.domains.domains
-domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
+domains = domains.group_by do |domain|
+  importer.valid_domain?(domain, skip_dupe: true)
+end
 domains.delete(true)
 domains.delete(false)
-domains.delete("locality")
+domains.delete('locality')
-importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains"
+count = domains.values.flatten.count
+importer.logger.info "Calling out #{count} rejected domains"
-puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
+puts 'Here are the rejected domains and why they were rejected:'
 puts YAML.dump(domains)

data/script/vendor-federal-de CHANGED

@@ -5,10 +5,10 @@ require 'open-uri'
 require './lib/gman'
 require './lib/gman/importer'
-url = "https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv"
+url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
-domains = open(url).read.encode("UTF-8")
-domains = CSV.parse(domains, :headers => true)
-domains = domains.map { |row| row["Domain Name"] }
+domains = open(url).read.encode('UTF-8')
+domains = CSV.parse(domains, headers: true)
+domains = domains.map { |row| row['Domain Name'] }
-Gman.import("German Federal" => domains)
+Gman.import('German Federal' => domains)

data/script/vendor-municipal-de CHANGED

@@ -5,9 +5,9 @@ require 'open-uri'
 require './lib/gman'
 require './lib/gman/importer'
-url = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"
+url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
-csv = open(url).read.force_encoding("iso-8859-1").encode("UTF-8")
+csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
 # For some reason, the header row is actually the last row
 # Pop the last line off the file and prepend it at the begining
@@ -17,7 +17,7 @@ lines.unshift lines.pop
 csv = lines.join("\n")
 # Load municipal domains
-data = CSV.parse(csv, :headers => true, :col_sep => ";")
-domains = data.map { |row| row["Internet"] }
+data = CSV.parse(csv, headers: true, col_sep: ';')
+domains = data.map { |row| row['Internet'] }
-Gman.import("German Municipalities" => domains)
+Gman.import('German Municipalities' => domains)

data/script/vendor-nl CHANGED

@@ -3,8 +3,16 @@
 require 'fileutils'
-FileUtils.rm_rf("almanak.overheid.nl")
-domains = `wget -q -r -nc -np https://almanak.overheid.nl/
-grep @ -rI almanak.overheid.nl/|cut -f 2 -d @|cut -f 1 -d '"'|grep \\.nl$|sort|uniq`
+FileUtils.rm_rf('almanak.overheid.nl')
+commands = [
+  "wget -q -r -nc -np https://almanak.overheid.nl/
+  grep @ -rI almanak.overheid.nl/",
+  'cut -f 2 -d @',
+  "cut -f 1 -d '\"'",
+  'grep \\.nl$',
+  'sort',
+  'uniq'
+]
+domains = system commands.join('|')
-Gman.import("Netherlands" => domains.split("\n"))
+Gman.import('Netherlands' => domains.split("\n"))

data/script/vendor-public-suffix CHANGED

@@ -1,10 +1,10 @@
 #!/usr/bin/env ruby
 # Propagates an initial list of best-guess government domains
-require "public_suffix"
-require "yaml"
-require_relative "../lib/gman"
-require_relative "../lib/gman/importer"
+require 'public_suffix'
+require 'yaml'
+require_relative '../lib/gman'
+require_relative '../lib/gman/importer'
 # https://gist.github.com/benbalter/6147066
 REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
@@ -15,13 +15,13 @@ PublicSuffix::List.default.each do |rule|
   if rule.parts.length == 1
     domain = rule.parts.first if ".#{rule.value}" =~ REGEX
-  else
-    domain = rule.parts.pop(2).join(".") if ".#{rule.value}" =~ REGEX
+  elsif ".#{rule.value}" =~ REGEX
+    domain = rule.parts.pop(2).join('.')
   end
-  domains.push domain unless domain.nil? or domains.include? domain
+  domains.push domain unless domain.nil? || domains.include?(domain)
 end
 # Note: We want to skip resolution here, because a domain like `gov.sv` may be
 # a valid TLD, not have any top-level sites, and we'd still want it listed
-Gman.import({"non-us gov" => domains}, :skip_resolve => true)
+Gman.import({ 'non-us gov' => domains }, skip_resolve: true)

data/script/vendor-se CHANGED

@@ -5,15 +5,17 @@ require 'csv'
 require './lib/gman'
 require './lib/gman/importer'
-url = "http://www.myndighetsregistret.scb.se/Myndighet.aspx"
+url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
 agent = Mechanize.new
 page = agent.get(url)
 form = page.forms.first
-form.radiobuttons.find { |r| r.value = "Textfil" }.check
-submit_button = form.buttons.find { |b| b.type == "submit" }
+form.radiobuttons.find { |r| r.value = 'Textfil' }.check
+submit_button = form.buttons.find { |b| b.type == 'submit' }
 response = agent.submit(form, submit_button)
-rows = CSV.parse(response.content, :headers => true, :col_sep => "\t")
-domains = rows.map { |row| row["Webbadress"] unless row["Namn"] =~ /UNIVERSITET/}
+rows = CSV.parse(response.content, headers: true, col_sep: "\t")
+domains = rows.map do |row|
+  row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
+end
-Gman.import("Swedish Administrative Authorities" => domains)
+Gman.import('Swedish Administrative Authorities' => domains)

data/script/vendor-us CHANGED

@@ -12,24 +12,24 @@
 require './lib/gman/importer'
-blacklist = ["usagovQUASI"]
-source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
+blacklist = %w(usagovQUASI usagovFEDgov)
+source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
 data = open(source).read
-data = data.split("__________________________________________________________________________")
+data = data.split('_' * 74)
 data = data.last.strip
-data = data.split(/\r?\n/).reject { |r| r.empty? }
+data = data.split(/\r?\n/).reject(&:empty?)
 domains = {}
-group = ""
+group = ''
 data.each do |row|
   if row =~ /^\w/
     group = row
     domains[group] = []
   else
-    domains[group].push row.sub("\.\t", "").strip
+    domains[group].push row.sub("\.\t", '').strip
   end
 end
-domains.reject! { |group,domain| blacklist.include?(group) }
+domains.reject! { |g, _| blacklist.include?(g) }
 Gman.import(domains)

data/test/fixtures/domains.txt ADDED

@@ -0,0 +1,2 @@
+// test
+gov

data/test/{obama.txt → fixtures/obama.txt} RENAMED

File without changes

data/test/helper.rb CHANGED

@@ -8,7 +8,7 @@ begin
   Bundler.setup(:default, :development)
 rescue Bundler::BundlerError => e
   $stderr.puts e.message
-  $stderr.puts "Run `bundle install` to install missing gems"
+  $stderr.puts 'Run `bundle install` to install missing gems'
   exit e.status_code
 end
@@ -16,11 +16,25 @@ require 'shoulda'
 $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 $LOAD_PATH.unshift(File.dirname(__FILE__))
-require_relative "../lib/gman"
-require_relative "../lib/gman/domain_list"
+require_relative '../lib/gman'
+require_relative '../lib/gman/domain_list'
+require_relative '../lib/gman/importer'
-require './lib/gman/importer'
+def bin_path(cmd = 'gman')
+  File.expand_path "../bin/#{cmd}", File.dirname(__FILE__)
+end
 def test_bin(*args)
-  output, status = Open3.capture2e("bundle", "exec", "gman", *args)
+  Open3.capture2e('bundle', 'exec', bin_path, *args)
+end
+def fixture_path(fixture)
+  File.expand_path "./fixtures/#{fixture}", File.dirname(__FILE__)
+end
+def with_env(key, value)
+  old_env = ENV[key]
+  ENV[key] = value
+  yield
+  ENV[key] = old_env
 end

data/test/test_gman.rb CHANGED

@@ -1,57 +1,62 @@
 require File.join(File.dirname(__FILE__), 'helper')
-VALID = [  "foo.gov",
-            "http://foo.mil",
-            "foo@bar.gc.ca",
-            "foo.gov.au",
-            "https://www.foo.gouv.fr",
-            "foo@ci.champaign.il.us",
-            "foo.bar.baz.gov.au",
-            "foo@bar.gov.uk",
-            "foo.gov",
-            "foo.fed.us",
-            "foo.state.il.us",
-            "state.il.us",
-            "foo@af.mil",
-            "foo.gov.in"
-        ]
+VALID = ['foo.gov',
+         'http://foo.mil',
+         'foo@bar.gc.ca',
+         'foo.gov.au',
+         'https://www.foo.gouv.fr',
+         'foo@ci.champaign.il.us',
+         'foo.bar.baz.gov.au',
+         'foo@bar.gov.uk',
+         'foo.gov',
+         'foo.fed.us',
+         'foo.state.il.us',
+         'state.il.us',
+         'foo@af.mil',
+         'foo.gov.in'
+        ].freeze
-INVALID = [ "foo.bar.com",
-            "bar@foo.biz",
-            "http://www.foo.biz",
-            "foo.uk",
-            "gov",
-            "foo@k12.champaign.il.us",
-            "foo@kii.gov.by",
-            "foo",
-            "",
-            nil,
-            " ",
-            "foo.city.il.us",
-            "foo.ci.il.us",
-            "foo.zx.us",
-            "foo@mail.gov.ua"
-          ]
+INVALID = ['foo.bar.com',
+           'bar@foo.biz',
+           'http://www.foo.biz',
+           'foo.uk',
+           'gov',
+           'foo@k12.champaign.il.us',
+           'foo@kii.gov.by',
+           'foo',
+           '',
+           nil,
+           ' ',
+           'foo.city.il.us',
+           'foo.ci.il.us',
+           'foo.zx.us',
+           'foo@mail.gov.ua'
+          ].freeze
 class TestGman < Minitest::Test
   VALID.each do |domain|
     should "recognize #{domain} as a government domain" do
-      assert Gman::valid?(domain)
+      assert Gman.valid?(domain)
     end
   end
   INVALID.each do |domain|
     should "recognize #{domain} as a non-government domain" do
-      refute Gman::valid?(domain)
+      refute Gman.valid?(domain)
     end
   end
-  should "not allow educational domains" do
-    assert_equal false, Gman::valid?("foo@gwu.edu")
+  should 'not allow educational domains' do
+    assert_equal false, Gman.valid?('foo@gwu.edu')
+  end
+  should 'returns the path to domains.txt' do
+    assert_equal true, File.exist?(Gman.list_path)
   end
-  should "returns the path to domains.txt" do
-    assert_equal true, File.exists?(Gman.list_path)
+  should 'stub domains when asked' do
+    with_env 'GMAN_STUB_DOMAINS', 'true' do
+      assert_equal fixture_path('domains.txt'), Gman.list_path
+    end
   end
 end