RubyGems - gman - Versions diffs - 6.0.1 → 7.0.4 - Mend

gman 6.0.1 → 7.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73) hide show

checksums.yaml +5 -5
data/.github/CODEOWNERS +3 -0
data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
data/.github/config.yml +23 -0
data/.github/funding.yml +1 -0
data/.github/no-response.yml +15 -0
data/.github/release-drafter.yml +4 -0
data/.github/settings.yml +33 -0
data/.github/stale.yml +29 -0
data/.gitignore +1 -0
data/.rspec +2 -0
data/.rubocop.yml +17 -5
data/.rubocop_todo.yml +84 -0
data/.ruby-version +1 -1
data/Gemfile +2 -0
data/bin/gman +6 -4
data/bin/gman_filter +5 -7
data/config/domains.txt +8446 -173
data/config/vendor/academic.txt +8038 -0
data/config/vendor/dotgovs.csv +5786 -5560
data/docs/CODE_OF_CONDUCT.md +46 -0
data/docs/CONTRIBUTING.md +92 -0
data/{README.md → docs/README.md} +3 -3
data/docs/SECURITY.md +3 -0
data/docs/_config.yml +2 -0
data/gman.gemspec +18 -17
data/lib/gman.rb +25 -21
data/lib/gman/country_codes.rb +17 -17
data/lib/gman/domain_list.rb +123 -41
data/lib/gman/identifier.rb +59 -21
data/lib/gman/importer.rb +39 -40
data/lib/gman/locality.rb +23 -21
data/lib/gman/version.rb +3 -1
data/script/add +2 -0
data/script/alphabetize +2 -0
data/script/cibuild +1 -1
data/script/dedupe +2 -1
data/script/profile +2 -1
data/script/prune +5 -3
data/script/reconcile-us +6 -3
data/script/vendor +1 -1
data/script/vendor-federal-de +3 -3
data/script/vendor-municipal-de +3 -3
data/script/vendor-nl +4 -1
data/script/vendor-public-suffix +7 -6
data/script/vendor-se +3 -3
data/script/vendor-swot +43 -0
data/script/vendor-us +8 -5
data/spec/fixtures/domains.txt +4 -0
data/{test → spec}/fixtures/obama.txt +0 -0
data/spec/gman/bin_spec.rb +101 -0
data/spec/gman/country_code_spec.rb +39 -0
data/spec/gman/domain_list_spec.rb +110 -0
data/spec/gman/domains_spec.rb +25 -0
data/spec/gman/identifier_spec.rb +218 -0
data/spec/gman/importer_spec.rb +236 -0
data/spec/gman/locality_spec.rb +24 -0
data/spec/gman_spec.rb +74 -0
data/spec/spec_helper.rb +31 -0
metadata +86 -73
data/CONTRIBUTING.md +0 -22
data/Rakefile +0 -22
data/test/fixtures/domains.txt +0 -2
data/test/helper.rb +0 -40
data/test/test_gman.rb +0 -62
data/test/test_gman_bin.rb +0 -75
data/test/test_gman_country_codes.rb +0 -18
data/test/test_gman_domains.rb +0 -33
data/test/test_gman_filter.rb +0 -17
data/test/test_gman_identifier.rb +0 -106
data/test/test_gman_importer.rb +0 -250
data/test/test_gman_locality.rb +0 -10

data/script/profile CHANGED

@@ -1,4 +1,5 @@
 #! /usr/bin/env ruby
+# frozen_string_literal: true
 require 'ruby-prof'
 require './lib/gman'
@@ -17,4 +18,4 @@ end
 result = RubyProf.stop
 printer = RubyProf::FlatPrinter.new(result)
-printer.print(STDOUT)
+printer.print($stdout)

data/script/prune CHANGED

@@ -1,4 +1,6 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 # Given an array of domains, removes them from the list
 # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
 puts "Starting list: #{Gman::DomainList.current.count} domains"
 domains.each do |domain|
-  list.gsub!(/^#{domain}$\n/, '')
+  list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
 end
-puts "Ending list: #{Gman::DomainList.current.count} domains"
 File.write './config/domains.txt', list
+puts "Ending list: #{Gman::DomainList.current.count} domains"

data/script/reconcile-us CHANGED

@@ -1,4 +1,6 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 #
 # Reconciles the USA.gov-maintained list of US domains with domains.txt
 # to show domains listed in the USA.gov-maintained list that we reject and why
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
 blacklist = ['usagovQUASI']
 source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
-data = open(source).read
+data = URI.open(source).read
 data = data.split('_' * 74)
 data = data.last.strip
 data = data.split(/\r?\n/).reject(&:empty?)
@@ -20,7 +22,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
 domains = {}
 group = ''
 data.each do |row|
-  if row =~ /^\w/
+  if /^\w/.match?(row)
     group = row
     domains[group] = []
   else
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
 importer.logger.info "Starting with #{importer.domains.count} domains"
-importer.domains.list.each do |_group, d|
+importer.domains.list.each_value do |d|
   d.map! { |domain| Gman.new(domain).to_s }
   d.map! { |domain| importer.normalize_domain(domain) }
 end
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
 missing = {}
 importer.domains.list.each do |g, usagovdomains|
   next unless importer.current.list[g]
   missing[g] = importer.current.list[g] - usagovdomains
 end

data/script/vendor CHANGED

@@ -6,7 +6,7 @@ for file in script/vendor-*; do
     echo "*************************************"
     echo "Vendoring $file"
     echo "*************************************"
-    "$file"
+    bundle exec "$file"
   fi
 done

data/script/vendor-federal-de CHANGED

@@ -1,14 +1,14 @@
 #! /usr/bin/env ruby
+# frozen_string_literal: true
 require 'csv'
 require 'open-uri'
 require './lib/gman'
-require './lib/gman/importer'
 url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
-domains = open(url).read.encode('UTF-8')
+domains = URI.open(url).read.encode('UTF-8')
 domains = CSV.parse(domains, headers: true)
 domains = domains.map { |row| row['Domain Name'] }
-Gman.import('German Federal' => domains)
+Gman::Importer.new('German Federal' => domains).import

data/script/vendor-municipal-de CHANGED

@@ -1,13 +1,13 @@
 #! /usr/bin/env ruby
+# frozen_string_literal: true
 require 'csv'
 require 'open-uri'
 require './lib/gman'
-require './lib/gman/importer'
 url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
-csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
+csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
 # For some reason, the header row is actually the last row
 # Pop the last line off the file and prepend it at the beginning
@@ -20,4 +20,4 @@ csv = lines.join("\n")
 data = CSV.parse(csv, headers: true, col_sep: ';')
 domains = data.map { |row| row['Internet'] }
-Gman.import('German Municipalities' => domains)
+Gman::Importer.new('German Municipalities' => domains).import

data/script/vendor-nl CHANGED

@@ -1,7 +1,10 @@
 #! /usr/bin/env ruby
+# frozen_string_literal: true
 # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
 require 'fileutils'
+require './lib/gman'
 FileUtils.rm_rf('almanak.overheid.nl')
 commands = [
@@ -15,4 +18,4 @@ commands = [
 ]
 domains = system commands.join('|')
-Gman.import('Netherlands' => domains.split("\n"))
+Gman::Importer.new('Netherlands' => domains.split("\n")).import

data/script/vendor-public-suffix CHANGED

@@ -1,27 +1,28 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 # Propagates an initial list of best-guess government domains
 require 'public_suffix'
 require 'yaml'
 require_relative '../lib/gman'
-require_relative '../lib/gman/importer'
 # https://gist.github.com/benbalter/6147066
-REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
+REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
 domains = []
 PublicSuffix::List.default.each do |rule|
   domain = nil
   if rule.parts.length == 1
-    domain = rule.parts.first if ".#{rule.value}" =~ REGEX
-  elsif ".#{rule.value}" =~ REGEX
+    domain = rule.parts.first if REGEX.match?(".#{rule.value}")
+  elsif REGEX.match?(".#{rule.value}")
     domain = rule.parts.pop(2).join('.')
   end
   domains.push domain unless domain.nil? || domains.include?(domain)
 end
-# Note: We want to skip resolution here, because a domain like `gov.sv` may be
+# NOTE: We want to skip resolution here, because a domain like `gov.sv` may be
 # a valid TLD, not have any top-level sites, and we'd still want it listed
-Gman.import({ 'non-us gov' => domains }, skip_resolve: true)
+Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)

data/script/vendor-se CHANGED

@@ -1,9 +1,9 @@
 #! /usr/bin/env ruby
+# frozen_string_literal: true
 require 'mechanize'
 require 'csv'
 require './lib/gman'
-require './lib/gman/importer'
 url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
 agent = Mechanize.new
@@ -15,7 +15,7 @@ response = agent.submit(form, submit_button)
 rows = CSV.parse(response.content, headers: true, col_sep: "\t")
 domains = rows.map do |row|
-  row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
+  row['Webbadress'] unless /UNIVERSITET/.match?(row['Namn'])
 end
-Gman.import('Swedish Administrative Authorities' => domains)
+Gman::Importer.new('Swedish Administrative Authorities' => domains).import

data/script/vendor-swot ADDED

@@ -0,0 +1,43 @@
+#! /usr/bin/env ruby
+# frozen_string_literal: true
+#
+# Vendors the Swot-maintained list of academic domains into config/academic.txt
+# Source: https://github.com/leereilly/swot/
+#
+# Usage: script/vendor-swot
+#
+# Will automatically fetch latest version of the list and merge
+# You can check for changes and commit via `git status`
+#
+# It's also probably a good idea to run `script/cibuild` for good measure
+#
+# Note: We do this, because as a bajillion individual files, Swot takes up 30MB
+require 'gman'
+require 'swot'
+# Generate array of all Swot domains
+domains = Swot.all_domains
+domains << Swot::ACADEMIC_TLDS
+# Init the importer, building a DomainList
+group = "Academic domains vendored from Swot v#{Swot::VERSION}"
+hash = { group => domains }
+importer = Gman::Importer.new(hash)
+importer.logger.info "Importing from Swot v#{Swot::VERSION}"
+importer.logger.info "Found #{domains.count} academic domains"
+domain_list = importer.domain_list
+domain_list.path = Gman.academic_list_path
+# Cleanup and write
+# Note: we're not using the import method, as that assumes we're writing the
+# government domain list and would use Swot to ensure domains aren't academic
+importer.send :normalize_domains!
+domain_list.data[group] << Swot::BLACKLIST.map { |domain| "!#{domain}" }
+domain_list.data[group] = domain_list.data[group].flatten
+domain_list.write
+importer.logger.info "Vendored #{importer.domain_list.count} academic domains."

data/script/vendor-us CHANGED

@@ -1,4 +1,6 @@
 #! /usr/bin/env ruby
+# frozen_string_literal: true
 #
 # Vendors the USA.gov-maintained list of US domains into domains.txt
 # Source: https://github.com/GSA-OCSIT/govt-urls
@@ -10,12 +12,13 @@
 #
 # It's also probably a good idea to run `script/cibuild` for good measure
-require './lib/gman/importer'
+require './lib/gman'
+require 'open-uri'
-blacklist = %w(usagovQUASI usagovFEDgov)
+blacklist = %w[usagovQUASI usagovFEDgov]
 source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
-data = open(source).read
+data = URI.open(source).read
 data = data.split('_' * 74)
 data = data.last.strip
 data = data.split(/\r?\n/).reject(&:empty?)
@@ -23,7 +26,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
 domains = {}
 group = ''
 data.each do |row|
-  if row =~ /^\w/
+  if /^\w/.match?(row)
     group = row
     domains[group] = []
   else
@@ -32,4 +35,4 @@ data.each do |row|
 end
 domains.reject! { |g, _| blacklist.include?(g) }
-Gman.import(domains)
+Gman::Importer.new(domains).import

data/spec/fixtures/domains.txt ADDED

@@ -0,0 +1,4 @@
+// foo
+bar.gov
+baz.net
+!mail.bar.gov

data/{test → spec}/fixtures/obama.txt RENAMED

File without changes

data/spec/gman/bin_spec.rb ADDED

@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+RSpec.describe 'Gman bin' do
+  let(:domain) { 'whitehouse.gov' }
+  let(:args) { [domain] }
+  let(:command) { 'gman' }
+  let(:bin_path) do
+    File.expand_path "../../bin/#{command}", File.dirname(__FILE__)
+  end
+  let(:response_parts) { Open3.capture2e('bundle', 'exec', bin_path, *args) }
+  let(:output) { response_parts[0] }
+  let(:status) { response_parts[1] }
+  let(:exit_code) { status.exitstatus }
+  context 'a valid domain' do
+    it 'parses the domain' do
+      expect(output).to match('Domain  : whitehouse.gov')
+    end
+    it "knows it's valid" do
+      expect(output).to match('Valid government domain')
+      expect(exit_code).to be(0)
+    end
+    it 'knows the type' do
+      expect(output).to match(/federal/i)
+    end
+    it 'knows the agency' do
+      expect(output).to match('Executive Office of the President')
+    end
+    it 'knows the country' do
+      expect(output).to match('United States')
+    end
+    it 'knows the city' do
+      expect(output).to match('Washington')
+    end
+    it 'knows the state' do
+      expect(output).to match('DC')
+    end
+    it 'colors by default' do
+      expect(output).to match(/\e\[32m/)
+    end
+    context 'with colorization disabled' do
+      let(:args) { [domain, '--no-color'] }
+      it "doesn't color" do
+        expect(output).not_to match(/\e\[32m/)
+      end
+    end
+  end
+  context 'with no args' do
+    let(:args) { [] }
+    it 'displays the help text' do
+      expect(output).to match('USAGE')
+    end
+  end
+  context 'an invalid domain' do
+    let(:domain) { 'foo.invalid' }
+    it 'knows the domain is invalid' do
+      expect(output).to match('Invalid domain')
+      expect(exit_code).to be(1)
+    end
+  end
+  context 'a non-government domain' do
+    let(:domain) { 'github.com' }
+    it "knows it's not a government domain" do
+      expect(output).to match('Not a government domain')
+      expect(exit_code).to be(1)
+    end
+  end
+  context 'filtering' do
+    let(:command) { 'gman_filter' }
+    let(:txt_path) do
+      File.expand_path '../fixtures/obama.txt', File.dirname(__FILE__)
+    end
+    let(:args) { [txt_path] }
+    it 'returns only government domains' do
+      expected = <<~EXPECTED
+        mr.senator@obama.senate.gov
+        president@whitehouse.gov
+        commander.in.chief@us.army.mil
+      EXPECTED
+      expect(output).to eql(expected)
+    end
+  end
+end

data/spec/gman/country_code_spec.rb ADDED

@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+RSpec.describe 'Gman Country Codes' do
+  {
+    'whitehouse.gov' => 'United States of America',
+    'foo.gov.uk' => 'United Kingdom of Great Britain and Northern Ireland',
+    'army.mil' => 'United States of America',
+    'foo.gc.ca' => 'Canada',
+    'foo.eu' => nil
+  }.each do |domain, expected_country|
+    context "given #{domain.inspect}" do
+      subject { Gman.new(domain) }
+      let(:country) { subject.country }
+      it 'knows the country' do
+        if expected_country.nil?
+          expect(country).to be_nil
+        else
+          expect(country.name).to eql(expected_country)
+        end
+      end
+      it 'knows the alpha2' do
+        expected = case expected_country
+                   when 'United States of America'
+                     'us'
+                   when 'Canada'
+                     'ca'
+                   when 'United Kingdom of Great Britain and Northern Ireland'
+                     'gb'
+                   else
+                     'eu'
+                   end
+        expect(subject.alpha2).to eql(expected)
+      end
+    end
+  end
+end

data/spec/gman/domain_list_spec.rb ADDED

@@ -0,0 +1,110 @@
+# frozen_string_literal: true
+RSpec.describe Gman::DomainList do
+  let(:data) { subject.data }
+  let(:canada) { data['Canada municipal'] }
+  %i[path contents data].each do |type|
+    context "when initialized by #{type}" do
+      subject do
+        case type
+        when :path
+          described_class.new(path: Gman.list_path)
+        when :contents
+          contents = File.read(Gman.list_path)
+          described_class.new(contents: contents)
+        when :data
+          data = described_class.new(path: Gman.list_path).to_h
+          described_class.new(data: data)
+        end
+      end
+      it 'stores the init var' do
+        expect(subject.send(type)).not_to be_nil
+      end
+      it 'returns the domain data' do
+        expect(data).to have_key('Canada federal')
+        expect(data.values.flatten).to include('gov')
+      end
+      it 'returns the list contents' do
+        expect(subject.contents).to match(/^gov$/)
+      end
+      it 'knows the list path' do
+        expect(subject.path).to eql(Gman.list_path)
+      end
+      it 'returns the PublicSuffix list' do
+        expect(subject.public_suffix_list).to be_a(PublicSuffix::List)
+      end
+      it 'knows if a domain is valid' do
+        expect(subject.valid?('whitehouse.gov')).to be(true)
+      end
+      it 'knows if a domain is invalid' do
+        expect(subject.valid?('example.com')).to be(false)
+      end
+      it 'returns the domain groups' do
+        expect(subject.groups).to include('Canada federal')
+      end
+      it 'returns the domains' do
+        expect(subject.domains).to include('gov')
+      end
+      it 'returns the domain count' do
+        expect(subject.count).to be_a(Integer)
+        expect(subject.count).to be > 100
+      end
+      it 'alphabetizes the list' do
+        canada.shuffle!
+        expect(canada.first).not_to eql('100milehouse.com')
+        subject.alphabetize
+        expect(canada.first).to eql('100milehouse.com')
+      end
+      it 'outputs public suffix format' do
+        expect(subject.to_s).to match("// Canada federal\ncanada\.ca\n")
+      end
+      it "finds a domain's parent" do
+        expect(subject.parent_domain('foo.gov.uk')).to eql('gov.uk')
+      end
+      context 'with the list path stubbed' do
+        let(:stubbed_file_contents) { File.read(stubbed_list_path) }
+        before do
+          subject.instance_variable_set('@path', stubbed_list_path)
+        end
+        context 'with list data stubbed' do
+          before do
+            subject.data = { 'foo' => ['!mail.bar.gov', 'bar.gov', 'baz.net'] }
+          end
+          context 'alphabetizing' do
+            before { subject.alphabetize }
+            it 'puts exceptions last' do
+              expect(subject.data['foo'].last).to eql('!mail.bar.gov')
+            end
+          end
+          context 'writing' do
+            before { subject.write }
+            it 'writes the contents' do
+              expect(stubbed_file_contents).to match("// foo\nbar.gov\nbaz.net")
+            end
+          end
+        end
+      end
+    end
+  end
+end