gman 6.0.1 → 7.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. checksums.yaml +5 -5
  2. data/.github/CODEOWNERS +3 -0
  3. data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
  4. data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
  5. data/.github/config.yml +23 -0
  6. data/.github/funding.yml +1 -0
  7. data/.github/no-response.yml +15 -0
  8. data/.github/release-drafter.yml +4 -0
  9. data/.github/settings.yml +33 -0
  10. data/.github/stale.yml +29 -0
  11. data/.gitignore +1 -0
  12. data/.rspec +2 -0
  13. data/.rubocop.yml +17 -5
  14. data/.rubocop_todo.yml +84 -0
  15. data/.ruby-version +1 -1
  16. data/Gemfile +2 -0
  17. data/bin/gman +6 -4
  18. data/bin/gman_filter +5 -7
  19. data/config/domains.txt +8446 -173
  20. data/config/vendor/academic.txt +8038 -0
  21. data/config/vendor/dotgovs.csv +5786 -5560
  22. data/docs/CODE_OF_CONDUCT.md +46 -0
  23. data/docs/CONTRIBUTING.md +92 -0
  24. data/{README.md → docs/README.md} +3 -3
  25. data/docs/SECURITY.md +3 -0
  26. data/docs/_config.yml +2 -0
  27. data/gman.gemspec +18 -17
  28. data/lib/gman.rb +25 -21
  29. data/lib/gman/country_codes.rb +17 -17
  30. data/lib/gman/domain_list.rb +123 -41
  31. data/lib/gman/identifier.rb +59 -21
  32. data/lib/gman/importer.rb +39 -40
  33. data/lib/gman/locality.rb +23 -21
  34. data/lib/gman/version.rb +3 -1
  35. data/script/add +2 -0
  36. data/script/alphabetize +2 -0
  37. data/script/cibuild +1 -1
  38. data/script/dedupe +2 -1
  39. data/script/profile +2 -1
  40. data/script/prune +5 -3
  41. data/script/reconcile-us +6 -3
  42. data/script/vendor +1 -1
  43. data/script/vendor-federal-de +3 -3
  44. data/script/vendor-municipal-de +3 -3
  45. data/script/vendor-nl +4 -1
  46. data/script/vendor-public-suffix +7 -6
  47. data/script/vendor-se +3 -3
  48. data/script/vendor-swot +43 -0
  49. data/script/vendor-us +8 -5
  50. data/spec/fixtures/domains.txt +4 -0
  51. data/{test → spec}/fixtures/obama.txt +0 -0
  52. data/spec/gman/bin_spec.rb +101 -0
  53. data/spec/gman/country_code_spec.rb +39 -0
  54. data/spec/gman/domain_list_spec.rb +110 -0
  55. data/spec/gman/domains_spec.rb +25 -0
  56. data/spec/gman/identifier_spec.rb +218 -0
  57. data/spec/gman/importer_spec.rb +236 -0
  58. data/spec/gman/locality_spec.rb +24 -0
  59. data/spec/gman_spec.rb +74 -0
  60. data/spec/spec_helper.rb +31 -0
  61. metadata +86 -73
  62. data/CONTRIBUTING.md +0 -22
  63. data/Rakefile +0 -22
  64. data/test/fixtures/domains.txt +0 -2
  65. data/test/helper.rb +0 -40
  66. data/test/test_gman.rb +0 -62
  67. data/test/test_gman_bin.rb +0 -75
  68. data/test/test_gman_country_codes.rb +0 -18
  69. data/test/test_gman_domains.rb +0 -33
  70. data/test/test_gman_filter.rb +0 -17
  71. data/test/test_gman_identifier.rb +0 -106
  72. data/test/test_gman_importer.rb +0 -250
  73. data/test/test_gman_locality.rb +0 -10
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ruby-prof'
4
5
  require './lib/gman'
@@ -17,4 +18,4 @@ end
17
18
 
18
19
  result = RubyProf.stop
19
20
  printer = RubyProf::FlatPrinter.new(result)
20
- printer.print(STDOUT)
21
+ printer.print($stdout)
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Given an array of domains, removes them from the list
3
5
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
6
 
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
12
14
  puts "Starting list: #{Gman::DomainList.current.count} domains"
13
15
 
14
16
  domains.each do |domain|
15
- list.gsub!(/^#{domain}$\n/, '')
17
+ list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
16
18
  end
17
19
 
18
- puts "Ending list: #{Gman::DomainList.current.count} domains"
19
-
20
20
  File.write './config/domains.txt', list
21
+
22
+ puts "Ending list: #{Gman::DomainList.current.count} domains"
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Reconciles the USA.gov-maintained list of US domains with domains.txt
4
6
  # to show domains listed in the USA.gov-maintained list that we reject and why
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
12
14
  blacklist = ['usagovQUASI']
13
15
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
14
16
 
15
- data = open(source).read
17
+ data = URI.open(source).read
16
18
  data = data.split('_' * 74)
17
19
  data = data.last.strip
18
20
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -20,7 +22,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
20
22
  domains = {}
21
23
  group = ''
22
24
  data.each do |row|
23
- if row =~ /^\w/
25
+ if /^\w/.match?(row)
24
26
  group = row
25
27
  domains[group] = []
26
28
  else
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
33
35
 
34
36
  importer.logger.info "Starting with #{importer.domains.count} domains"
35
37
 
36
- importer.domains.list.each do |_group, d|
38
+ importer.domains.list.each_value do |d|
37
39
  d.map! { |domain| Gman.new(domain).to_s }
38
40
  d.map! { |domain| importer.normalize_domain(domain) }
39
41
  end
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
44
46
  missing = {}
45
47
  importer.domains.list.each do |g, usagovdomains|
46
48
  next unless importer.current.list[g]
49
+
47
50
  missing[g] = importer.current.list[g] - usagovdomains
48
51
  end
49
52
 
@@ -6,7 +6,7 @@ for file in script/vendor-*; do
6
6
  echo "*************************************"
7
7
  echo "Vendoring $file"
8
8
  echo "*************************************"
9
- "$file"
9
+ bundle exec "$file"
10
10
  fi
11
11
  done
12
12
 
@@ -1,14 +1,14 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
5
6
  require './lib/gman'
6
- require './lib/gman/importer'
7
7
 
8
8
  url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
9
9
 
10
- domains = open(url).read.encode('UTF-8')
10
+ domains = URI.open(url).read.encode('UTF-8')
11
11
  domains = CSV.parse(domains, headers: true)
12
12
  domains = domains.map { |row| row['Domain Name'] }
13
13
 
14
- Gman.import('German Federal' => domains)
14
+ Gman::Importer.new('German Federal' => domains).import
@@ -1,13 +1,13 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
5
6
  require './lib/gman'
6
- require './lib/gman/importer'
7
7
 
8
8
  url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
9
9
 
10
- csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
+ csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
11
11
 
12
12
  # For some reason, the header row is actually the last row
13
13
  # Pop the last line off the file and prepend it at the beginning
@@ -20,4 +20,4 @@ csv = lines.join("\n")
20
20
  data = CSV.parse(csv, headers: true, col_sep: ';')
21
21
  domains = data.map { |row| row['Internet'] }
22
22
 
23
- Gman.import('German Municipalities' => domains)
23
+ Gman::Importer.new('German Municipalities' => domains).import
@@ -1,7 +1,10 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
3
5
 
4
6
  require 'fileutils'
7
+ require './lib/gman'
5
8
 
6
9
  FileUtils.rm_rf('almanak.overheid.nl')
7
10
  commands = [
@@ -15,4 +18,4 @@ commands = [
15
18
  ]
16
19
  domains = system commands.join('|')
17
20
 
18
- Gman.import('Netherlands' => domains.split("\n"))
21
+ Gman::Importer.new('Netherlands' => domains.split("\n")).import
@@ -1,27 +1,28 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Propagates an initial list of best-guess government domains
3
5
 
4
6
  require 'public_suffix'
5
7
  require 'yaml'
6
8
  require_relative '../lib/gman'
7
- require_relative '../lib/gman/importer'
8
9
 
9
10
  # https://gist.github.com/benbalter/6147066
10
- REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
11
+ REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
11
12
 
12
13
  domains = []
13
14
  PublicSuffix::List.default.each do |rule|
14
15
  domain = nil
15
16
 
16
17
  if rule.parts.length == 1
17
- domain = rule.parts.first if ".#{rule.value}" =~ REGEX
18
- elsif ".#{rule.value}" =~ REGEX
18
+ domain = rule.parts.first if REGEX.match?(".#{rule.value}")
19
+ elsif REGEX.match?(".#{rule.value}")
19
20
  domain = rule.parts.pop(2).join('.')
20
21
  end
21
22
 
22
23
  domains.push domain unless domain.nil? || domains.include?(domain)
23
24
  end
24
25
 
25
- # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
+ # NOTE: We want to skip resolution here, because a domain like `gov.sv` may be
26
27
  # a valid TLD, not have any top-level sites, and we'd still want it listed
27
- Gman.import({ 'non-us gov' => domains }, skip_resolve: true)
28
+ Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)
@@ -1,9 +1,9 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'mechanize'
4
5
  require 'csv'
5
6
  require './lib/gman'
6
- require './lib/gman/importer'
7
7
 
8
8
  url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
9
9
  agent = Mechanize.new
@@ -15,7 +15,7 @@ response = agent.submit(form, submit_button)
15
15
 
16
16
  rows = CSV.parse(response.content, headers: true, col_sep: "\t")
17
17
  domains = rows.map do |row|
18
- row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
18
+ row['Webbadress'] unless /UNIVERSITET/.match?(row['Namn'])
19
19
  end
20
20
 
21
- Gman.import('Swedish Administrative Authorities' => domains)
21
+ Gman::Importer.new('Swedish Administrative Authorities' => domains).import
@@ -0,0 +1,43 @@
1
+ #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Vendors the Swot-maintained list of academic domains into config/vendor/academic.txt
6
+ # Source: https://github.com/leereilly/swot/
7
+ #
8
+ # Usage: script/vendor-swot
9
+ #
10
+ # Will automatically fetch latest version of the list and merge
11
+ # You can check for changes and commit via `git status`
12
+ #
13
+ # It's also probably a good idea to run `script/cibuild` for good measure
14
+ #
15
+ # Note: We do this, because as a bajillion individual files, Swot takes up 30MB
16
+
17
+ require 'gman'
18
+ require 'swot'
19
+
20
+ # Generate array of all Swot domains
21
+ domains = Swot.all_domains
22
+ domains << Swot::ACADEMIC_TLDS
23
+
24
+ # Init the importer, building a DomainList
25
+ group = "Academic domains vendored from Swot v#{Swot::VERSION}"
26
+ hash = { group => domains }
27
+
28
+ importer = Gman::Importer.new(hash)
29
+ importer.logger.info "Importing from Swot v#{Swot::VERSION}"
30
+ importer.logger.info "Found #{domains.count} academic domains"
31
+
32
+ domain_list = importer.domain_list
33
+ domain_list.path = Gman.academic_list_path
34
+
35
+ # Cleanup and write
36
+ # Note: we're not using the import method, as that assumes we're writing the
37
+ # government domain list and would use Swot to ensure domains aren't academic
38
+ importer.send :normalize_domains!
39
+ domain_list.data[group] << Swot::BLACKLIST.map { |domain| "!#{domain}" }
40
+ domain_list.data[group] = domain_list.data[group].flatten
41
+ domain_list.write
42
+
43
+ importer.logger.info "Vendored #{importer.domain_list.count} academic domains."
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the USA.gov-maintained list of US domains into domains.txt
4
6
  # Source: https://github.com/GSA-OCSIT/govt-urls
@@ -10,12 +12,13 @@
10
12
  #
11
13
  # It's also probably a good idea to run `script/cibuild` for good measure
12
14
 
13
- require './lib/gman/importer'
15
+ require './lib/gman'
16
+ require 'open-uri'
14
17
 
15
- blacklist = %w(usagovQUASI usagovFEDgov)
18
+ blacklist = %w[usagovQUASI usagovFEDgov]
16
19
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
17
20
 
18
- data = open(source).read
21
+ data = URI.open(source).read
19
22
  data = data.split('_' * 74)
20
23
  data = data.last.strip
21
24
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -23,7 +26,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
23
26
  domains = {}
24
27
  group = ''
25
28
  data.each do |row|
26
- if row =~ /^\w/
29
+ if /^\w/.match?(row)
27
30
  group = row
28
31
  domains[group] = []
29
32
  else
@@ -32,4 +35,4 @@ data.each do |row|
32
35
  end
33
36
 
34
37
  domains.reject! { |g, _| blacklist.include?(g) }
35
- Gman.import(domains)
38
+ Gman::Importer.new(domains).import
@@ -0,0 +1,4 @@
1
+ // foo
2
+ bar.gov
3
+ baz.net
4
+ !mail.bar.gov
File without changes
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe 'Gman bin' do
4
+ let(:domain) { 'whitehouse.gov' }
5
+ let(:args) { [domain] }
6
+ let(:command) { 'gman' }
7
+ let(:bin_path) do
8
+ File.expand_path "../../bin/#{command}", File.dirname(__FILE__)
9
+ end
10
+ let(:response_parts) { Open3.capture2e('bundle', 'exec', bin_path, *args) }
11
+ let(:output) { response_parts[0] }
12
+ let(:status) { response_parts[1] }
13
+ let(:exit_code) { status.exitstatus }
14
+
15
+ context 'a valid domain' do
16
+ it 'parses the domain' do
17
+ expect(output).to match('Domain : whitehouse.gov')
18
+ end
19
+
20
+ it "knows it's valid" do
21
+ expect(output).to match('Valid government domain')
22
+ expect(exit_code).to be(0)
23
+ end
24
+
25
+ it 'knows the type' do
26
+ expect(output).to match(/federal/i)
27
+ end
28
+
29
+ it 'knows the agency' do
30
+ expect(output).to match('Executive Office of the President')
31
+ end
32
+
33
+ it 'knows the country' do
34
+ expect(output).to match('United States')
35
+ end
36
+
37
+ it 'knows the city' do
38
+ expect(output).to match('Washington')
39
+ end
40
+
41
+ it 'knows the state' do
42
+ expect(output).to match('DC')
43
+ end
44
+
45
+ it 'colors by default' do
46
+ expect(output).to match(/\e\[32m/)
47
+ end
48
+
49
+ context 'with colorization disabled' do
50
+ let(:args) { [domain, '--no-color'] }
51
+
52
+ it "doesn't color" do
53
+ expect(output).not_to match(/\e\[32m/)
54
+ end
55
+ end
56
+ end
57
+
58
+ context 'with no args' do
59
+ let(:args) { [] }
60
+
61
+ it 'displays the help text' do
62
+ expect(output).to match('USAGE')
63
+ end
64
+ end
65
+
66
+ context 'an invalid domain' do
67
+ let(:domain) { 'foo.invalid' }
68
+
69
+ it 'knows the domain is invalid' do
70
+ expect(output).to match('Invalid domain')
71
+ expect(exit_code).to be(1)
72
+ end
73
+ end
74
+
75
+ context 'a non-government domain' do
76
+ let(:domain) { 'github.com' }
77
+
78
+ it "knows it's not a government domain" do
79
+ expect(output).to match('Not a government domain')
80
+ expect(exit_code).to be(1)
81
+ end
82
+ end
83
+
84
+ context 'filtering' do
85
+ let(:command) { 'gman_filter' }
86
+ let(:txt_path) do
87
+ File.expand_path '../fixtures/obama.txt', File.dirname(__FILE__)
88
+ end
89
+ let(:args) { [txt_path] }
90
+
91
+ it 'returns only government domains' do
92
+ expected = <<~EXPECTED
93
+ mr.senator@obama.senate.gov
94
+ president@whitehouse.gov
95
+ commander.in.chief@us.army.mil
96
+ EXPECTED
97
+
98
+ expect(output).to eql(expected)
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe 'Gman Country Codes' do
4
+ {
5
+ 'whitehouse.gov' => 'United States of America',
6
+ 'foo.gov.uk' => 'United Kingdom of Great Britain and Northern Ireland',
7
+ 'army.mil' => 'United States of America',
8
+ 'foo.gc.ca' => 'Canada',
9
+ 'foo.eu' => nil
10
+ }.each do |domain, expected_country|
11
+ context "given #{domain.inspect}" do
12
+ subject { Gman.new(domain) }
13
+
14
+ let(:country) { subject.country }
15
+
16
+ it 'knows the country' do
17
+ if expected_country.nil?
18
+ expect(country).to be_nil
19
+ else
20
+ expect(country.name).to eql(expected_country)
21
+ end
22
+ end
23
+
24
+ it 'knows the alpha2' do
25
+ expected = case expected_country
26
+ when 'United States of America'
27
+ 'us'
28
+ when 'Canada'
29
+ 'ca'
30
+ when 'United Kingdom of Great Britain and Northern Ireland'
31
+ 'gb'
32
+ else
33
+ 'eu'
34
+ end
35
+ expect(subject.alpha2).to eql(expected)
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Gman::DomainList do
4
+ let(:data) { subject.data }
5
+ let(:canada) { data['Canada municipal'] }
6
+
7
+ %i[path contents data].each do |type|
8
+ context "when initialized by #{type}" do
9
+ subject do
10
+ case type
11
+ when :path
12
+ described_class.new(path: Gman.list_path)
13
+ when :contents
14
+ contents = File.read(Gman.list_path)
15
+ described_class.new(contents: contents)
16
+ when :data
17
+ data = described_class.new(path: Gman.list_path).to_h
18
+ described_class.new(data: data)
19
+ end
20
+ end
21
+
22
+ it 'stores the init var' do
23
+ expect(subject.send(type)).not_to be_nil
24
+ end
25
+
26
+ it 'returns the domain data' do
27
+ expect(data).to have_key('Canada federal')
28
+ expect(data.values.flatten).to include('gov')
29
+ end
30
+
31
+ it 'returns the list contents' do
32
+ expect(subject.contents).to match(/^gov$/)
33
+ end
34
+
35
+ it 'knows the list path' do
36
+ expect(subject.path).to eql(Gman.list_path)
37
+ end
38
+
39
+ it 'returns the PublicSuffix list' do
40
+ expect(subject.public_suffix_list).to be_a(PublicSuffix::List)
41
+ end
42
+
43
+ it 'knows if a domain is valid' do
44
+ expect(subject.valid?('whitehouse.gov')).to be(true)
45
+ end
46
+
47
+ it 'knows if a domain is invalid' do
48
+ expect(subject.valid?('example.com')).to be(false)
49
+ end
50
+
51
+ it 'returns the domain groups' do
52
+ expect(subject.groups).to include('Canada federal')
53
+ end
54
+
55
+ it 'returns the domains' do
56
+ expect(subject.domains).to include('gov')
57
+ end
58
+
59
+ it 'returns the domain count' do
60
+ expect(subject.count).to be_a(Integer)
61
+ expect(subject.count).to be > 100
62
+ end
63
+
64
+ it 'alphabetizes the list' do
65
+ canada.shuffle!
66
+ expect(canada.first).not_to eql('100milehouse.com')
67
+ subject.alphabetize
68
+ expect(canada.first).to eql('100milehouse.com')
69
+ end
70
+
71
+ it 'outputs public suffix format' do
72
+ expect(subject.to_s).to match("// Canada federal\ncanada\.ca\n")
73
+ end
74
+
75
+ it "finds a domain's parent" do
76
+ expect(subject.parent_domain('foo.gov.uk')).to eql('gov.uk')
77
+ end
78
+
79
+ context 'with the list path stubbed' do
80
+ let(:stubbed_file_contents) { File.read(stubbed_list_path) }
81
+
82
+ before do
83
+ subject.instance_variable_set('@path', stubbed_list_path)
84
+ end
85
+
86
+ context 'with list data stubbed' do
87
+ before do
88
+ subject.data = { 'foo' => ['!mail.bar.gov', 'bar.gov', 'baz.net'] }
89
+ end
90
+
91
+ context 'alphabetizing' do
92
+ before { subject.alphabetize }
93
+
94
+ it 'puts exceptions last' do
95
+ expect(subject.data['foo'].last).to eql('!mail.bar.gov')
96
+ end
97
+ end
98
+
99
+ context 'writing' do
100
+ before { subject.write }
101
+
102
+ it 'writes the contents' do
103
+ expect(stubbed_file_contents).to match("// foo\nbar.gov\nbaz.net")
104
+ end
105
+ end
106
+ end
107
+ end
108
+ end
109
+ end
110
+ end