gman 6.0.1 → 7.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (73) hide show
  1. checksums.yaml +5 -5
  2. data/.github/CODEOWNERS +3 -0
  3. data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
  4. data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
  5. data/.github/config.yml +23 -0
  6. data/.github/funding.yml +1 -0
  7. data/.github/no-response.yml +15 -0
  8. data/.github/release-drafter.yml +4 -0
  9. data/.github/settings.yml +33 -0
  10. data/.github/stale.yml +29 -0
  11. data/.gitignore +1 -0
  12. data/.rspec +2 -0
  13. data/.rubocop.yml +17 -5
  14. data/.rubocop_todo.yml +84 -0
  15. data/.ruby-version +1 -1
  16. data/Gemfile +2 -0
  17. data/bin/gman +6 -4
  18. data/bin/gman_filter +5 -7
  19. data/config/domains.txt +8446 -173
  20. data/config/vendor/academic.txt +8038 -0
  21. data/config/vendor/dotgovs.csv +5786 -5560
  22. data/docs/CODE_OF_CONDUCT.md +46 -0
  23. data/docs/CONTRIBUTING.md +92 -0
  24. data/{README.md → docs/README.md} +3 -3
  25. data/docs/SECURITY.md +3 -0
  26. data/docs/_config.yml +2 -0
  27. data/gman.gemspec +18 -17
  28. data/lib/gman.rb +25 -21
  29. data/lib/gman/country_codes.rb +17 -17
  30. data/lib/gman/domain_list.rb +123 -41
  31. data/lib/gman/identifier.rb +59 -21
  32. data/lib/gman/importer.rb +39 -40
  33. data/lib/gman/locality.rb +23 -21
  34. data/lib/gman/version.rb +3 -1
  35. data/script/add +2 -0
  36. data/script/alphabetize +2 -0
  37. data/script/cibuild +1 -1
  38. data/script/dedupe +2 -1
  39. data/script/profile +2 -1
  40. data/script/prune +5 -3
  41. data/script/reconcile-us +6 -3
  42. data/script/vendor +1 -1
  43. data/script/vendor-federal-de +3 -3
  44. data/script/vendor-municipal-de +3 -3
  45. data/script/vendor-nl +4 -1
  46. data/script/vendor-public-suffix +7 -6
  47. data/script/vendor-se +3 -3
  48. data/script/vendor-swot +43 -0
  49. data/script/vendor-us +8 -5
  50. data/spec/fixtures/domains.txt +4 -0
  51. data/{test → spec}/fixtures/obama.txt +0 -0
  52. data/spec/gman/bin_spec.rb +101 -0
  53. data/spec/gman/country_code_spec.rb +39 -0
  54. data/spec/gman/domain_list_spec.rb +110 -0
  55. data/spec/gman/domains_spec.rb +25 -0
  56. data/spec/gman/identifier_spec.rb +218 -0
  57. data/spec/gman/importer_spec.rb +236 -0
  58. data/spec/gman/locality_spec.rb +24 -0
  59. data/spec/gman_spec.rb +74 -0
  60. data/spec/spec_helper.rb +31 -0
  61. metadata +86 -73
  62. data/CONTRIBUTING.md +0 -22
  63. data/Rakefile +0 -22
  64. data/test/fixtures/domains.txt +0 -2
  65. data/test/helper.rb +0 -40
  66. data/test/test_gman.rb +0 -62
  67. data/test/test_gman_bin.rb +0 -75
  68. data/test/test_gman_country_codes.rb +0 -18
  69. data/test/test_gman_domains.rb +0 -33
  70. data/test/test_gman_filter.rb +0 -17
  71. data/test/test_gman_identifier.rb +0 -106
  72. data/test/test_gman_importer.rb +0 -250
  73. data/test/test_gman_locality.rb +0 -10
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ruby-prof'
4
5
  require './lib/gman'
@@ -17,4 +18,4 @@ end
17
18
 
18
19
  result = RubyProf.stop
19
20
  printer = RubyProf::FlatPrinter.new(result)
20
- printer.print(STDOUT)
21
+ printer.print($stdout)
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Given an array of domains, removes them from the list
3
5
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
6
 
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
12
14
  puts "Starting list: #{Gman::DomainList.current.count} domains"
13
15
 
14
16
  domains.each do |domain|
15
- list.gsub!(/^#{domain}$\n/, '')
17
+ list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
16
18
  end
17
19
 
18
- puts "Ending list: #{Gman::DomainList.current.count} domains"
19
-
20
20
  File.write './config/domains.txt', list
21
+
22
+ puts "Ending list: #{Gman::DomainList.current.count} domains"
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Reconciles the USA.gov-maintained list of US domains with domains.txt
4
6
  # to show domains listed in the USA.gov-maintained list that we reject and why
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
12
14
  blacklist = ['usagovQUASI']
13
15
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
14
16
 
15
- data = open(source).read
17
+ data = URI.open(source).read
16
18
  data = data.split('_' * 74)
17
19
  data = data.last.strip
18
20
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -20,7 +22,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
20
22
  domains = {}
21
23
  group = ''
22
24
  data.each do |row|
23
- if row =~ /^\w/
25
+ if /^\w/.match?(row)
24
26
  group = row
25
27
  domains[group] = []
26
28
  else
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
33
35
 
34
36
  importer.logger.info "Starting with #{importer.domains.count} domains"
35
37
 
36
- importer.domains.list.each do |_group, d|
38
+ importer.domains.list.each_value do |d|
37
39
  d.map! { |domain| Gman.new(domain).to_s }
38
40
  d.map! { |domain| importer.normalize_domain(domain) }
39
41
  end
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
44
46
  missing = {}
45
47
  importer.domains.list.each do |g, usagovdomains|
46
48
  next unless importer.current.list[g]
49
+
47
50
  missing[g] = importer.current.list[g] - usagovdomains
48
51
  end
49
52
 
@@ -6,7 +6,7 @@ for file in script/vendor-*; do
6
6
  echo "*************************************"
7
7
  echo "Vendoring $file"
8
8
  echo "*************************************"
9
- "$file"
9
+ bundle exec "$file"
10
10
  fi
11
11
  done
12
12
 
@@ -1,14 +1,14 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
5
6
  require './lib/gman'
6
- require './lib/gman/importer'
7
7
 
8
8
  url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
9
9
 
10
- domains = open(url).read.encode('UTF-8')
10
+ domains = URI.open(url).read.encode('UTF-8')
11
11
  domains = CSV.parse(domains, headers: true)
12
12
  domains = domains.map { |row| row['Domain Name'] }
13
13
 
14
- Gman.import('German Federal' => domains)
14
+ Gman::Importer.new('German Federal' => domains).import
@@ -1,13 +1,13 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
5
6
  require './lib/gman'
6
- require './lib/gman/importer'
7
7
 
8
8
  url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
9
9
 
10
- csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
+ csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
11
11
 
12
12
  # For some reason, the header row is actually the last row
13
13
  # Pop the last line off the file and prepend it at the beginning
@@ -20,4 +20,4 @@ csv = lines.join("\n")
20
20
  data = CSV.parse(csv, headers: true, col_sep: ';')
21
21
  domains = data.map { |row| row['Internet'] }
22
22
 
23
- Gman.import('German Municipalities' => domains)
23
+ Gman::Importer.new('German Municipalities' => domains).import
@@ -1,7 +1,10 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
3
5
 
4
6
  require 'fileutils'
7
+ require './lib/gman'
5
8
 
6
9
  FileUtils.rm_rf('almanak.overheid.nl')
7
10
  commands = [
@@ -15,4 +18,4 @@ commands = [
15
18
  ]
16
19
  domains = system commands.join('|')
17
20
 
18
- Gman.import('Netherlands' => domains.split("\n"))
21
+ Gman::Importer.new('Netherlands' => domains.split("\n")).import
@@ -1,27 +1,28 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Propagates an initial list of best-guess government domains
3
5
 
4
6
  require 'public_suffix'
5
7
  require 'yaml'
6
8
  require_relative '../lib/gman'
7
- require_relative '../lib/gman/importer'
8
9
 
9
10
  # https://gist.github.com/benbalter/6147066
10
- REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
11
+ REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
11
12
 
12
13
  domains = []
13
14
  PublicSuffix::List.default.each do |rule|
14
15
  domain = nil
15
16
 
16
17
  if rule.parts.length == 1
17
- domain = rule.parts.first if ".#{rule.value}" =~ REGEX
18
- elsif ".#{rule.value}" =~ REGEX
18
+ domain = rule.parts.first if REGEX.match?(".#{rule.value}")
19
+ elsif REGEX.match?(".#{rule.value}")
19
20
  domain = rule.parts.pop(2).join('.')
20
21
  end
21
22
 
22
23
  domains.push domain unless domain.nil? || domains.include?(domain)
23
24
  end
24
25
 
25
- # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
+ # NOTE: We want to skip resolution here, because a domain like `gov.sv` may be
26
27
  # a valid TLD, not have any top-level sites, and we'd still want it listed
27
- Gman.import({ 'non-us gov' => domains }, skip_resolve: true)
28
+ Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)
@@ -1,9 +1,9 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'mechanize'
4
5
  require 'csv'
5
6
  require './lib/gman'
6
- require './lib/gman/importer'
7
7
 
8
8
  url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
9
9
  agent = Mechanize.new
@@ -15,7 +15,7 @@ response = agent.submit(form, submit_button)
15
15
 
16
16
  rows = CSV.parse(response.content, headers: true, col_sep: "\t")
17
17
  domains = rows.map do |row|
18
- row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
18
+ row['Webbadress'] unless /UNIVERSITET/.match?(row['Namn'])
19
19
  end
20
20
 
21
- Gman.import('Swedish Administrative Authorities' => domains)
21
+ Gman::Importer.new('Swedish Administrative Authorities' => domains).import
@@ -0,0 +1,43 @@
1
+ #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Vendors the Swot-maintained list of academic domains into config/academic.txt
6
+ # Source: https://github.com/leereilly/swot/
7
+ #
8
+ # Usage: script/vendor-swot
9
+ #
10
+ # Will automatically fetch latest version of the list and merge
11
+ # You can check for changes and commit via `git status`
12
+ #
13
+ # It's also probably a good idea to run `script/cibuild` for good measure
14
+ #
15
+ # Note: We do this, because as a bajillion individual files, Swot takes up 30MB
16
+
17
+ require 'gman'
18
+ require 'swot'
19
+
20
+ # Generate array of all Swot domains
21
+ domains = Swot.all_domains
22
+ domains << Swot::ACADEMIC_TLDS
23
+
24
+ # Init the importer, building a DomainList
25
+ group = "Academic domains vendored from Swot v#{Swot::VERSION}"
26
+ hash = { group => domains }
27
+
28
+ importer = Gman::Importer.new(hash)
29
+ importer.logger.info "Importing from Swot v#{Swot::VERSION}"
30
+ importer.logger.info "Found #{domains.count} academic domains"
31
+
32
+ domain_list = importer.domain_list
33
+ domain_list.path = Gman.academic_list_path
34
+
35
+ # Cleanup and write
36
+ # Note: we're not using the import method, as that assumes we're writing the
37
+ # government domain list and would use Swot to ensure domains aren't academic
38
+ importer.send :normalize_domains!
39
+ domain_list.data[group] << Swot::BLACKLIST.map { |domain| "!#{domain}" }
40
+ domain_list.data[group] = domain_list.data[group].flatten
41
+ domain_list.write
42
+
43
+ importer.logger.info "Vendored #{importer.domain_list.count} academic domains."
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the USA.gov-maintained list of US domains into domains.txt
4
6
  # Source: https://github.com/GSA-OCSIT/govt-urls
@@ -10,12 +12,13 @@
10
12
  #
11
13
  # It's also probably a good idea to run `script/cibuild` for good measure
12
14
 
13
- require './lib/gman/importer'
15
+ require './lib/gman'
16
+ require 'open-uri'
14
17
 
15
- blacklist = %w(usagovQUASI usagovFEDgov)
18
+ blacklist = %w[usagovQUASI usagovFEDgov]
16
19
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
17
20
 
18
- data = open(source).read
21
+ data = URI.open(source).read
19
22
  data = data.split('_' * 74)
20
23
  data = data.last.strip
21
24
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -23,7 +26,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
23
26
  domains = {}
24
27
  group = ''
25
28
  data.each do |row|
26
- if row =~ /^\w/
29
+ if /^\w/.match?(row)
27
30
  group = row
28
31
  domains[group] = []
29
32
  else
@@ -32,4 +35,4 @@ data.each do |row|
32
35
  end
33
36
 
34
37
  domains.reject! { |g, _| blacklist.include?(g) }
35
- Gman.import(domains)
38
+ Gman::Importer.new(domains).import
@@ -0,0 +1,4 @@
1
+ // foo
2
+ bar.gov
3
+ baz.net
4
+ !mail.bar.gov
File without changes
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe 'Gman bin' do
4
+ let(:domain) { 'whitehouse.gov' }
5
+ let(:args) { [domain] }
6
+ let(:command) { 'gman' }
7
+ let(:bin_path) do
8
+ File.expand_path "../../bin/#{command}", File.dirname(__FILE__)
9
+ end
10
+ let(:response_parts) { Open3.capture2e('bundle', 'exec', bin_path, *args) }
11
+ let(:output) { response_parts[0] }
12
+ let(:status) { response_parts[1] }
13
+ let(:exit_code) { status.exitstatus }
14
+
15
+ context 'a valid domain' do
16
+ it 'parses the domain' do
17
+ expect(output).to match('Domain : whitehouse.gov')
18
+ end
19
+
20
+ it "knows it's valid" do
21
+ expect(output).to match('Valid government domain')
22
+ expect(exit_code).to be(0)
23
+ end
24
+
25
+ it 'knows the type' do
26
+ expect(output).to match(/federal/i)
27
+ end
28
+
29
+ it 'knows the agency' do
30
+ expect(output).to match('Executive Office of the President')
31
+ end
32
+
33
+ it 'knows the country' do
34
+ expect(output).to match('United States')
35
+ end
36
+
37
+ it 'knows the city' do
38
+ expect(output).to match('Washington')
39
+ end
40
+
41
+ it 'knows the state' do
42
+ expect(output).to match('DC')
43
+ end
44
+
45
+ it 'colors by default' do
46
+ expect(output).to match(/\e\[32m/)
47
+ end
48
+
49
+ context 'with colorization disabled' do
50
+ let(:args) { [domain, '--no-color'] }
51
+
52
+ it "doesn't color" do
53
+ expect(output).not_to match(/\e\[32m/)
54
+ end
55
+ end
56
+ end
57
+
58
+ context 'with no args' do
59
+ let(:args) { [] }
60
+
61
+ it 'displays the help text' do
62
+ expect(output).to match('USAGE')
63
+ end
64
+ end
65
+
66
+ context 'an invalid domain' do
67
+ let(:domain) { 'foo.invalid' }
68
+
69
+ it 'knows the domain is invalid' do
70
+ expect(output).to match('Invalid domain')
71
+ expect(exit_code).to be(1)
72
+ end
73
+ end
74
+
75
+ context 'a non-government domain' do
76
+ let(:domain) { 'github.com' }
77
+
78
+ it "knows it's not a government domain" do
79
+ expect(output).to match('Not a government domain')
80
+ expect(exit_code).to be(1)
81
+ end
82
+ end
83
+
84
+ context 'filtering' do
85
+ let(:command) { 'gman_filter' }
86
+ let(:txt_path) do
87
+ File.expand_path '../fixtures/obama.txt', File.dirname(__FILE__)
88
+ end
89
+ let(:args) { [txt_path] }
90
+
91
+ it 'returns only government domains' do
92
+ expected = <<~EXPECTED
93
+ mr.senator@obama.senate.gov
94
+ president@whitehouse.gov
95
+ commander.in.chief@us.army.mil
96
+ EXPECTED
97
+
98
+ expect(output).to eql(expected)
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe 'Gman Country Codes' do
4
+ {
5
+ 'whitehouse.gov' => 'United States of America',
6
+ 'foo.gov.uk' => 'United Kingdom of Great Britain and Northern Ireland',
7
+ 'army.mil' => 'United States of America',
8
+ 'foo.gc.ca' => 'Canada',
9
+ 'foo.eu' => nil
10
+ }.each do |domain, expected_country|
11
+ context "given #{domain.inspect}" do
12
+ subject { Gman.new(domain) }
13
+
14
+ let(:country) { subject.country }
15
+
16
+ it 'knows the country' do
17
+ if expected_country.nil?
18
+ expect(country).to be_nil
19
+ else
20
+ expect(country.name).to eql(expected_country)
21
+ end
22
+ end
23
+
24
+ it 'knows the alpha2' do
25
+ expected = case expected_country
26
+ when 'United States of America'
27
+ 'us'
28
+ when 'Canada'
29
+ 'ca'
30
+ when 'United Kingdom of Great Britain and Northern Ireland'
31
+ 'gb'
32
+ else
33
+ 'eu'
34
+ end
35
+ expect(subject.alpha2).to eql(expected)
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Gman::DomainList do
4
+ let(:data) { subject.data }
5
+ let(:canada) { data['Canada municipal'] }
6
+
7
+ %i[path contents data].each do |type|
8
+ context "when initialized by #{type}" do
9
+ subject do
10
+ case type
11
+ when :path
12
+ described_class.new(path: Gman.list_path)
13
+ when :contents
14
+ contents = File.read(Gman.list_path)
15
+ described_class.new(contents: contents)
16
+ when :data
17
+ data = described_class.new(path: Gman.list_path).to_h
18
+ described_class.new(data: data)
19
+ end
20
+ end
21
+
22
+ it 'stores the init var' do
23
+ expect(subject.send(type)).not_to be_nil
24
+ end
25
+
26
+ it 'returns the domain data' do
27
+ expect(data).to have_key('Canada federal')
28
+ expect(data.values.flatten).to include('gov')
29
+ end
30
+
31
+ it 'returns the list contents' do
32
+ expect(subject.contents).to match(/^gov$/)
33
+ end
34
+
35
+ it 'knows the list path' do
36
+ expect(subject.path).to eql(Gman.list_path)
37
+ end
38
+
39
+ it 'returns the PublicSuffix list' do
40
+ expect(subject.public_suffix_list).to be_a(PublicSuffix::List)
41
+ end
42
+
43
+ it 'knows if a domain is valid' do
44
+ expect(subject.valid?('whitehouse.gov')).to be(true)
45
+ end
46
+
47
+ it 'knows if a domain is invalid' do
48
+ expect(subject.valid?('example.com')).to be(false)
49
+ end
50
+
51
+ it 'returns the domain groups' do
52
+ expect(subject.groups).to include('Canada federal')
53
+ end
54
+
55
+ it 'returns the domains' do
56
+ expect(subject.domains).to include('gov')
57
+ end
58
+
59
+ it 'returns the domain count' do
60
+ expect(subject.count).to be_a(Integer)
61
+ expect(subject.count).to be > 100
62
+ end
63
+
64
+ it 'alphabetizes the list' do
65
+ canada.shuffle!
66
+ expect(canada.first).not_to eql('100milehouse.com')
67
+ subject.alphabetize
68
+ expect(canada.first).to eql('100milehouse.com')
69
+ end
70
+
71
+ it 'outputs public suffix format' do
72
+ expect(subject.to_s).to match("// Canada federal\ncanada\.ca\n")
73
+ end
74
+
75
+ it "finds a domain's parent" do
76
+ expect(subject.parent_domain('foo.gov.uk')).to eql('gov.uk')
77
+ end
78
+
79
+ context 'with the list path stubbed' do
80
+ let(:stubbed_file_contents) { File.read(stubbed_list_path) }
81
+
82
+ before do
83
+ subject.instance_variable_set('@path', stubbed_list_path)
84
+ end
85
+
86
+ context 'with list data stubbed' do
87
+ before do
88
+ subject.data = { 'foo' => ['!mail.bar.gov', 'bar.gov', 'baz.net'] }
89
+ end
90
+
91
+ context 'alphabetizing' do
92
+ before { subject.alphabetize }
93
+
94
+ it 'puts exceptions last' do
95
+ expect(subject.data['foo'].last).to eql('!mail.bar.gov')
96
+ end
97
+ end
98
+
99
+ context 'writing' do
100
+ before { subject.write }
101
+
102
+ it 'writes the contents' do
103
+ expect(stubbed_file_contents).to match("// foo\nbar.gov\nbaz.net")
104
+ end
105
+ end
106
+ end
107
+ end
108
+ end
109
+ end
110
+ end