gman 7.0.2 → 7.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +5 -5
  2. data/.github/CODEOWNERS +3 -0
  3. data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
  4. data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
  5. data/.github/config.yml +23 -0
  6. data/.github/no-response.yml +15 -0
  7. data/.github/release-drafter.yml +4 -0
  8. data/.github/settings.yml +33 -0
  9. data/.github/stale.yml +29 -0
  10. data/.rubocop.yml +5 -5
  11. data/.ruby-version +1 -1
  12. data/Gemfile +2 -0
  13. data/bin/gman +3 -1
  14. data/bin/gman_filter +3 -5
  15. data/config/domains.txt +191 -134
  16. data/config/vendor/dotgovs.csv +5786 -5634
  17. data/docs/CODE_OF_CONDUCT.md +46 -0
  18. data/docs/CONTRIBUTING.md +92 -0
  19. data/{README.md → docs/README.md} +2 -2
  20. data/docs/_config.yml +2 -0
  21. data/gman.gemspec +16 -15
  22. data/lib/gman.rb +4 -1
  23. data/lib/gman/country_codes.rb +19 -19
  24. data/lib/gman/domain_list.rb +10 -6
  25. data/lib/gman/identifier.rb +55 -17
  26. data/lib/gman/importer.rb +27 -18
  27. data/lib/gman/locality.rb +8 -6
  28. data/lib/gman/version.rb +3 -1
  29. data/script/add +2 -0
  30. data/script/alphabetize +2 -0
  31. data/script/dedupe +1 -0
  32. data/script/profile +1 -0
  33. data/script/prune +5 -3
  34. data/script/reconcile-us +5 -2
  35. data/script/vendor-federal-de +2 -1
  36. data/script/vendor-municipal-de +2 -1
  37. data/script/vendor-nl +2 -0
  38. data/script/vendor-public-suffix +3 -1
  39. data/script/vendor-se +1 -0
  40. data/script/vendor-swot +2 -0
  41. data/script/vendor-us +4 -2
  42. data/spec/gman/bin_spec.rb +8 -6
  43. data/spec/gman/country_code_spec.rb +6 -4
  44. data/spec/gman/domain_list_spec.rb +3 -1
  45. data/spec/gman/domains_spec.rb +3 -0
  46. data/spec/gman/identifier_spec.rb +38 -3
  47. data/spec/gman/importer_spec.rb +9 -7
  48. data/spec/gman/locality_spec.rb +2 -0
  49. data/spec/gman_spec.rb +2 -0
  50. data/spec/spec_helper.rb +2 -0
  51. metadata +52 -44
  52. data/CONTRIBUTING.md +0 -22
  53. data/contributing.json +0 -32
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Utility functions for parsing and manipulating public-suffix domain lists
2
4
  # Only used in development and not loaded by default
3
5
  require 'yaml'
@@ -13,7 +15,7 @@ class Gman
13
15
  attr_accessor :domain_list
14
16
 
15
17
  # Known false positives from vendored lists
16
- BLACKLIST = %w(
18
+ BLACKLIST = %w[
17
19
  business.centurytel.net
18
20
  chesnee.net
19
21
  citlink.net
@@ -39,23 +41,24 @@ class Gman
39
41
  wctc.net
40
42
  webconnections.net
41
43
  webpages.charter.net
42
- ).freeze
44
+ ].freeze
43
45
 
44
46
  REGEX_CHECKS = {
45
- 'home. regex' => /^home\./,
46
- 'user. regex' => /^users?\./,
47
- 'sites. regex' => /^sites?\./,
48
- 'weebly' => /weebly\.com$/,
49
- 'wordpress' => /wordpress\.com$/,
50
- 'govoffice' => /govoffice\d?\.com$/,
51
- 'homestead' => /homestead\.com$/,
52
- 'wix.com' => /wix\.com$/,
53
- 'blogspot.com' => /blogspot\.com$/,
54
- 'tripod.com' => /tripod\.com$/,
47
+ 'home. regex' => /^home\./,
48
+ 'user. regex' => /^users?\./,
49
+ 'sites. regex' => /^sites?\./,
50
+ 'weebly' => /weebly\.com$/,
51
+ 'wordpress' => /wordpress\.com$/,
52
+ 'govoffice' => /govoffice\d?\.com$/,
53
+ 'homestead' => /homestead\.com$/,
54
+ 'wix.com' => /wix\.com$/,
55
+ 'blogspot.com' => /blogspot\.com$/,
56
+ 'tripod.com' => /tripod\.com$/,
55
57
  'squarespace.com' => /squarespace\.com$/,
56
- 'github.io' => /github\.io$/,
57
- 'tumblr' => /tumblr\.com$/,
58
- 'locality' => Gman::Locality::REGEX
58
+ 'github.io' => /github\.io$/,
59
+ 'tumblr' => /tumblr\.com$/,
60
+ 'locality' => Gman::Locality::REGEX,
61
+ 'french edu' => /^ac-.*?\.fr/
59
62
  }.freeze
60
63
 
61
64
  def initialize(domains)
@@ -75,6 +78,7 @@ class Gman
75
78
  return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
76
79
  return false unless ensure_valid(domain)
77
80
  return false if !options[:skip_resolve] && !ensure_resolves(domain)
81
+
78
82
  true
79
83
  end
80
84
 
@@ -82,6 +86,7 @@ class Gman
82
86
  # rather than a bool and silence log output
83
87
  def reject(domain, reason)
84
88
  return reason if ENV['RECONCILING']
89
+
85
90
  logger.info "👎 `#{domain}`: #{reason}"
86
91
  false
87
92
  end
@@ -102,13 +107,14 @@ class Gman
102
107
  end
103
108
 
104
109
  def resolver
105
- @resolver ||= Resolv::DNS.new(nameserver: ['8.8.8.8', '8.8.4.4'])
110
+ @resolver ||= Resolv::DNS.new(nameserver: ['1.1.1.1', '8.8.8.8'])
106
111
  end
107
112
 
108
113
  # Verifies that the given domain is an IP or has an NS or MX record, and thus is valid
109
114
  def domain_resolves?(domain)
110
115
  domain = Addressable::URI.new(host: domain).normalize.host
111
116
  return true if ip?(domain)
117
+
112
118
  returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
113
119
  end
114
120
 
@@ -123,6 +129,7 @@ class Gman
123
129
 
124
130
  def ensure_valid(domain)
125
131
  return false if domain.empty?
132
+
126
133
  if BLACKLIST.include?(domain)
127
134
  reject(domain, 'blacklist')
128
135
  elsif !PublicSuffix.valid?("foo.#{domain}")
@@ -136,11 +143,13 @@ class Gman
136
143
 
137
144
  def ensure_resolves(domain)
138
145
  return reject(domain, 'unresolvable') unless domain_resolves?(domain)
146
+
139
147
  true
140
148
  end
141
149
 
142
150
  def ensure_not_dupe(domain)
143
151
  return true unless dupe?(domain)
152
+
144
153
  if current.domains.include?(domain)
145
154
  reject(domain, 'duplicate')
146
155
  else
@@ -154,14 +163,14 @@ class Gman
154
163
  end
155
164
 
156
165
  def normalize_domains!
157
- domain_list.to_h.each do |_group, domains|
166
+ domain_list.to_h.each_value do |domains|
158
167
  domains.map! { |domain| normalize_domain(domain) }
159
168
  domains.uniq!
160
169
  end
161
170
  end
162
171
 
163
172
  def ensure_validity!(options = {})
164
- domain_list.data.each do |_group, domains|
173
+ domain_list.data.each_value do |domains|
165
174
  domains.select! { |domain| valid_domain?(domain, options) }
166
175
  end
167
176
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Gman
2
4
  # Second level .us domains for states and locality
3
5
  # See http://en.wikipedia.org/wiki/.us
@@ -12,18 +14,18 @@ class Gman
12
14
  # * k12.il.us
13
15
  # * ci.foo.zx.us
14
16
  class Locality
15
- AFFINITY_NAMESPACES = %w(state dst cog).freeze
17
+ AFFINITY_NAMESPACES = %w[state dst cog].freeze
16
18
 
17
- STATES = %w(
19
+ STATES = %w[
18
20
  ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
19
21
  la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
20
22
  ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
21
- ).freeze
23
+ ].freeze
22
24
 
23
- LOCALITY_DOMAINS = %w(
25
+ LOCALITY_DOMAINS = %w[
24
26
  ci co borough boro city county
25
27
  parish town twp vi vil village
26
- ).freeze
28
+ ].freeze
27
29
 
28
30
  REGEX = /
29
31
  (
@@ -31,7 +33,7 @@ class Gman
31
33
  |
32
34
  (#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
33
35
  )\.(#{Regexp.union(STATES)})\.us
34
- /x
36
+ /x.freeze
35
37
 
36
38
  def self.valid?(domain)
37
39
  !domain.to_s.match(Locality::REGEX).nil?
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Gman
2
- VERSION = '7.0.2'.freeze
4
+ VERSION = '7.0.3'
3
5
  end
data/script/add CHANGED
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Add one or more domains to a given group, running the standard import checks
4
6
  #
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Alphabetizes entries in the domains.txt file
4
6
  #
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'yaml'
4
5
  require 'open-uri'
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ruby-prof'
4
5
  require './lib/gman'
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Given an array of domains, removes them from the list
3
5
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
6
 
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
12
14
  puts "Starting list: #{Gman::DomainList.current.count} domains"
13
15
 
14
16
  domains.each do |domain|
15
- list.gsub!(/^#{domain}$\n/, '')
17
+ list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
16
18
  end
17
19
 
18
- puts "Ending list: #{Gman::DomainList.current.count} domains"
19
-
20
20
  File.write './config/domains.txt', list
21
+
22
+ puts "Ending list: #{Gman::DomainList.current.count} domains"
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Reconciles the USA.gov-maintained list of US domains with domains.txt
4
6
  # to show domains listed in the USA.gov-maintained list that we reject and why
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
12
14
  blacklist = ['usagovQUASI']
13
15
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
14
16
 
15
- data = open(source).read
17
+ data = URI.open(source).read
16
18
  data = data.split('_' * 74)
17
19
  data = data.last.strip
18
20
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
33
35
 
34
36
  importer.logger.info "Starting with #{importer.domains.count} domains"
35
37
 
36
- importer.domains.list.each do |_group, d|
38
+ importer.domains.list.each_value do |d|
37
39
  d.map! { |domain| Gman.new(domain).to_s }
38
40
  d.map! { |domain| importer.normalize_domain(domain) }
39
41
  end
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
44
46
  missing = {}
45
47
  importer.domains.list.each do |g, usagovdomains|
46
48
  next unless importer.current.list[g]
49
+
47
50
  missing[g] = importer.current.list[g] - usagovdomains
48
51
  end
49
52
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
@@ -6,7 +7,7 @@ require './lib/gman'
6
7
 
7
8
  url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
8
9
 
9
- domains = open(url).read.encode('UTF-8')
10
+ domains = URI.open(url).read.encode('UTF-8')
10
11
  domains = CSV.parse(domains, headers: true)
11
12
  domains = domains.map { |row| row['Domain Name'] }
12
13
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
@@ -6,7 +7,7 @@ require './lib/gman'
6
7
 
7
8
  url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
8
9
 
9
- csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
+ csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
11
 
11
12
  # For some reason, the header row is actually the last row
12
13
  # Pop the last line off the file and prepend it at the beginning
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
3
5
 
4
6
  require 'fileutils'
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Propagates an initial list of best-guess government domains
3
5
 
4
6
  require 'public_suffix'
@@ -6,7 +8,7 @@ require 'yaml'
6
8
  require_relative '../lib/gman'
7
9
 
8
10
  # https://gist.github.com/benbalter/6147066
9
- REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
11
+ REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
10
12
 
11
13
  domains = []
12
14
  PublicSuffix::List.default.each do |rule|
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'mechanize'
4
5
  require 'csv'
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the Swot-maintained list of academic domains into config/academic.txt
4
6
  # Source: https://github.com/leereilly/swot/
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the USA.gov-maintained list of US domains into domains.txt
4
6
  # Source: https://github.com/GSA-OCSIT/govt-urls
@@ -13,10 +15,10 @@
13
15
  require './lib/gman'
14
16
  require 'open-uri'
15
17
 
16
- blacklist = %w(usagovQUASI usagovFEDgov)
18
+ blacklist = %w[usagovQUASI usagovFEDgov]
17
19
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
18
20
 
19
- data = open(source).read
21
+ data = URI.open(source).read
20
22
  data = data.split('_' * 74)
21
23
  data = data.last.strip
22
24
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe 'Gman bin' do
2
4
  let(:domain) { 'whitehouse.gov' }
3
5
  let(:args) { [domain] }
@@ -21,7 +23,7 @@ RSpec.describe 'Gman bin' do
21
23
  end
22
24
 
23
25
  it 'knows the type' do
24
- expect(output).to match('federal')
26
+ expect(output).to match(/federal/i)
25
27
  end
26
28
 
27
29
  it 'knows the agency' do
@@ -87,11 +89,11 @@ RSpec.describe 'Gman bin' do
87
89
  let(:args) { [txt_path] }
88
90
 
89
91
  it 'returns only government domains' do
90
- expected = <<-EOS
91
- mr.senator@obama.senate.gov
92
- president@whitehouse.gov
93
- commander.in.chief@us.army.mil
94
- EOS
92
+ expected = <<~EXPECTED
93
+ mr.senator@obama.senate.gov
94
+ president@whitehouse.gov
95
+ commander.in.chief@us.army.mil
96
+ EXPECTED
95
97
 
96
98
  expect(output).to eql(expected)
97
99
  end
@@ -1,10 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe 'Gman Country Codes' do
2
4
  {
3
5
  'whitehouse.gov' => 'United States of America',
4
- 'foo.gov.uk' => 'United Kingdom of Great Britain and Northern Ireland',
5
- 'army.mil' => 'United States of America',
6
- 'foo.gc.ca' => 'Canada',
7
- 'foo.eu' => nil
6
+ 'foo.gov.uk' => 'United Kingdom of Great Britain and Northern Ireland',
7
+ 'army.mil' => 'United States of America',
8
+ 'foo.gc.ca' => 'Canada',
9
+ 'foo.eu' => nil
8
10
  }.each do |domain, expected_country|
9
11
  context "given #{domain.inspect}" do
10
12
  subject { Gman.new(domain) }
@@ -1,8 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe Gman::DomainList do
2
4
  let(:data) { subject.data }
3
5
  let(:canada) { data['Canada municipal'] }
4
6
 
5
- [:path, :contents, :data].each do |type|
7
+ %i[path contents data].each do |type|
6
8
  context "when initialized by #{type}" do
7
9
  subject do
8
10
  case type
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe 'Gman domains' do
2
4
  let(:resolve_domains?) { ENV['GMAN_RESOLVE_DOMAINS'] == 'true' }
3
5
  let(:importer) { Gman::Importer.new({}) }
@@ -12,6 +14,7 @@ RSpec.describe 'Gman domains' do
12
14
 
13
15
  Parallel.each(domains, in_threads: 4) do |domain|
14
16
  next if importer.valid_domain?(domain, options)
17
+
15
18
  invalid_domains.push domain
16
19
  end
17
20
 
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe 'Gman identifier' do
2
4
  let(:domain) { '' }
3
5
  subject { Gman.new(domain) }
@@ -94,6 +96,10 @@ RSpec.describe 'Gman identifier' do
94
96
  it 'knows the agency' do
95
97
  expect(subject.agency).to eql('Executive Office of the President')
96
98
  end
99
+
100
+ it 'knows the organization' do
101
+ expect(subject.organization).to eql('White House')
102
+ end
97
103
  end
98
104
 
99
105
  context 'a state .gov' do
@@ -161,14 +167,43 @@ RSpec.describe 'Gman identifier' do
161
167
  expect(subject.city).to eql('Pittsburgh')
162
168
  end
163
169
  end
170
+
171
+ context 'a city .gov' do
172
+ let(:domain) { 'ABERDEENMD.GOV' }
173
+
174
+ it "knows it's a city" do
175
+ expect(subject).to be_a_city
176
+ expect(subject.type).to eql(:city)
177
+ end
178
+
179
+ it 'knows the city' do
180
+ expect(subject.city).to eql('Aberdeen')
181
+ end
182
+
183
+ it 'knows the state' do
184
+ expect(subject.state).to eql('MD')
185
+ end
186
+
187
+ it "knows it's a dotgov" do
188
+ expect(subject).to be_a_dotgov
189
+ end
190
+
191
+ it "know's it's not a state" do
192
+ expect(subject).to_not be_a_state
193
+ end
194
+
195
+ it "know's it's not a county" do
196
+ expect(subject).to_not be_a_county
197
+ end
198
+ end
164
199
  end
165
200
  end
166
201
 
167
202
  context "determining a domain's type" do
168
203
  {
169
- :unknown => 'cityofperu.org',
170
- :"Canada municipal" => 'acme.ca',
171
- :"Canada federal" => 'canada.ca'
204
+ unknown: 'cityofperu.org',
205
+ "Canada municipal": 'acme.ca',
206
+ "Canada federal": 'canada.ca'
172
207
  }.each do |expected, domain|
173
208
  context "Given the #{domain} domain" do
174
209
  let(:domain) { domain }