gman 7.0.2 → 7.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +5 -5
  2. data/.github/CODEOWNERS +3 -0
  3. data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
  4. data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
  5. data/.github/config.yml +23 -0
  6. data/.github/no-response.yml +15 -0
  7. data/.github/release-drafter.yml +4 -0
  8. data/.github/settings.yml +33 -0
  9. data/.github/stale.yml +29 -0
  10. data/.rubocop.yml +5 -5
  11. data/.ruby-version +1 -1
  12. data/Gemfile +2 -0
  13. data/bin/gman +3 -1
  14. data/bin/gman_filter +3 -5
  15. data/config/domains.txt +191 -134
  16. data/config/vendor/dotgovs.csv +5786 -5634
  17. data/docs/CODE_OF_CONDUCT.md +46 -0
  18. data/docs/CONTRIBUTING.md +92 -0
  19. data/{README.md → docs/README.md} +2 -2
  20. data/docs/_config.yml +2 -0
  21. data/gman.gemspec +16 -15
  22. data/lib/gman.rb +4 -1
  23. data/lib/gman/country_codes.rb +19 -19
  24. data/lib/gman/domain_list.rb +10 -6
  25. data/lib/gman/identifier.rb +55 -17
  26. data/lib/gman/importer.rb +27 -18
  27. data/lib/gman/locality.rb +8 -6
  28. data/lib/gman/version.rb +3 -1
  29. data/script/add +2 -0
  30. data/script/alphabetize +2 -0
  31. data/script/dedupe +1 -0
  32. data/script/profile +1 -0
  33. data/script/prune +5 -3
  34. data/script/reconcile-us +5 -2
  35. data/script/vendor-federal-de +2 -1
  36. data/script/vendor-municipal-de +2 -1
  37. data/script/vendor-nl +2 -0
  38. data/script/vendor-public-suffix +3 -1
  39. data/script/vendor-se +1 -0
  40. data/script/vendor-swot +2 -0
  41. data/script/vendor-us +4 -2
  42. data/spec/gman/bin_spec.rb +8 -6
  43. data/spec/gman/country_code_spec.rb +6 -4
  44. data/spec/gman/domain_list_spec.rb +3 -1
  45. data/spec/gman/domains_spec.rb +3 -0
  46. data/spec/gman/identifier_spec.rb +38 -3
  47. data/spec/gman/importer_spec.rb +9 -7
  48. data/spec/gman/locality_spec.rb +2 -0
  49. data/spec/gman_spec.rb +2 -0
  50. data/spec/spec_helper.rb +2 -0
  51. metadata +52 -44
  52. data/CONTRIBUTING.md +0 -22
  53. data/contributing.json +0 -32
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Utility functions for parsing and manipulating public-suffix domain lists
2
4
  # Only used in development and not loaded by default
3
5
  require 'yaml'
@@ -13,7 +15,7 @@ class Gman
13
15
  attr_accessor :domain_list
14
16
 
15
17
  # Known false positives from vendored lists
16
- BLACKLIST = %w(
18
+ BLACKLIST = %w[
17
19
  business.centurytel.net
18
20
  chesnee.net
19
21
  citlink.net
@@ -39,23 +41,24 @@ class Gman
39
41
  wctc.net
40
42
  webconnections.net
41
43
  webpages.charter.net
42
- ).freeze
44
+ ].freeze
43
45
 
44
46
  REGEX_CHECKS = {
45
- 'home. regex' => /^home\./,
46
- 'user. regex' => /^users?\./,
47
- 'sites. regex' => /^sites?\./,
48
- 'weebly' => /weebly\.com$/,
49
- 'wordpress' => /wordpress\.com$/,
50
- 'govoffice' => /govoffice\d?\.com$/,
51
- 'homestead' => /homestead\.com$/,
52
- 'wix.com' => /wix\.com$/,
53
- 'blogspot.com' => /blogspot\.com$/,
54
- 'tripod.com' => /tripod\.com$/,
47
+ 'home. regex' => /^home\./,
48
+ 'user. regex' => /^users?\./,
49
+ 'sites. regex' => /^sites?\./,
50
+ 'weebly' => /weebly\.com$/,
51
+ 'wordpress' => /wordpress\.com$/,
52
+ 'govoffice' => /govoffice\d?\.com$/,
53
+ 'homestead' => /homestead\.com$/,
54
+ 'wix.com' => /wix\.com$/,
55
+ 'blogspot.com' => /blogspot\.com$/,
56
+ 'tripod.com' => /tripod\.com$/,
55
57
  'squarespace.com' => /squarespace\.com$/,
56
- 'github.io' => /github\.io$/,
57
- 'tumblr' => /tumblr\.com$/,
58
- 'locality' => Gman::Locality::REGEX
58
+ 'github.io' => /github\.io$/,
59
+ 'tumblr' => /tumblr\.com$/,
60
+ 'locality' => Gman::Locality::REGEX,
61
+ 'french edu' => /^ac-.*?\.fr/
59
62
  }.freeze
60
63
 
61
64
  def initialize(domains)
@@ -75,6 +78,7 @@ class Gman
75
78
  return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
76
79
  return false unless ensure_valid(domain)
77
80
  return false if !options[:skip_resolve] && !ensure_resolves(domain)
81
+
78
82
  true
79
83
  end
80
84
 
@@ -82,6 +86,7 @@ class Gman
82
86
  # rather than a bool and silence log output
83
87
  def reject(domain, reason)
84
88
  return reason if ENV['RECONCILING']
89
+
85
90
  logger.info "👎 `#{domain}`: #{reason}"
86
91
  false
87
92
  end
@@ -102,13 +107,14 @@ class Gman
102
107
  end
103
108
 
104
109
  def resolver
105
- @resolver ||= Resolv::DNS.new(nameserver: ['8.8.8.8', '8.8.4.4'])
110
+ @resolver ||= Resolv::DNS.new(nameserver: ['1.1.1.1', '8.8.8.8'])
106
111
  end
107
112
 
108
113
  # Verifies that the given domain is an IP or has an NS or MX record, and thus is valid
109
114
  def domain_resolves?(domain)
110
115
  domain = Addressable::URI.new(host: domain).normalize.host
111
116
  return true if ip?(domain)
117
+
112
118
  returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
113
119
  end
114
120
 
@@ -123,6 +129,7 @@ class Gman
123
129
 
124
130
  def ensure_valid(domain)
125
131
  return false if domain.empty?
132
+
126
133
  if BLACKLIST.include?(domain)
127
134
  reject(domain, 'blacklist')
128
135
  elsif !PublicSuffix.valid?("foo.#{domain}")
@@ -136,11 +143,13 @@ class Gman
136
143
 
137
144
  def ensure_resolves(domain)
138
145
  return reject(domain, 'unresolvable') unless domain_resolves?(domain)
146
+
139
147
  true
140
148
  end
141
149
 
142
150
  def ensure_not_dupe(domain)
143
151
  return true unless dupe?(domain)
152
+
144
153
  if current.domains.include?(domain)
145
154
  reject(domain, 'duplicate')
146
155
  else
@@ -154,14 +163,14 @@ class Gman
154
163
  end
155
164
 
156
165
  def normalize_domains!
157
- domain_list.to_h.each do |_group, domains|
166
+ domain_list.to_h.each_value do |domains|
158
167
  domains.map! { |domain| normalize_domain(domain) }
159
168
  domains.uniq!
160
169
  end
161
170
  end
162
171
 
163
172
  def ensure_validity!(options = {})
164
- domain_list.data.each do |_group, domains|
173
+ domain_list.data.each_value do |domains|
165
174
  domains.select! { |domain| valid_domain?(domain, options) }
166
175
  end
167
176
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Gman
2
4
  # Second level .us domains for states and locality
3
5
  # See http://en.wikipedia.org/wiki/.us
@@ -12,18 +14,18 @@ class Gman
12
14
  # * k12.il.us
13
15
  # * ci.foo.zx.us
14
16
  class Locality
15
- AFFINITY_NAMESPACES = %w(state dst cog).freeze
17
+ AFFINITY_NAMESPACES = %w[state dst cog].freeze
16
18
 
17
- STATES = %w(
19
+ STATES = %w[
18
20
  ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
19
21
  la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
20
22
  ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
21
- ).freeze
23
+ ].freeze
22
24
 
23
- LOCALITY_DOMAINS = %w(
25
+ LOCALITY_DOMAINS = %w[
24
26
  ci co borough boro city county
25
27
  parish town twp vi vil village
26
- ).freeze
28
+ ].freeze
27
29
 
28
30
  REGEX = /
29
31
  (
@@ -31,7 +33,7 @@ class Gman
31
33
  |
32
34
  (#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
33
35
  )\.(#{Regexp.union(STATES)})\.us
34
- /x
36
+ /x.freeze
35
37
 
36
38
  def self.valid?(domain)
37
39
  !domain.to_s.match(Locality::REGEX).nil?
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Gman
2
- VERSION = '7.0.2'.freeze
4
+ VERSION = '7.0.3'
3
5
  end
data/script/add CHANGED
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Add one or more domains to a given group, running the standard import checks
4
6
  #
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Alphabetizes entries in the domains.txt file
4
6
  #
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'yaml'
4
5
  require 'open-uri'
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ruby-prof'
4
5
  require './lib/gman'
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Given an array of domains, removes them from the list
3
5
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
6
 
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
12
14
  puts "Starting list: #{Gman::DomainList.current.count} domains"
13
15
 
14
16
  domains.each do |domain|
15
- list.gsub!(/^#{domain}$\n/, '')
17
+ list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
16
18
  end
17
19
 
18
- puts "Ending list: #{Gman::DomainList.current.count} domains"
19
-
20
20
  File.write './config/domains.txt', list
21
+
22
+ puts "Ending list: #{Gman::DomainList.current.count} domains"
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Reconciles the USA.gov-maintained list of US domains with domains.txt
4
6
  # to show domains listed in the USA.gov-maintained list that we reject and why
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
12
14
  blacklist = ['usagovQUASI']
13
15
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
14
16
 
15
- data = open(source).read
17
+ data = URI.open(source).read
16
18
  data = data.split('_' * 74)
17
19
  data = data.last.strip
18
20
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
33
35
 
34
36
  importer.logger.info "Starting with #{importer.domains.count} domains"
35
37
 
36
- importer.domains.list.each do |_group, d|
38
+ importer.domains.list.each_value do |d|
37
39
  d.map! { |domain| Gman.new(domain).to_s }
38
40
  d.map! { |domain| importer.normalize_domain(domain) }
39
41
  end
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
44
46
  missing = {}
45
47
  importer.domains.list.each do |g, usagovdomains|
46
48
  next unless importer.current.list[g]
49
+
47
50
  missing[g] = importer.current.list[g] - usagovdomains
48
51
  end
49
52
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
@@ -6,7 +7,7 @@ require './lib/gman'
6
7
 
7
8
  url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
8
9
 
9
- domains = open(url).read.encode('UTF-8')
10
+ domains = URI.open(url).read.encode('UTF-8')
10
11
  domains = CSV.parse(domains, headers: true)
11
12
  domains = domains.map { |row| row['Domain Name'] }
12
13
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
@@ -6,7 +7,7 @@ require './lib/gman'
6
7
 
7
8
  url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
8
9
 
9
- csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
+ csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
11
 
11
12
  # For some reason, the header row is actually the last row
12
13
  # Pop the last line off the file and prepend it at the beginning
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
3
5
 
4
6
  require 'fileutils'
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Propagates an initial list of best-guess government domains
3
5
 
4
6
  require 'public_suffix'
@@ -6,7 +8,7 @@ require 'yaml'
6
8
  require_relative '../lib/gman'
7
9
 
8
10
  # https://gist.github.com/benbalter/6147066
9
- REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
11
+ REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
10
12
 
11
13
  domains = []
12
14
  PublicSuffix::List.default.each do |rule|
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'mechanize'
4
5
  require 'csv'
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the Swot-maintained list of academic domains into config/academic.txt
4
6
  # Source: https://github.com/leereilly/swot/
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the USA.gov-maintained list of US domains into domains.txt
4
6
  # Source: https://github.com/GSA-OCSIT/govt-urls
@@ -13,10 +15,10 @@
13
15
  require './lib/gman'
14
16
  require 'open-uri'
15
17
 
16
- blacklist = %w(usagovQUASI usagovFEDgov)
18
+ blacklist = %w[usagovQUASI usagovFEDgov]
17
19
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
18
20
 
19
- data = open(source).read
21
+ data = URI.open(source).read
20
22
  data = data.split('_' * 74)
21
23
  data = data.last.strip
22
24
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe 'Gman bin' do
2
4
  let(:domain) { 'whitehouse.gov' }
3
5
  let(:args) { [domain] }
@@ -21,7 +23,7 @@ RSpec.describe 'Gman bin' do
21
23
  end
22
24
 
23
25
  it 'knows the type' do
24
- expect(output).to match('federal')
26
+ expect(output).to match(/federal/i)
25
27
  end
26
28
 
27
29
  it 'knows the agency' do
@@ -87,11 +89,11 @@ RSpec.describe 'Gman bin' do
87
89
  let(:args) { [txt_path] }
88
90
 
89
91
  it 'returns only government domains' do
90
- expected = <<-EOS
91
- mr.senator@obama.senate.gov
92
- president@whitehouse.gov
93
- commander.in.chief@us.army.mil
94
- EOS
92
+ expected = <<~EXPECTED
93
+ mr.senator@obama.senate.gov
94
+ president@whitehouse.gov
95
+ commander.in.chief@us.army.mil
96
+ EXPECTED
95
97
 
96
98
  expect(output).to eql(expected)
97
99
  end
@@ -1,10 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe 'Gman Country Codes' do
2
4
  {
3
5
  'whitehouse.gov' => 'United States of America',
4
- 'foo.gov.uk' => 'United Kingdom of Great Britain and Northern Ireland',
5
- 'army.mil' => 'United States of America',
6
- 'foo.gc.ca' => 'Canada',
7
- 'foo.eu' => nil
6
+ 'foo.gov.uk' => 'United Kingdom of Great Britain and Northern Ireland',
7
+ 'army.mil' => 'United States of America',
8
+ 'foo.gc.ca' => 'Canada',
9
+ 'foo.eu' => nil
8
10
  }.each do |domain, expected_country|
9
11
  context "given #{domain.inspect}" do
10
12
  subject { Gman.new(domain) }
@@ -1,8 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe Gman::DomainList do
2
4
  let(:data) { subject.data }
3
5
  let(:canada) { data['Canada municipal'] }
4
6
 
5
- [:path, :contents, :data].each do |type|
7
+ %i[path contents data].each do |type|
6
8
  context "when initialized by #{type}" do
7
9
  subject do
8
10
  case type
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe 'Gman domains' do
2
4
  let(:resolve_domains?) { ENV['GMAN_RESOLVE_DOMAINS'] == 'true' }
3
5
  let(:importer) { Gman::Importer.new({}) }
@@ -12,6 +14,7 @@ RSpec.describe 'Gman domains' do
12
14
 
13
15
  Parallel.each(domains, in_threads: 4) do |domain|
14
16
  next if importer.valid_domain?(domain, options)
17
+
15
18
  invalid_domains.push domain
16
19
  end
17
20
 
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  RSpec.describe 'Gman identifier' do
2
4
  let(:domain) { '' }
3
5
  subject { Gman.new(domain) }
@@ -94,6 +96,10 @@ RSpec.describe 'Gman identifier' do
94
96
  it 'knows the agency' do
95
97
  expect(subject.agency).to eql('Executive Office of the President')
96
98
  end
99
+
100
+ it 'knows the organization' do
101
+ expect(subject.organization).to eql('White House')
102
+ end
97
103
  end
98
104
 
99
105
  context 'a state .gov' do
@@ -161,14 +167,43 @@ RSpec.describe 'Gman identifier' do
161
167
  expect(subject.city).to eql('Pittsburgh')
162
168
  end
163
169
  end
170
+
171
+ context 'a city .gov' do
172
+ let(:domain) { 'ABERDEENMD.GOV' }
173
+
174
+ it "knows it's a city" do
175
+ expect(subject).to be_a_city
176
+ expect(subject.type).to eql(:city)
177
+ end
178
+
179
+ it 'knows the city' do
180
+ expect(subject.city).to eql('Aberdeen')
181
+ end
182
+
183
+ it 'knows the state' do
184
+ expect(subject.state).to eql('MD')
185
+ end
186
+
187
+ it "knows it's a dotgov" do
188
+ expect(subject).to be_a_dotgov
189
+ end
190
+
191
+ it "know's it's not a state" do
192
+ expect(subject).to_not be_a_state
193
+ end
194
+
195
+ it "know's it's not a county" do
196
+ expect(subject).to_not be_a_county
197
+ end
198
+ end
164
199
  end
165
200
  end
166
201
 
167
202
  context "determining a domain's type" do
168
203
  {
169
- :unknown => 'cityofperu.org',
170
- :"Canada municipal" => 'acme.ca',
171
- :"Canada federal" => 'canada.ca'
204
+ unknown: 'cityofperu.org',
205
+ "Canada municipal": 'acme.ca',
206
+ "Canada federal": 'canada.ca'
172
207
  }.each do |expected, domain|
173
208
  context "Given the #{domain} domain" do
174
209
  let(:domain) { domain }