gman 7.0.2 → 7.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/CODEOWNERS +3 -0
- data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
- data/.github/config.yml +23 -0
- data/.github/no-response.yml +15 -0
- data/.github/release-drafter.yml +4 -0
- data/.github/settings.yml +33 -0
- data/.github/stale.yml +29 -0
- data/.rubocop.yml +5 -5
- data/.ruby-version +1 -1
- data/Gemfile +2 -0
- data/bin/gman +3 -1
- data/bin/gman_filter +3 -5
- data/config/domains.txt +191 -134
- data/config/vendor/dotgovs.csv +5786 -5634
- data/docs/CODE_OF_CONDUCT.md +46 -0
- data/docs/CONTRIBUTING.md +92 -0
- data/{README.md → docs/README.md} +2 -2
- data/docs/_config.yml +2 -0
- data/gman.gemspec +16 -15
- data/lib/gman.rb +4 -1
- data/lib/gman/country_codes.rb +19 -19
- data/lib/gman/domain_list.rb +10 -6
- data/lib/gman/identifier.rb +55 -17
- data/lib/gman/importer.rb +27 -18
- data/lib/gman/locality.rb +8 -6
- data/lib/gman/version.rb +3 -1
- data/script/add +2 -0
- data/script/alphabetize +2 -0
- data/script/dedupe +1 -0
- data/script/profile +1 -0
- data/script/prune +5 -3
- data/script/reconcile-us +5 -2
- data/script/vendor-federal-de +2 -1
- data/script/vendor-municipal-de +2 -1
- data/script/vendor-nl +2 -0
- data/script/vendor-public-suffix +3 -1
- data/script/vendor-se +1 -0
- data/script/vendor-swot +2 -0
- data/script/vendor-us +4 -2
- data/spec/gman/bin_spec.rb +8 -6
- data/spec/gman/country_code_spec.rb +6 -4
- data/spec/gman/domain_list_spec.rb +3 -1
- data/spec/gman/domains_spec.rb +3 -0
- data/spec/gman/identifier_spec.rb +38 -3
- data/spec/gman/importer_spec.rb +9 -7
- data/spec/gman/locality_spec.rb +2 -0
- data/spec/gman_spec.rb +2 -0
- data/spec/spec_helper.rb +2 -0
- metadata +52 -44
- data/CONTRIBUTING.md +0 -22
- data/contributing.json +0 -32
data/lib/gman/importer.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Utility functions for parsing and manipulating public-suffix domain lists
|
2
4
|
# Only used in development and not loaded by default
|
3
5
|
require 'yaml'
|
@@ -13,7 +15,7 @@ class Gman
|
|
13
15
|
attr_accessor :domain_list
|
14
16
|
|
15
17
|
# Known false positives from vendored lists
|
16
|
-
BLACKLIST = %w
|
18
|
+
BLACKLIST = %w[
|
17
19
|
business.centurytel.net
|
18
20
|
chesnee.net
|
19
21
|
citlink.net
|
@@ -39,23 +41,24 @@ class Gman
|
|
39
41
|
wctc.net
|
40
42
|
webconnections.net
|
41
43
|
webpages.charter.net
|
42
|
-
|
44
|
+
].freeze
|
43
45
|
|
44
46
|
REGEX_CHECKS = {
|
45
|
-
'home. regex'
|
46
|
-
'user. regex'
|
47
|
-
'sites. regex'
|
48
|
-
'weebly'
|
49
|
-
'wordpress'
|
50
|
-
'govoffice'
|
51
|
-
'homestead'
|
52
|
-
'wix.com'
|
53
|
-
'blogspot.com'
|
54
|
-
'tripod.com'
|
47
|
+
'home. regex' => /^home\./,
|
48
|
+
'user. regex' => /^users?\./,
|
49
|
+
'sites. regex' => /^sites?\./,
|
50
|
+
'weebly' => /weebly\.com$/,
|
51
|
+
'wordpress' => /wordpress\.com$/,
|
52
|
+
'govoffice' => /govoffice\d?\.com$/,
|
53
|
+
'homestead' => /homestead\.com$/,
|
54
|
+
'wix.com' => /wix\.com$/,
|
55
|
+
'blogspot.com' => /blogspot\.com$/,
|
56
|
+
'tripod.com' => /tripod\.com$/,
|
55
57
|
'squarespace.com' => /squarespace\.com$/,
|
56
|
-
'github.io'
|
57
|
-
'tumblr'
|
58
|
-
'locality'
|
58
|
+
'github.io' => /github\.io$/,
|
59
|
+
'tumblr' => /tumblr\.com$/,
|
60
|
+
'locality' => Gman::Locality::REGEX,
|
61
|
+
'french edu' => /^ac-.*?\.fr/
|
59
62
|
}.freeze
|
60
63
|
|
61
64
|
def initialize(domains)
|
@@ -75,6 +78,7 @@ class Gman
|
|
75
78
|
return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
|
76
79
|
return false unless ensure_valid(domain)
|
77
80
|
return false if !options[:skip_resolve] && !ensure_resolves(domain)
|
81
|
+
|
78
82
|
true
|
79
83
|
end
|
80
84
|
|
@@ -82,6 +86,7 @@ class Gman
|
|
82
86
|
# rather than a bool and silence log output
|
83
87
|
def reject(domain, reason)
|
84
88
|
return reason if ENV['RECONCILING']
|
89
|
+
|
85
90
|
logger.info "👎 `#{domain}`: #{reason}"
|
86
91
|
false
|
87
92
|
end
|
@@ -102,13 +107,14 @@ class Gman
|
|
102
107
|
end
|
103
108
|
|
104
109
|
def resolver
|
105
|
-
@resolver ||= Resolv::DNS.new(nameserver: ['
|
110
|
+
@resolver ||= Resolv::DNS.new(nameserver: ['1.1.1.1', '8.8.8.8'])
|
106
111
|
end
|
107
112
|
|
108
113
|
# Verifies that the given domain has an MX record, and thus is valid
|
109
114
|
def domain_resolves?(domain)
|
110
115
|
domain = Addressable::URI.new(host: domain).normalize.host
|
111
116
|
return true if ip?(domain)
|
117
|
+
|
112
118
|
returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
|
113
119
|
end
|
114
120
|
|
@@ -123,6 +129,7 @@ class Gman
|
|
123
129
|
|
124
130
|
def ensure_valid(domain)
|
125
131
|
return false if domain.empty?
|
132
|
+
|
126
133
|
if BLACKLIST.include?(domain)
|
127
134
|
reject(domain, 'blacklist')
|
128
135
|
elsif !PublicSuffix.valid?("foo.#{domain}")
|
@@ -136,11 +143,13 @@ class Gman
|
|
136
143
|
|
137
144
|
def ensure_resolves(domain)
|
138
145
|
return reject(domain, 'unresolvable') unless domain_resolves?(domain)
|
146
|
+
|
139
147
|
true
|
140
148
|
end
|
141
149
|
|
142
150
|
def ensure_not_dupe(domain)
|
143
151
|
return true unless dupe?(domain)
|
152
|
+
|
144
153
|
if current.domains.include?(domain)
|
145
154
|
reject(domain, 'duplicate')
|
146
155
|
else
|
@@ -154,14 +163,14 @@ class Gman
|
|
154
163
|
end
|
155
164
|
|
156
165
|
def normalize_domains!
|
157
|
-
domain_list.to_h.
|
166
|
+
domain_list.to_h.each_value do |domains|
|
158
167
|
domains.map! { |domain| normalize_domain(domain) }
|
159
168
|
domains.uniq!
|
160
169
|
end
|
161
170
|
end
|
162
171
|
|
163
172
|
def ensure_validity!(options = {})
|
164
|
-
domain_list.data.
|
173
|
+
domain_list.data.each_value do |domains|
|
165
174
|
domains.select! { |domain| valid_domain?(domain, options) }
|
166
175
|
end
|
167
176
|
end
|
data/lib/gman/locality.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class Gman
|
2
4
|
# Second level .us domains for states and locality
|
3
5
|
# See http://en.wikipedia.org/wiki/.us
|
@@ -12,18 +14,18 @@ class Gman
|
|
12
14
|
# * k12.il.us
|
13
15
|
# * ci.foo.zx.us
|
14
16
|
class Locality
|
15
|
-
AFFINITY_NAMESPACES = %w
|
17
|
+
AFFINITY_NAMESPACES = %w[state dst cog].freeze
|
16
18
|
|
17
|
-
STATES = %w
|
19
|
+
STATES = %w[
|
18
20
|
ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
|
19
21
|
la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
|
20
22
|
ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
|
21
|
-
|
23
|
+
].freeze
|
22
24
|
|
23
|
-
LOCALITY_DOMAINS = %w
|
25
|
+
LOCALITY_DOMAINS = %w[
|
24
26
|
ci co borough boro city county
|
25
27
|
parish town twp vi vil village
|
26
|
-
|
28
|
+
].freeze
|
27
29
|
|
28
30
|
REGEX = /
|
29
31
|
(
|
@@ -31,7 +33,7 @@ class Gman
|
|
31
33
|
|
|
32
34
|
(#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
|
33
35
|
)\.(#{Regexp.union(STATES)})\.us
|
34
|
-
/x
|
36
|
+
/x.freeze
|
35
37
|
|
36
38
|
def self.valid?(domain)
|
37
39
|
!domain.to_s.match(Locality::REGEX).nil?
|
data/lib/gman/version.rb
CHANGED
data/script/add
CHANGED
data/script/alphabetize
CHANGED
data/script/dedupe
CHANGED
data/script/profile
CHANGED
data/script/prune
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
# Given an array of domains, removes them from the list
|
3
5
|
# Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
|
4
6
|
|
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
|
|
12
14
|
puts "Starting list: #{Gman::DomainList.current.count} domains"
|
13
15
|
|
14
16
|
domains.each do |domain|
|
15
|
-
list.gsub!(/^#{domain}$\n/, '')
|
17
|
+
list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
|
16
18
|
end
|
17
19
|
|
18
|
-
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
19
|
-
|
20
20
|
File.write './config/domains.txt', list
|
21
|
+
|
22
|
+
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
data/script/reconcile-us
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Reconciles the USA.gov-maintained list of US domains with domains.txt
|
4
6
|
# to show domains listed in the USA.gov-maintained list that we reject and why
|
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
|
|
12
14
|
blacklist = ['usagovQUASI']
|
13
15
|
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
14
16
|
|
15
|
-
data = open(source).read
|
17
|
+
data = URI.open(source).read
|
16
18
|
data = data.split('_' * 74)
|
17
19
|
data = data.last.strip
|
18
20
|
data = data.split(/\r?\n/).reject(&:empty?)
|
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
|
|
33
35
|
|
34
36
|
importer.logger.info "Starting with #{importer.domains.count} domains"
|
35
37
|
|
36
|
-
importer.domains.list.
|
38
|
+
importer.domains.list.each_value do |d|
|
37
39
|
d.map! { |domain| Gman.new(domain).to_s }
|
38
40
|
d.map! { |domain| importer.normalize_domain(domain) }
|
39
41
|
end
|
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
|
|
44
46
|
missing = {}
|
45
47
|
importer.domains.list.each do |g, usagovdomains|
|
46
48
|
next unless importer.current.list[g]
|
49
|
+
|
47
50
|
missing[g] = importer.current.list[g] - usagovdomains
|
48
51
|
end
|
49
52
|
|
data/script/vendor-federal-de
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'csv'
|
4
5
|
require 'open-uri'
|
@@ -6,7 +7,7 @@ require './lib/gman'
|
|
6
7
|
|
7
8
|
url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
|
8
9
|
|
9
|
-
domains = open(url).read.encode('UTF-8')
|
10
|
+
domains = URI.open(url).read.encode('UTF-8')
|
10
11
|
domains = CSV.parse(domains, headers: true)
|
11
12
|
domains = domains.map { |row| row['Domain Name'] }
|
12
13
|
|
data/script/vendor-municipal-de
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'csv'
|
4
5
|
require 'open-uri'
|
@@ -6,7 +7,7 @@ require './lib/gman'
|
|
6
7
|
|
7
8
|
url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
|
8
9
|
|
9
|
-
csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
|
10
|
+
csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
|
10
11
|
|
11
12
|
# For some reason, the header row is actually the last row
|
12
13
|
# Pop the last line off the file and prepend it at the begining
|
data/script/vendor-nl
CHANGED
data/script/vendor-public-suffix
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
# Propagates an initial list of best-guess government domains
|
3
5
|
|
4
6
|
require 'public_suffix'
|
@@ -6,7 +8,7 @@ require 'yaml'
|
|
6
8
|
require_relative '../lib/gman'
|
7
9
|
|
8
10
|
# https://gist.github.com/benbalter/6147066
|
9
|
-
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
|
11
|
+
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
|
10
12
|
|
11
13
|
domains = []
|
12
14
|
PublicSuffix::List.default.each do |rule|
|
data/script/vendor-se
CHANGED
data/script/vendor-swot
CHANGED
data/script/vendor-us
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Vendors the USA.gov-maintained list of US domains into domains.txt
|
4
6
|
# Source: https://github.com/GSA-OCSIT/govt-urls
|
@@ -13,10 +15,10 @@
|
|
13
15
|
require './lib/gman'
|
14
16
|
require 'open-uri'
|
15
17
|
|
16
|
-
blacklist = %w
|
18
|
+
blacklist = %w[usagovQUASI usagovFEDgov]
|
17
19
|
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
18
20
|
|
19
|
-
data = open(source).read
|
21
|
+
data = URI.open(source).read
|
20
22
|
data = data.split('_' * 74)
|
21
23
|
data = data.last.strip
|
22
24
|
data = data.split(/\r?\n/).reject(&:empty?)
|
data/spec/gman/bin_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe 'Gman bin' do
|
2
4
|
let(:domain) { 'whitehouse.gov' }
|
3
5
|
let(:args) { [domain] }
|
@@ -21,7 +23,7 @@ RSpec.describe 'Gman bin' do
|
|
21
23
|
end
|
22
24
|
|
23
25
|
it 'knows the type' do
|
24
|
-
expect(output).to match(
|
26
|
+
expect(output).to match(/federal/i)
|
25
27
|
end
|
26
28
|
|
27
29
|
it 'knows the agency' do
|
@@ -87,11 +89,11 @@ RSpec.describe 'Gman bin' do
|
|
87
89
|
let(:args) { [txt_path] }
|
88
90
|
|
89
91
|
it 'returns only government domains' do
|
90
|
-
expected =
|
91
|
-
mr.senator@obama.senate.gov
|
92
|
-
president@whitehouse.gov
|
93
|
-
commander.in.chief@us.army.mil
|
94
|
-
|
92
|
+
expected = <<~EXPECTED
|
93
|
+
mr.senator@obama.senate.gov
|
94
|
+
president@whitehouse.gov
|
95
|
+
commander.in.chief@us.army.mil
|
96
|
+
EXPECTED
|
95
97
|
|
96
98
|
expect(output).to eql(expected)
|
97
99
|
end
|
@@ -1,10 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe 'Gman Country Codes' do
|
2
4
|
{
|
3
5
|
'whitehouse.gov' => 'United States of America',
|
4
|
-
'foo.gov.uk'
|
5
|
-
'army.mil'
|
6
|
-
'foo.gc.ca'
|
7
|
-
'foo.eu'
|
6
|
+
'foo.gov.uk' => 'United Kingdom of Great Britain and Northern Ireland',
|
7
|
+
'army.mil' => 'United States of America',
|
8
|
+
'foo.gc.ca' => 'Canada',
|
9
|
+
'foo.eu' => nil
|
8
10
|
}.each do |domain, expected_country|
|
9
11
|
context "given #{domain.inspect}" do
|
10
12
|
subject { Gman.new(domain) }
|
@@ -1,8 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe Gman::DomainList do
|
2
4
|
let(:data) { subject.data }
|
3
5
|
let(:canada) { data['Canada municipal'] }
|
4
6
|
|
5
|
-
[
|
7
|
+
%i[path contents data].each do |type|
|
6
8
|
context "when initialized by #{type}" do
|
7
9
|
subject do
|
8
10
|
case type
|
data/spec/gman/domains_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe 'Gman domains' do
|
2
4
|
let(:resolve_domains?) { ENV['GMAN_RESOLVE_DOMAINS'] == 'true' }
|
3
5
|
let(:importer) { Gman::Importer.new({}) }
|
@@ -12,6 +14,7 @@ RSpec.describe 'Gman domains' do
|
|
12
14
|
|
13
15
|
Parallel.each(domains, in_threads: 4) do |domain|
|
14
16
|
next if importer.valid_domain?(domain, options)
|
17
|
+
|
15
18
|
invalid_domains.push domain
|
16
19
|
end
|
17
20
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe 'Gman identifier' do
|
2
4
|
let(:domain) { '' }
|
3
5
|
subject { Gman.new(domain) }
|
@@ -94,6 +96,10 @@ RSpec.describe 'Gman identifier' do
|
|
94
96
|
it 'knows the agency' do
|
95
97
|
expect(subject.agency).to eql('Executive Office of the President')
|
96
98
|
end
|
99
|
+
|
100
|
+
it 'knows the organization' do
|
101
|
+
expect(subject.organization).to eql('White House')
|
102
|
+
end
|
97
103
|
end
|
98
104
|
|
99
105
|
context 'a state .gov' do
|
@@ -161,14 +167,43 @@ RSpec.describe 'Gman identifier' do
|
|
161
167
|
expect(subject.city).to eql('Pittsburgh')
|
162
168
|
end
|
163
169
|
end
|
170
|
+
|
171
|
+
context 'a city .gov' do
|
172
|
+
let(:domain) { 'ABERDEENMD.GOV' }
|
173
|
+
|
174
|
+
it "knows it's a city" do
|
175
|
+
expect(subject).to be_a_city
|
176
|
+
expect(subject.type).to eql(:city)
|
177
|
+
end
|
178
|
+
|
179
|
+
it 'knows the city' do
|
180
|
+
expect(subject.city).to eql('Aberdeen')
|
181
|
+
end
|
182
|
+
|
183
|
+
it 'knows the state' do
|
184
|
+
expect(subject.state).to eql('MD')
|
185
|
+
end
|
186
|
+
|
187
|
+
it "knows it's a dotgov" do
|
188
|
+
expect(subject).to be_a_dotgov
|
189
|
+
end
|
190
|
+
|
191
|
+
it "know's it's not a state" do
|
192
|
+
expect(subject).to_not be_a_state
|
193
|
+
end
|
194
|
+
|
195
|
+
it "know's it's not a county" do
|
196
|
+
expect(subject).to_not be_a_county
|
197
|
+
end
|
198
|
+
end
|
164
199
|
end
|
165
200
|
end
|
166
201
|
|
167
202
|
context "determining a domain's type" do
|
168
203
|
{
|
169
|
-
:
|
170
|
-
|
171
|
-
|
204
|
+
unknown: 'cityofperu.org',
|
205
|
+
"Canada municipal": 'acme.ca',
|
206
|
+
"Canada federal": 'canada.ca'
|
172
207
|
}.each do |expected, domain|
|
173
208
|
context "Given the #{domain} domain" do
|
174
209
|
let(:domain) { domain }
|