gman 7.0.2 → 7.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/CODEOWNERS +3 -0
- data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
- data/.github/config.yml +23 -0
- data/.github/no-response.yml +15 -0
- data/.github/release-drafter.yml +4 -0
- data/.github/settings.yml +33 -0
- data/.github/stale.yml +29 -0
- data/.rubocop.yml +5 -5
- data/.ruby-version +1 -1
- data/Gemfile +2 -0
- data/bin/gman +3 -1
- data/bin/gman_filter +3 -5
- data/config/domains.txt +191 -134
- data/config/vendor/dotgovs.csv +5786 -5634
- data/docs/CODE_OF_CONDUCT.md +46 -0
- data/docs/CONTRIBUTING.md +92 -0
- data/{README.md → docs/README.md} +2 -2
- data/docs/_config.yml +2 -0
- data/gman.gemspec +16 -15
- data/lib/gman.rb +4 -1
- data/lib/gman/country_codes.rb +19 -19
- data/lib/gman/domain_list.rb +10 -6
- data/lib/gman/identifier.rb +55 -17
- data/lib/gman/importer.rb +27 -18
- data/lib/gman/locality.rb +8 -6
- data/lib/gman/version.rb +3 -1
- data/script/add +2 -0
- data/script/alphabetize +2 -0
- data/script/dedupe +1 -0
- data/script/profile +1 -0
- data/script/prune +5 -3
- data/script/reconcile-us +5 -2
- data/script/vendor-federal-de +2 -1
- data/script/vendor-municipal-de +2 -1
- data/script/vendor-nl +2 -0
- data/script/vendor-public-suffix +3 -1
- data/script/vendor-se +1 -0
- data/script/vendor-swot +2 -0
- data/script/vendor-us +4 -2
- data/spec/gman/bin_spec.rb +8 -6
- data/spec/gman/country_code_spec.rb +6 -4
- data/spec/gman/domain_list_spec.rb +3 -1
- data/spec/gman/domains_spec.rb +3 -0
- data/spec/gman/identifier_spec.rb +38 -3
- data/spec/gman/importer_spec.rb +9 -7
- data/spec/gman/locality_spec.rb +2 -0
- data/spec/gman_spec.rb +2 -0
- data/spec/spec_helper.rb +2 -0
- metadata +52 -44
- data/CONTRIBUTING.md +0 -22
- data/contributing.json +0 -32
data/lib/gman/importer.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Utility functions for parsing and manipulating public-suffix domain lists
|
2
4
|
# Only used in development and not loaded by default
|
3
5
|
require 'yaml'
|
@@ -13,7 +15,7 @@ class Gman
|
|
13
15
|
attr_accessor :domain_list
|
14
16
|
|
15
17
|
# Known false positives from vendored lists
|
16
|
-
BLACKLIST = %w
|
18
|
+
BLACKLIST = %w[
|
17
19
|
business.centurytel.net
|
18
20
|
chesnee.net
|
19
21
|
citlink.net
|
@@ -39,23 +41,24 @@ class Gman
|
|
39
41
|
wctc.net
|
40
42
|
webconnections.net
|
41
43
|
webpages.charter.net
|
42
|
-
|
44
|
+
].freeze
|
43
45
|
|
44
46
|
REGEX_CHECKS = {
|
45
|
-
'home. regex'
|
46
|
-
'user. regex'
|
47
|
-
'sites. regex'
|
48
|
-
'weebly'
|
49
|
-
'wordpress'
|
50
|
-
'govoffice'
|
51
|
-
'homestead'
|
52
|
-
'wix.com'
|
53
|
-
'blogspot.com'
|
54
|
-
'tripod.com'
|
47
|
+
'home. regex' => /^home\./,
|
48
|
+
'user. regex' => /^users?\./,
|
49
|
+
'sites. regex' => /^sites?\./,
|
50
|
+
'weebly' => /weebly\.com$/,
|
51
|
+
'wordpress' => /wordpress\.com$/,
|
52
|
+
'govoffice' => /govoffice\d?\.com$/,
|
53
|
+
'homestead' => /homestead\.com$/,
|
54
|
+
'wix.com' => /wix\.com$/,
|
55
|
+
'blogspot.com' => /blogspot\.com$/,
|
56
|
+
'tripod.com' => /tripod\.com$/,
|
55
57
|
'squarespace.com' => /squarespace\.com$/,
|
56
|
-
'github.io'
|
57
|
-
'tumblr'
|
58
|
-
'locality'
|
58
|
+
'github.io' => /github\.io$/,
|
59
|
+
'tumblr' => /tumblr\.com$/,
|
60
|
+
'locality' => Gman::Locality::REGEX,
|
61
|
+
'french edu' => /^ac-.*?\.fr/
|
59
62
|
}.freeze
|
60
63
|
|
61
64
|
def initialize(domains)
|
@@ -75,6 +78,7 @@ class Gman
|
|
75
78
|
return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
|
76
79
|
return false unless ensure_valid(domain)
|
77
80
|
return false if !options[:skip_resolve] && !ensure_resolves(domain)
|
81
|
+
|
78
82
|
true
|
79
83
|
end
|
80
84
|
|
@@ -82,6 +86,7 @@ class Gman
|
|
82
86
|
# rather than a bool and silence log output
|
83
87
|
def reject(domain, reason)
|
84
88
|
return reason if ENV['RECONCILING']
|
89
|
+
|
85
90
|
logger.info "👎 `#{domain}`: #{reason}"
|
86
91
|
false
|
87
92
|
end
|
@@ -102,13 +107,14 @@ class Gman
|
|
102
107
|
end
|
103
108
|
|
104
109
|
def resolver
|
105
|
-
@resolver ||= Resolv::DNS.new(nameserver: ['
|
110
|
+
@resolver ||= Resolv::DNS.new(nameserver: ['1.1.1.1', '8.8.8.8'])
|
106
111
|
end
|
107
112
|
|
108
113
|
# Verifies that the given domain has an NS or MX record, and thus is valid
|
109
114
|
def domain_resolves?(domain)
|
110
115
|
domain = Addressable::URI.new(host: domain).normalize.host
|
111
116
|
return true if ip?(domain)
|
117
|
+
|
112
118
|
returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
|
113
119
|
end
|
114
120
|
|
@@ -123,6 +129,7 @@ class Gman
|
|
123
129
|
|
124
130
|
def ensure_valid(domain)
|
125
131
|
return false if domain.empty?
|
132
|
+
|
126
133
|
if BLACKLIST.include?(domain)
|
127
134
|
reject(domain, 'blacklist')
|
128
135
|
elsif !PublicSuffix.valid?("foo.#{domain}")
|
@@ -136,11 +143,13 @@ class Gman
|
|
136
143
|
|
137
144
|
def ensure_resolves(domain)
|
138
145
|
return reject(domain, 'unresolvable') unless domain_resolves?(domain)
|
146
|
+
|
139
147
|
true
|
140
148
|
end
|
141
149
|
|
142
150
|
def ensure_not_dupe(domain)
|
143
151
|
return true unless dupe?(domain)
|
152
|
+
|
144
153
|
if current.domains.include?(domain)
|
145
154
|
reject(domain, 'duplicate')
|
146
155
|
else
|
@@ -154,14 +163,14 @@ class Gman
|
|
154
163
|
end
|
155
164
|
|
156
165
|
def normalize_domains!
|
157
|
-
domain_list.to_h.
|
166
|
+
domain_list.to_h.each_value do |domains|
|
158
167
|
domains.map! { |domain| normalize_domain(domain) }
|
159
168
|
domains.uniq!
|
160
169
|
end
|
161
170
|
end
|
162
171
|
|
163
172
|
def ensure_validity!(options = {})
|
164
|
-
domain_list.data.
|
173
|
+
domain_list.data.each_value do |domains|
|
165
174
|
domains.select! { |domain| valid_domain?(domain, options) }
|
166
175
|
end
|
167
176
|
end
|
data/lib/gman/locality.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class Gman
|
2
4
|
# Second level .us domains for states and locality
|
3
5
|
# See http://en.wikipedia.org/wiki/.us
|
@@ -12,18 +14,18 @@ class Gman
|
|
12
14
|
# * k12.il.us
|
13
15
|
# * ci.foo.zx.us
|
14
16
|
class Locality
|
15
|
-
AFFINITY_NAMESPACES = %w
|
17
|
+
AFFINITY_NAMESPACES = %w[state dst cog].freeze
|
16
18
|
|
17
|
-
STATES = %w
|
19
|
+
STATES = %w[
|
18
20
|
ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
|
19
21
|
la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
|
20
22
|
ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
|
21
|
-
|
23
|
+
].freeze
|
22
24
|
|
23
|
-
LOCALITY_DOMAINS = %w
|
25
|
+
LOCALITY_DOMAINS = %w[
|
24
26
|
ci co borough boro city county
|
25
27
|
parish town twp vi vil village
|
26
|
-
|
28
|
+
].freeze
|
27
29
|
|
28
30
|
REGEX = /
|
29
31
|
(
|
@@ -31,7 +33,7 @@ class Gman
|
|
31
33
|
|
|
32
34
|
(#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
|
33
35
|
)\.(#{Regexp.union(STATES)})\.us
|
34
|
-
/x
|
36
|
+
/x.freeze
|
35
37
|
|
36
38
|
def self.valid?(domain)
|
37
39
|
!domain.to_s.match(Locality::REGEX).nil?
|
data/lib/gman/version.rb
CHANGED
data/script/add
CHANGED
data/script/alphabetize
CHANGED
data/script/dedupe
CHANGED
data/script/profile
CHANGED
data/script/prune
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
# Given an array of domains, removes them from the list
|
3
5
|
# Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
|
4
6
|
|
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
|
|
12
14
|
puts "Starting list: #{Gman::DomainList.current.count} domains"
|
13
15
|
|
14
16
|
domains.each do |domain|
|
15
|
-
list.gsub!(/^#{domain}$\n/, '')
|
17
|
+
list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
|
16
18
|
end
|
17
19
|
|
18
|
-
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
19
|
-
|
20
20
|
File.write './config/domains.txt', list
|
21
|
+
|
22
|
+
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
data/script/reconcile-us
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Reconciles the USA.gov-maintained list of US domains with domains.txt
|
4
6
|
# to show domains listed in the USA.gov-maintained list that we reject and why
|
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
|
|
12
14
|
blacklist = ['usagovQUASI']
|
13
15
|
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
14
16
|
|
15
|
-
data = open(source).read
|
17
|
+
data = URI.open(source).read
|
16
18
|
data = data.split('_' * 74)
|
17
19
|
data = data.last.strip
|
18
20
|
data = data.split(/\r?\n/).reject(&:empty?)
|
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
|
|
33
35
|
|
34
36
|
importer.logger.info "Starting with #{importer.domains.count} domains"
|
35
37
|
|
36
|
-
importer.domains.list.
|
38
|
+
importer.domains.list.each_value do |d|
|
37
39
|
d.map! { |domain| Gman.new(domain).to_s }
|
38
40
|
d.map! { |domain| importer.normalize_domain(domain) }
|
39
41
|
end
|
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
|
|
44
46
|
missing = {}
|
45
47
|
importer.domains.list.each do |g, usagovdomains|
|
46
48
|
next unless importer.current.list[g]
|
49
|
+
|
47
50
|
missing[g] = importer.current.list[g] - usagovdomains
|
48
51
|
end
|
49
52
|
|
data/script/vendor-federal-de
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'csv'
|
4
5
|
require 'open-uri'
|
@@ -6,7 +7,7 @@ require './lib/gman'
|
|
6
7
|
|
7
8
|
url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
|
8
9
|
|
9
|
-
domains = open(url).read.encode('UTF-8')
|
10
|
+
domains = URI.open(url).read.encode('UTF-8')
|
10
11
|
domains = CSV.parse(domains, headers: true)
|
11
12
|
domains = domains.map { |row| row['Domain Name'] }
|
12
13
|
|
data/script/vendor-municipal-de
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'csv'
|
4
5
|
require 'open-uri'
|
@@ -6,7 +7,7 @@ require './lib/gman'
|
|
6
7
|
|
7
8
|
url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
|
8
9
|
|
9
|
-
csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
|
10
|
+
csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
|
10
11
|
|
11
12
|
# For some reason, the header row is actually the last row
|
12
13
|
# Pop the last line off the file and prepend it at the beginning
|
data/script/vendor-nl
CHANGED
data/script/vendor-public-suffix
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
# Propagates an initial list of best-guess government domains
|
3
5
|
|
4
6
|
require 'public_suffix'
|
@@ -6,7 +8,7 @@ require 'yaml'
|
|
6
8
|
require_relative '../lib/gman'
|
7
9
|
|
8
10
|
# https://gist.github.com/benbalter/6147066
|
9
|
-
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
|
11
|
+
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
|
10
12
|
|
11
13
|
domains = []
|
12
14
|
PublicSuffix::List.default.each do |rule|
|
data/script/vendor-se
CHANGED
data/script/vendor-swot
CHANGED
data/script/vendor-us
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Vendors the USA.gov-maintained list of US domains into domains.txt
|
4
6
|
# Source: https://github.com/GSA-OCSIT/govt-urls
|
@@ -13,10 +15,10 @@
|
|
13
15
|
require './lib/gman'
|
14
16
|
require 'open-uri'
|
15
17
|
|
16
|
-
blacklist = %w
|
18
|
+
blacklist = %w[usagovQUASI usagovFEDgov]
|
17
19
|
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
18
20
|
|
19
|
-
data = open(source).read
|
21
|
+
data = URI.open(source).read
|
20
22
|
data = data.split('_' * 74)
|
21
23
|
data = data.last.strip
|
22
24
|
data = data.split(/\r?\n/).reject(&:empty?)
|
data/spec/gman/bin_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe 'Gman bin' do
|
2
4
|
let(:domain) { 'whitehouse.gov' }
|
3
5
|
let(:args) { [domain] }
|
@@ -21,7 +23,7 @@ RSpec.describe 'Gman bin' do
|
|
21
23
|
end
|
22
24
|
|
23
25
|
it 'knows the type' do
|
24
|
-
expect(output).to match(
|
26
|
+
expect(output).to match(/federal/i)
|
25
27
|
end
|
26
28
|
|
27
29
|
it 'knows the agency' do
|
@@ -87,11 +89,11 @@ RSpec.describe 'Gman bin' do
|
|
87
89
|
let(:args) { [txt_path] }
|
88
90
|
|
89
91
|
it 'returns only government domains' do
|
90
|
-
expected =
|
91
|
-
mr.senator@obama.senate.gov
|
92
|
-
president@whitehouse.gov
|
93
|
-
commander.in.chief@us.army.mil
|
94
|
-
|
92
|
+
expected = <<~EXPECTED
|
93
|
+
mr.senator@obama.senate.gov
|
94
|
+
president@whitehouse.gov
|
95
|
+
commander.in.chief@us.army.mil
|
96
|
+
EXPECTED
|
95
97
|
|
96
98
|
expect(output).to eql(expected)
|
97
99
|
end
|
@@ -1,10 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe 'Gman Country Codes' do
|
2
4
|
{
|
3
5
|
'whitehouse.gov' => 'United States of America',
|
4
|
-
'foo.gov.uk'
|
5
|
-
'army.mil'
|
6
|
-
'foo.gc.ca'
|
7
|
-
'foo.eu'
|
6
|
+
'foo.gov.uk' => 'United Kingdom of Great Britain and Northern Ireland',
|
7
|
+
'army.mil' => 'United States of America',
|
8
|
+
'foo.gc.ca' => 'Canada',
|
9
|
+
'foo.eu' => nil
|
8
10
|
}.each do |domain, expected_country|
|
9
11
|
context "given #{domain.inspect}" do
|
10
12
|
subject { Gman.new(domain) }
|
@@ -1,8 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe Gman::DomainList do
|
2
4
|
let(:data) { subject.data }
|
3
5
|
let(:canada) { data['Canada municipal'] }
|
4
6
|
|
5
|
-
[
|
7
|
+
%i[path contents data].each do |type|
|
6
8
|
context "when initialized by #{type}" do
|
7
9
|
subject do
|
8
10
|
case type
|
data/spec/gman/domains_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe 'Gman domains' do
|
2
4
|
let(:resolve_domains?) { ENV['GMAN_RESOLVE_DOMAINS'] == 'true' }
|
3
5
|
let(:importer) { Gman::Importer.new({}) }
|
@@ -12,6 +14,7 @@ RSpec.describe 'Gman domains' do
|
|
12
14
|
|
13
15
|
Parallel.each(domains, in_threads: 4) do |domain|
|
14
16
|
next if importer.valid_domain?(domain, options)
|
17
|
+
|
15
18
|
invalid_domains.push domain
|
16
19
|
end
|
17
20
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
RSpec.describe 'Gman identifier' do
|
2
4
|
let(:domain) { '' }
|
3
5
|
subject { Gman.new(domain) }
|
@@ -94,6 +96,10 @@ RSpec.describe 'Gman identifier' do
|
|
94
96
|
it 'knows the agency' do
|
95
97
|
expect(subject.agency).to eql('Executive Office of the President')
|
96
98
|
end
|
99
|
+
|
100
|
+
it 'knows the organization' do
|
101
|
+
expect(subject.organization).to eql('White House')
|
102
|
+
end
|
97
103
|
end
|
98
104
|
|
99
105
|
context 'a state .gov' do
|
@@ -161,14 +167,43 @@ RSpec.describe 'Gman identifier' do
|
|
161
167
|
expect(subject.city).to eql('Pittsburgh')
|
162
168
|
end
|
163
169
|
end
|
170
|
+
|
171
|
+
context 'a city .gov' do
|
172
|
+
let(:domain) { 'ABERDEENMD.GOV' }
|
173
|
+
|
174
|
+
it "knows it's a city" do
|
175
|
+
expect(subject).to be_a_city
|
176
|
+
expect(subject.type).to eql(:city)
|
177
|
+
end
|
178
|
+
|
179
|
+
it 'knows the city' do
|
180
|
+
expect(subject.city).to eql('Aberdeen')
|
181
|
+
end
|
182
|
+
|
183
|
+
it 'knows the state' do
|
184
|
+
expect(subject.state).to eql('MD')
|
185
|
+
end
|
186
|
+
|
187
|
+
it "knows it's a dotgov" do
|
188
|
+
expect(subject).to be_a_dotgov
|
189
|
+
end
|
190
|
+
|
191
|
+
it "know's it's not a state" do
|
192
|
+
expect(subject).to_not be_a_state
|
193
|
+
end
|
194
|
+
|
195
|
+
it "know's it's not a county" do
|
196
|
+
expect(subject).to_not be_a_county
|
197
|
+
end
|
198
|
+
end
|
164
199
|
end
|
165
200
|
end
|
166
201
|
|
167
202
|
context "determining a domain's type" do
|
168
203
|
{
|
169
|
-
:
|
170
|
-
|
171
|
-
|
204
|
+
unknown: 'cityofperu.org',
|
205
|
+
"Canada municipal": 'acme.ca',
|
206
|
+
"Canada federal": 'canada.ca'
|
172
207
|
}.each do |expected, domain|
|
173
208
|
context "Given the #{domain} domain" do
|
174
209
|
let(:domain) { domain }
|