gman 7.0.0 → 7.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/CODEOWNERS +3 -0
- data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
- data/.github/config.yml +23 -0
- data/.github/funding.yml +1 -0
- data/.github/no-response.yml +15 -0
- data/.github/release-drafter.yml +4 -0
- data/.github/settings.yml +33 -0
- data/.github/stale.yml +29 -0
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/.rubocop.yml +14 -5
- data/.rubocop_todo.yml +84 -0
- data/.ruby-version +1 -1
- data/Gemfile +2 -0
- data/bin/gman +6 -4
- data/bin/gman_filter +5 -7
- data/config/domains.txt +8454 -168
- data/config/vendor/academic.txt +6 -7
- data/config/vendor/dotgovs.csv +5786 -5560
- data/docs/CODE_OF_CONDUCT.md +46 -0
- data/docs/CONTRIBUTING.md +92 -0
- data/{README.md → docs/README.md} +3 -3
- data/docs/SECURITY.md +3 -0
- data/docs/_config.yml +2 -0
- data/gman.gemspec +18 -17
- data/lib/gman.rb +4 -2
- data/lib/gman/country_codes.rb +17 -17
- data/lib/gman/domain_list.rb +25 -9
- data/lib/gman/identifier.rb +57 -19
- data/lib/gman/importer.rb +31 -21
- data/lib/gman/locality.rb +8 -6
- data/lib/gman/version.rb +3 -1
- data/script/add +2 -0
- data/script/alphabetize +2 -0
- data/script/cibuild +1 -1
- data/script/dedupe +2 -1
- data/script/profile +2 -1
- data/script/prune +5 -3
- data/script/reconcile-us +6 -3
- data/script/vendor-federal-de +2 -1
- data/script/vendor-municipal-de +2 -1
- data/script/vendor-nl +2 -0
- data/script/vendor-public-suffix +6 -4
- data/script/vendor-se +2 -1
- data/script/vendor-swot +3 -1
- data/script/vendor-us +5 -3
- data/spec/fixtures/domains.txt +4 -0
- data/{test → spec}/fixtures/obama.txt +0 -0
- data/spec/gman/bin_spec.rb +101 -0
- data/spec/gman/country_code_spec.rb +39 -0
- data/spec/gman/domain_list_spec.rb +110 -0
- data/spec/gman/domains_spec.rb +25 -0
- data/spec/gman/identifier_spec.rb +218 -0
- data/spec/gman/importer_spec.rb +236 -0
- data/spec/gman/locality_spec.rb +24 -0
- data/spec/gman_spec.rb +74 -0
- data/spec/spec_helper.rb +31 -0
- metadata +89 -81
- data/.rake_tasks +0 -0
- data/CONTRIBUTING.md +0 -22
- data/Rakefile +0 -22
- data/test/fixtures/domains.txt +0 -2
- data/test/helper.rb +0 -48
- data/test/test_gman.rb +0 -56
- data/test/test_gman_bin.rb +0 -75
- data/test/test_gman_country_codes.rb +0 -18
- data/test/test_gman_domain_list.rb +0 -112
- data/test/test_gman_domains.rb +0 -32
- data/test/test_gman_filter.rb +0 -17
- data/test/test_gman_identifier.rb +0 -106
- data/test/test_gman_importer.rb +0 -244
- data/test/test_gman_locality.rb +0 -10
data/lib/gman/importer.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Utility functions for parsing and manipulating public-suffix domain lists
|
2
4
|
# Only used in development and not loaded by default
|
3
5
|
require 'yaml'
|
4
6
|
require 'open-uri'
|
5
7
|
require 'resolv'
|
6
8
|
require 'logger'
|
9
|
+
require 'swot'
|
7
10
|
require_relative '../gman'
|
8
11
|
require_relative './domain_list'
|
9
12
|
|
@@ -12,7 +15,7 @@ class Gman
|
|
12
15
|
attr_accessor :domain_list
|
13
16
|
|
14
17
|
# Known false positives from vendored lists
|
15
|
-
BLACKLIST = %w
|
18
|
+
BLACKLIST = %w[
|
16
19
|
business.centurytel.net
|
17
20
|
chesnee.net
|
18
21
|
citlink.net
|
@@ -38,23 +41,24 @@ class Gman
|
|
38
41
|
wctc.net
|
39
42
|
webconnections.net
|
40
43
|
webpages.charter.net
|
41
|
-
|
44
|
+
].freeze
|
42
45
|
|
43
46
|
REGEX_CHECKS = {
|
44
|
-
'home. regex'
|
45
|
-
'user. regex'
|
46
|
-
'sites. regex'
|
47
|
-
'weebly'
|
48
|
-
'wordpress'
|
49
|
-
'govoffice'
|
50
|
-
'homestead'
|
51
|
-
'wix.com'
|
52
|
-
'blogspot.com'
|
53
|
-
'tripod.com'
|
47
|
+
'home. regex' => /^home\./,
|
48
|
+
'user. regex' => /^users?\./,
|
49
|
+
'sites. regex' => /^sites?\./,
|
50
|
+
'weebly' => /weebly\.com$/,
|
51
|
+
'wordpress' => /wordpress\.com$/,
|
52
|
+
'govoffice' => /govoffice\d?\.com$/,
|
53
|
+
'homestead' => /homestead\.com$/,
|
54
|
+
'wix.com' => /wix\.com$/,
|
55
|
+
'blogspot.com' => /blogspot\.com$/,
|
56
|
+
'tripod.com' => /tripod\.com$/,
|
54
57
|
'squarespace.com' => /squarespace\.com$/,
|
55
|
-
'github.io'
|
56
|
-
'tumblr'
|
57
|
-
'locality'
|
58
|
+
'github.io' => /github\.io$/,
|
59
|
+
'tumblr' => /tumblr\.com$/,
|
60
|
+
'locality' => Gman::Locality::REGEX,
|
61
|
+
'french edu' => /^ac-.*?\.fr/
|
58
62
|
}.freeze
|
59
63
|
|
60
64
|
def initialize(domains)
|
@@ -62,7 +66,7 @@ class Gman
|
|
62
66
|
end
|
63
67
|
|
64
68
|
def logger
|
65
|
-
@logger ||= Logger.new(
|
69
|
+
@logger ||= Logger.new($stdout)
|
66
70
|
end
|
67
71
|
|
68
72
|
def normalize_domain(domain)
|
@@ -74,6 +78,7 @@ class Gman
|
|
74
78
|
return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
|
75
79
|
return false unless ensure_valid(domain)
|
76
80
|
return false if !options[:skip_resolve] && !ensure_resolves(domain)
|
81
|
+
|
77
82
|
true
|
78
83
|
end
|
79
84
|
|
@@ -81,6 +86,7 @@ class Gman
|
|
81
86
|
# rather than a bool and silence log output
|
82
87
|
def reject(domain, reason)
|
83
88
|
return reason if ENV['RECONCILING']
|
89
|
+
|
84
90
|
logger.info "👎 `#{domain}`: #{reason}"
|
85
91
|
false
|
86
92
|
end
|
@@ -101,13 +107,14 @@ class Gman
|
|
101
107
|
end
|
102
108
|
|
103
109
|
def resolver
|
104
|
-
@resolver ||= Resolv::DNS.new(nameserver: ['
|
110
|
+
@resolver ||= Resolv::DNS.new(nameserver: ['1.1.1.1', '8.8.8.8'])
|
105
111
|
end
|
106
112
|
|
107
113
|
# Verifies that the given domain has an NS or MX record, and thus is valid
|
108
114
|
def domain_resolves?(domain)
|
109
115
|
domain = Addressable::URI.new(host: domain).normalize.host
|
110
116
|
return true if ip?(domain)
|
117
|
+
|
111
118
|
returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
|
112
119
|
end
|
113
120
|
|
@@ -115,16 +122,17 @@ class Gman
|
|
115
122
|
|
116
123
|
def ensure_regex(domain)
|
117
124
|
REGEX_CHECKS.each do |msg, regex|
|
118
|
-
return reject(domain, msg) if domain
|
125
|
+
return reject(domain, msg) if domain&.match?(regex)
|
119
126
|
end
|
120
127
|
true
|
121
128
|
end
|
122
129
|
|
123
130
|
def ensure_valid(domain)
|
124
131
|
return false if domain.empty?
|
132
|
+
|
125
133
|
if BLACKLIST.include?(domain)
|
126
134
|
reject(domain, 'blacklist')
|
127
|
-
elsif !PublicSuffix.valid?(".#{domain}")
|
135
|
+
elsif !PublicSuffix.valid?("foo.#{domain}")
|
128
136
|
reject(domain, 'invalid')
|
129
137
|
elsif Swot.is_academic?(domain)
|
130
138
|
reject(domain, 'academic')
|
@@ -135,11 +143,13 @@ class Gman
|
|
135
143
|
|
136
144
|
def ensure_resolves(domain)
|
137
145
|
return reject(domain, 'unresolvable') unless domain_resolves?(domain)
|
146
|
+
|
138
147
|
true
|
139
148
|
end
|
140
149
|
|
141
150
|
def ensure_not_dupe(domain)
|
142
151
|
return true unless dupe?(domain)
|
152
|
+
|
143
153
|
if current.domains.include?(domain)
|
144
154
|
reject(domain, 'duplicate')
|
145
155
|
else
|
@@ -153,14 +163,14 @@ class Gman
|
|
153
163
|
end
|
154
164
|
|
155
165
|
def normalize_domains!
|
156
|
-
domain_list.to_h.
|
166
|
+
domain_list.to_h.each_value do |domains|
|
157
167
|
domains.map! { |domain| normalize_domain(domain) }
|
158
168
|
domains.uniq!
|
159
169
|
end
|
160
170
|
end
|
161
171
|
|
162
172
|
def ensure_validity!(options = {})
|
163
|
-
domain_list.data.
|
173
|
+
domain_list.data.each_value do |domains|
|
164
174
|
domains.select! { |domain| valid_domain?(domain, options) }
|
165
175
|
end
|
166
176
|
end
|
data/lib/gman/locality.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class Gman
|
2
4
|
# Second level .us domains for states and locality
|
3
5
|
# See http://en.wikipedia.org/wiki/.us
|
@@ -12,18 +14,18 @@ class Gman
|
|
12
14
|
# * k12.il.us
|
13
15
|
# * ci.foo.zx.us
|
14
16
|
class Locality
|
15
|
-
AFFINITY_NAMESPACES = %w
|
17
|
+
AFFINITY_NAMESPACES = %w[state dst cog].freeze
|
16
18
|
|
17
|
-
STATES = %w
|
19
|
+
STATES = %w[
|
18
20
|
ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
|
19
21
|
la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
|
20
22
|
ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
|
21
|
-
|
23
|
+
].freeze
|
22
24
|
|
23
|
-
LOCALITY_DOMAINS = %w
|
25
|
+
LOCALITY_DOMAINS = %w[
|
24
26
|
ci co borough boro city county
|
25
27
|
parish town twp vi vil village
|
26
|
-
|
28
|
+
].freeze
|
27
29
|
|
28
30
|
REGEX = /
|
29
31
|
(
|
@@ -31,7 +33,7 @@ class Gman
|
|
31
33
|
|
|
32
34
|
(#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
|
33
35
|
)\.(#{Regexp.union(STATES)})\.us
|
34
|
-
/x
|
36
|
+
/x.freeze
|
35
37
|
|
36
38
|
def self.valid?(domain)
|
37
39
|
!domain.to_s.match(Locality::REGEX).nil?
|
data/lib/gman/version.rb
CHANGED
data/script/add
CHANGED
data/script/alphabetize
CHANGED
data/script/cibuild
CHANGED
data/script/dedupe
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'yaml'
|
4
5
|
require 'open-uri'
|
@@ -12,7 +13,7 @@ puts "Current list contains #{current.count} domains..."
|
|
12
13
|
|
13
14
|
dupe = current.count - current.domains.uniq.count
|
14
15
|
puts "Found #{dupe} duplicate domains"
|
15
|
-
exit 0 if dupe
|
16
|
+
exit 0 if dupe.zero?
|
16
17
|
|
17
18
|
dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
|
18
19
|
|
data/script/profile
CHANGED
data/script/prune
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
# Given an array of domains, removes them from the list
|
3
5
|
# Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
|
4
6
|
|
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
|
|
12
14
|
puts "Starting list: #{Gman::DomainList.current.count} domains"
|
13
15
|
|
14
16
|
domains.each do |domain|
|
15
|
-
list.gsub!(/^#{domain}$\n/, '')
|
17
|
+
list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
|
16
18
|
end
|
17
19
|
|
18
|
-
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
19
|
-
|
20
20
|
File.write './config/domains.txt', list
|
21
|
+
|
22
|
+
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
data/script/reconcile-us
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Reconciles the USA.gov-maintained list of US domains with domains.txt
|
4
6
|
# to show domains listed in the USA.gov-maintained list that we reject and why
|
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
|
|
12
14
|
blacklist = ['usagovQUASI']
|
13
15
|
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
14
16
|
|
15
|
-
data = open(source).read
|
17
|
+
data = URI.open(source).read
|
16
18
|
data = data.split('_' * 74)
|
17
19
|
data = data.last.strip
|
18
20
|
data = data.split(/\r?\n/).reject(&:empty?)
|
@@ -20,7 +22,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
|
|
20
22
|
domains = {}
|
21
23
|
group = ''
|
22
24
|
data.each do |row|
|
23
|
-
if
|
25
|
+
if /^\w/.match?(row)
|
24
26
|
group = row
|
25
27
|
domains[group] = []
|
26
28
|
else
|
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
|
|
33
35
|
|
34
36
|
importer.logger.info "Starting with #{importer.domains.count} domains"
|
35
37
|
|
36
|
-
importer.domains.list.
|
38
|
+
importer.domains.list.each_value do |d|
|
37
39
|
d.map! { |domain| Gman.new(domain).to_s }
|
38
40
|
d.map! { |domain| importer.normalize_domain(domain) }
|
39
41
|
end
|
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
|
|
44
46
|
missing = {}
|
45
47
|
importer.domains.list.each do |g, usagovdomains|
|
46
48
|
next unless importer.current.list[g]
|
49
|
+
|
47
50
|
missing[g] = importer.current.list[g] - usagovdomains
|
48
51
|
end
|
49
52
|
|
data/script/vendor-federal-de
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'csv'
|
4
5
|
require 'open-uri'
|
@@ -6,7 +7,7 @@ require './lib/gman'
|
|
6
7
|
|
7
8
|
url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
|
8
9
|
|
9
|
-
domains = open(url).read.encode('UTF-8')
|
10
|
+
domains = URI.open(url).read.encode('UTF-8')
|
10
11
|
domains = CSV.parse(domains, headers: true)
|
11
12
|
domains = domains.map { |row| row['Domain Name'] }
|
12
13
|
|
data/script/vendor-municipal-de
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'csv'
|
4
5
|
require 'open-uri'
|
@@ -6,7 +7,7 @@ require './lib/gman'
|
|
6
7
|
|
7
8
|
url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
|
8
9
|
|
9
|
-
csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
|
10
|
+
csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
|
10
11
|
|
11
12
|
# For some reason, the header row is actually the last row
|
12
13
|
# Pop the last line off the file and prepend it at the beginning
|
data/script/vendor-nl
CHANGED
data/script/vendor-public-suffix
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
# Propagates an initial list of best-guess government domains
|
3
5
|
|
4
6
|
require 'public_suffix'
|
@@ -6,21 +8,21 @@ require 'yaml'
|
|
6
8
|
require_relative '../lib/gman'
|
7
9
|
|
8
10
|
# https://gist.github.com/benbalter/6147066
|
9
|
-
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
|
11
|
+
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
|
10
12
|
|
11
13
|
domains = []
|
12
14
|
PublicSuffix::List.default.each do |rule|
|
13
15
|
domain = nil
|
14
16
|
|
15
17
|
if rule.parts.length == 1
|
16
|
-
domain = rule.parts.first if ".#{rule.value}"
|
17
|
-
elsif ".#{rule.value}"
|
18
|
+
domain = rule.parts.first if REGEX.match?(".#{rule.value}")
|
19
|
+
elsif REGEX.match?(".#{rule.value}")
|
18
20
|
domain = rule.parts.pop(2).join('.')
|
19
21
|
end
|
20
22
|
|
21
23
|
domains.push domain unless domain.nil? || domains.include?(domain)
|
22
24
|
end
|
23
25
|
|
24
|
-
#
|
26
|
+
# NOTE: We want to skip resolution here, because a domain like `gov.sv` may be
|
25
27
|
# a valid TLD, not have any top-level sites, and we'd still want it listed
|
26
28
|
Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)
|
data/script/vendor-se
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'mechanize'
|
4
5
|
require 'csv'
|
@@ -14,7 +15,7 @@ response = agent.submit(form, submit_button)
|
|
14
15
|
|
15
16
|
rows = CSV.parse(response.content, headers: true, col_sep: "\t")
|
16
17
|
domains = rows.map do |row|
|
17
|
-
row['Webbadress'] unless row['Namn']
|
18
|
+
row['Webbadress'] unless /UNIVERSITET/.match?(row['Namn'])
|
18
19
|
end
|
19
20
|
|
20
21
|
Gman::Importer.new('Swedish Administrative Authorities' => domains).import
|
data/script/vendor-swot
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Vendors the Swot-maintained list of academic domains into config/academic.txt
|
4
6
|
# Source: https://github.com/leereilly/swot/
|
@@ -12,7 +14,7 @@
|
|
12
14
|
#
|
13
15
|
# Note: We do this, because as a bajillion individual files, Swot takes up 30MB
|
14
16
|
|
15
|
-
require '
|
17
|
+
require 'gman'
|
16
18
|
require 'swot'
|
17
19
|
|
18
20
|
# Generate array of all Swot domains
|
data/script/vendor-us
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
# Vendors the USA.gov-maintained list of US domains into domains.txt
|
4
6
|
# Source: https://github.com/GSA-OCSIT/govt-urls
|
@@ -13,10 +15,10 @@
|
|
13
15
|
require './lib/gman'
|
14
16
|
require 'open-uri'
|
15
17
|
|
16
|
-
blacklist = %w
|
18
|
+
blacklist = %w[usagovQUASI usagovFEDgov]
|
17
19
|
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
18
20
|
|
19
|
-
data = open(source).read
|
21
|
+
data = URI.open(source).read
|
20
22
|
data = data.split('_' * 74)
|
21
23
|
data = data.last.strip
|
22
24
|
data = data.split(/\r?\n/).reject(&:empty?)
|
@@ -24,7 +26,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
|
|
24
26
|
domains = {}
|
25
27
|
group = ''
|
26
28
|
data.each do |row|
|
27
|
-
if
|
29
|
+
if /^\w/.match?(row)
|
28
30
|
group = row
|
29
31
|
domains[group] = []
|
30
32
|
else
|
File without changes
|