gman 6.0.1 → 7.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/CODEOWNERS +3 -0
- data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
- data/.github/config.yml +23 -0
- data/.github/funding.yml +1 -0
- data/.github/no-response.yml +15 -0
- data/.github/release-drafter.yml +4 -0
- data/.github/settings.yml +33 -0
- data/.github/stale.yml +29 -0
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -5
- data/.rubocop_todo.yml +84 -0
- data/.ruby-version +1 -1
- data/Gemfile +2 -0
- data/bin/gman +6 -4
- data/bin/gman_filter +5 -7
- data/config/domains.txt +8446 -173
- data/config/vendor/academic.txt +8038 -0
- data/config/vendor/dotgovs.csv +5786 -5560
- data/docs/CODE_OF_CONDUCT.md +46 -0
- data/docs/CONTRIBUTING.md +92 -0
- data/{README.md → docs/README.md} +3 -3
- data/docs/SECURITY.md +3 -0
- data/docs/_config.yml +2 -0
- data/gman.gemspec +18 -17
- data/lib/gman.rb +25 -21
- data/lib/gman/country_codes.rb +17 -17
- data/lib/gman/domain_list.rb +123 -41
- data/lib/gman/identifier.rb +59 -21
- data/lib/gman/importer.rb +39 -40
- data/lib/gman/locality.rb +23 -21
- data/lib/gman/version.rb +3 -1
- data/script/add +2 -0
- data/script/alphabetize +2 -0
- data/script/cibuild +1 -1
- data/script/dedupe +2 -1
- data/script/profile +2 -1
- data/script/prune +5 -3
- data/script/reconcile-us +6 -3
- data/script/vendor +1 -1
- data/script/vendor-federal-de +3 -3
- data/script/vendor-municipal-de +3 -3
- data/script/vendor-nl +4 -1
- data/script/vendor-public-suffix +7 -6
- data/script/vendor-se +3 -3
- data/script/vendor-swot +43 -0
- data/script/vendor-us +8 -5
- data/spec/fixtures/domains.txt +4 -0
- data/{test → spec}/fixtures/obama.txt +0 -0
- data/spec/gman/bin_spec.rb +101 -0
- data/spec/gman/country_code_spec.rb +39 -0
- data/spec/gman/domain_list_spec.rb +110 -0
- data/spec/gman/domains_spec.rb +25 -0
- data/spec/gman/identifier_spec.rb +218 -0
- data/spec/gman/importer_spec.rb +236 -0
- data/spec/gman/locality_spec.rb +24 -0
- data/spec/gman_spec.rb +74 -0
- data/spec/spec_helper.rb +31 -0
- metadata +86 -73
- data/CONTRIBUTING.md +0 -22
- data/Rakefile +0 -22
- data/test/fixtures/domains.txt +0 -2
- data/test/helper.rb +0 -40
- data/test/test_gman.rb +0 -62
- data/test/test_gman_bin.rb +0 -75
- data/test/test_gman_country_codes.rb +0 -18
- data/test/test_gman_domains.rb +0 -33
- data/test/test_gman_filter.rb +0 -17
- data/test/test_gman_identifier.rb +0 -106
- data/test/test_gman_importer.rb +0 -250
- data/test/test_gman_locality.rb +0 -10
data/lib/gman/identifier.rb
CHANGED
@@ -1,9 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class Gman
|
4
|
+
# Defines an instance method that delegates to a hash's key
|
5
|
+
#
|
6
|
+
# hash_method - a symbol representing the instance method to delegate to. The
|
7
|
+
# instance method should return a hash or respond to #[]
|
8
|
+
# key - the key to call within the hash
|
9
|
+
# method - (optional) the instance method the key should be aliased to.
|
10
|
+
# If not specified, defaults to the hash key
|
11
|
+
# default - (optional) value to return if value is nil (defaults to nil)
|
12
|
+
#
|
13
|
+
# Returns a symbol representing the instance method
|
14
|
+
def self.def_hash_delegator(hash_method, key, method = nil, default = nil)
|
15
|
+
method ||= key.to_s.downcase.sub(' ', '_')
|
16
|
+
define_method(method) do
|
17
|
+
hash = send(hash_method)
|
18
|
+
if hash.respond_to? :[]
|
19
|
+
hash[key.to_s] || default
|
20
|
+
else
|
21
|
+
default
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def_hash_delegator :dotgov_listing, :Agency
|
27
|
+
def_hash_delegator :dotgov_listing, :Organization
|
28
|
+
def_hash_delegator :dotgov_listing, :City
|
29
|
+
def_hash_delegator :dotgov_listing, :"Domain Type"
|
30
|
+
private :domain_type
|
31
|
+
|
2
32
|
def type
|
3
|
-
[
|
33
|
+
%i[state district cog city federal county].each do |type|
|
4
34
|
return type if send "#{type}?"
|
5
35
|
end
|
6
36
|
return if list_category.nil?
|
37
|
+
|
7
38
|
if list_category.include?('usagov')
|
8
39
|
:unknown
|
9
40
|
else
|
@@ -14,7 +45,7 @@ class Gman
|
|
14
45
|
def state
|
15
46
|
if matches
|
16
47
|
matches[4].upcase
|
17
|
-
elsif dotgov_listing
|
48
|
+
elsif dotgov_listing['State']
|
18
49
|
dotgov_listing['State']
|
19
50
|
elsif list_category
|
20
51
|
matches = list_category.match(/usagov([A-Z]{2})/)
|
@@ -22,27 +53,23 @@ class Gman
|
|
22
53
|
end
|
23
54
|
end
|
24
55
|
|
25
|
-
def city
|
26
|
-
dotgov_listing['City'] if dotgov_listing
|
27
|
-
end
|
28
|
-
|
29
|
-
def agency
|
30
|
-
dotgov_listing['Agency'] if federal?
|
31
|
-
end
|
32
|
-
|
33
56
|
def dotgov?
|
34
57
|
domain.tld == 'gov'
|
35
58
|
end
|
36
59
|
|
37
60
|
def federal?
|
38
|
-
|
61
|
+
return false unless dotgov_listing
|
62
|
+
|
63
|
+
domain_type =~ /^Federal Agency/i
|
39
64
|
end
|
40
65
|
|
41
66
|
def city?
|
42
67
|
if matches
|
43
|
-
%w
|
68
|
+
%w[ci town vil].include?(matches[3])
|
44
69
|
elsif dotgov_listing
|
45
|
-
|
70
|
+
domain_type == 'City'
|
71
|
+
else
|
72
|
+
false
|
46
73
|
end
|
47
74
|
end
|
48
75
|
|
@@ -50,7 +77,9 @@ class Gman
|
|
50
77
|
if matches
|
51
78
|
matches[3] == 'co'
|
52
79
|
elsif dotgov_listing
|
53
|
-
|
80
|
+
domain_type == 'County'
|
81
|
+
else
|
82
|
+
false
|
54
83
|
end
|
55
84
|
end
|
56
85
|
|
@@ -58,40 +87,49 @@ class Gman
|
|
58
87
|
if matches
|
59
88
|
matches[1] == 'state'
|
60
89
|
elsif dotgov_listing
|
61
|
-
|
90
|
+
domain_type == 'State/Local Govt'
|
91
|
+
else
|
92
|
+
false
|
62
93
|
end
|
63
94
|
end
|
64
95
|
|
65
96
|
def district?
|
66
|
-
|
97
|
+
return false unless matches
|
98
|
+
|
99
|
+
matches[1] == 'dst'
|
67
100
|
end
|
68
101
|
|
69
102
|
def cog?
|
70
|
-
|
103
|
+
return false unless matches
|
104
|
+
|
105
|
+
matches[1] == 'cog'
|
71
106
|
end
|
72
107
|
|
73
108
|
private
|
74
109
|
|
75
110
|
def list_category
|
76
111
|
@list_category ||= begin
|
77
|
-
match = Gman.list.find(domain.to_s)
|
112
|
+
match = Gman.list.public_suffix_list.find(domain.to_s)
|
78
113
|
return unless match
|
79
|
-
|
80
|
-
|
114
|
+
|
115
|
+
regex = %r{// ([^\n]+)\n?[^/]*\n#{Regexp.escape(match.value)}\n}im
|
116
|
+
matches = Gman.list.contents.match(regex)
|
81
117
|
matches[1] if matches
|
82
118
|
end
|
83
119
|
end
|
84
120
|
|
85
121
|
def matches
|
86
122
|
return @matches if defined? @matches
|
123
|
+
|
87
124
|
@matches = domain.to_s.match(Locality::REGEX)
|
88
125
|
end
|
89
126
|
|
90
127
|
def dotgov_listing
|
91
128
|
return @dotgov_listing if defined? @dotgov_listing
|
92
129
|
return unless dotgov?
|
130
|
+
|
93
131
|
@dotgov_listing = Gman.dotgov_list.find do |listing|
|
94
|
-
listing['Domain Name'].casecmp("#{domain.sld}.gov")
|
132
|
+
listing['Domain Name'].casecmp("#{domain.sld}.gov").zero?
|
95
133
|
end
|
96
134
|
end
|
97
135
|
|
data/lib/gman/importer.rb
CHANGED
@@ -1,18 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Utility functions for parsing and manipulating public-suffix domain lists
|
2
4
|
# Only used in development and not loaded by default
|
3
5
|
require 'yaml'
|
4
6
|
require 'open-uri'
|
5
7
|
require 'resolv'
|
6
8
|
require 'logger'
|
9
|
+
require 'swot'
|
7
10
|
require_relative '../gman'
|
8
11
|
require_relative './domain_list'
|
9
12
|
|
10
13
|
class Gman
|
11
14
|
class Importer
|
12
|
-
attr_accessor :
|
15
|
+
attr_accessor :domain_list
|
13
16
|
|
14
17
|
# Known false positives from vendored lists
|
15
|
-
BLACKLIST = %w
|
18
|
+
BLACKLIST = %w[
|
16
19
|
business.centurytel.net
|
17
20
|
chesnee.net
|
18
21
|
citlink.net
|
@@ -38,31 +41,32 @@ class Gman
|
|
38
41
|
wctc.net
|
39
42
|
webconnections.net
|
40
43
|
webpages.charter.net
|
41
|
-
|
44
|
+
].freeze
|
42
45
|
|
43
46
|
REGEX_CHECKS = {
|
44
|
-
'home. regex'
|
45
|
-
'user. regex'
|
46
|
-
'sites. regex'
|
47
|
-
'weebly'
|
48
|
-
'wordpress'
|
49
|
-
'govoffice'
|
50
|
-
'homestead'
|
51
|
-
'wix.com'
|
52
|
-
'blogspot.com'
|
53
|
-
'tripod.com'
|
47
|
+
'home. regex' => /^home\./,
|
48
|
+
'user. regex' => /^users?\./,
|
49
|
+
'sites. regex' => /^sites?\./,
|
50
|
+
'weebly' => /weebly\.com$/,
|
51
|
+
'wordpress' => /wordpress\.com$/,
|
52
|
+
'govoffice' => /govoffice\d?\.com$/,
|
53
|
+
'homestead' => /homestead\.com$/,
|
54
|
+
'wix.com' => /wix\.com$/,
|
55
|
+
'blogspot.com' => /blogspot\.com$/,
|
56
|
+
'tripod.com' => /tripod\.com$/,
|
54
57
|
'squarespace.com' => /squarespace\.com$/,
|
55
|
-
'github.io'
|
56
|
-
'tumblr'
|
57
|
-
'locality'
|
58
|
+
'github.io' => /github\.io$/,
|
59
|
+
'tumblr' => /tumblr\.com$/,
|
60
|
+
'locality' => Gman::Locality::REGEX,
|
61
|
+
'french edu' => /^ac-.*?\.fr/
|
58
62
|
}.freeze
|
59
63
|
|
60
64
|
def initialize(domains)
|
61
|
-
@
|
65
|
+
@domain_list = DomainList.new(data: domains)
|
62
66
|
end
|
63
67
|
|
64
68
|
def logger
|
65
|
-
@logger ||= Logger.new(
|
69
|
+
@logger ||= Logger.new($stdout)
|
66
70
|
end
|
67
71
|
|
68
72
|
def normalize_domain(domain)
|
@@ -71,9 +75,10 @@ class Gman
|
|
71
75
|
end
|
72
76
|
|
73
77
|
def valid_domain?(domain, options = {})
|
74
|
-
return false unless ensure_valid(domain)
|
75
78
|
return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
|
79
|
+
return false unless ensure_valid(domain)
|
76
80
|
return false if !options[:skip_resolve] && !ensure_resolves(domain)
|
81
|
+
|
77
82
|
true
|
78
83
|
end
|
79
84
|
|
@@ -81,6 +86,7 @@ class Gman
|
|
81
86
|
# rather than a bool and silence log output
|
82
87
|
def reject(domain, reason)
|
83
88
|
return reason if ENV['RECONCILING']
|
89
|
+
|
84
90
|
logger.info "👎 `#{domain}`: #{reason}"
|
85
91
|
false
|
86
92
|
end
|
@@ -89,30 +95,26 @@ class Gman
|
|
89
95
|
@current ||= DomainList.current
|
90
96
|
end
|
91
97
|
|
92
|
-
def import(options)
|
98
|
+
def import(options = {})
|
93
99
|
logger.info "Current: #{Gman::DomainList.current.count} domains"
|
94
|
-
logger.info "Adding: #{
|
100
|
+
logger.info "Adding: #{domain_list.count} domains"
|
95
101
|
|
96
102
|
normalize_domains!
|
97
103
|
ensure_validity!(options)
|
98
104
|
|
99
|
-
if domains.count == 0
|
100
|
-
logger.info 'Nothing to add. Aborting'
|
101
|
-
exit 0
|
102
|
-
end
|
103
|
-
|
104
105
|
add_to_current
|
105
106
|
logger.info "New: #{current.count} domains"
|
106
107
|
end
|
107
108
|
|
108
109
|
def resolver
|
109
|
-
@resolver ||= Resolv::DNS.new(nameserver: ['
|
110
|
+
@resolver ||= Resolv::DNS.new(nameserver: ['1.1.1.1', '8.8.8.8'])
|
110
111
|
end
|
111
112
|
|
112
113
|
# Verifies that the given domain has an MX record, and thus is valid
|
113
114
|
def domain_resolves?(domain)
|
114
115
|
domain = Addressable::URI.new(host: domain).normalize.host
|
115
116
|
return true if ip?(domain)
|
117
|
+
|
116
118
|
returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
|
117
119
|
end
|
118
120
|
|
@@ -120,16 +122,17 @@ class Gman
|
|
120
122
|
|
121
123
|
def ensure_regex(domain)
|
122
124
|
REGEX_CHECKS.each do |msg, regex|
|
123
|
-
return reject(domain, msg) if domain
|
125
|
+
return reject(domain, msg) if domain&.match?(regex)
|
124
126
|
end
|
125
127
|
true
|
126
128
|
end
|
127
129
|
|
128
130
|
def ensure_valid(domain)
|
129
131
|
return false if domain.empty?
|
132
|
+
|
130
133
|
if BLACKLIST.include?(domain)
|
131
134
|
reject(domain, 'blacklist')
|
132
|
-
elsif !PublicSuffix.valid?(".#{domain}")
|
135
|
+
elsif !PublicSuffix.valid?("foo.#{domain}")
|
133
136
|
reject(domain, 'invalid')
|
134
137
|
elsif Swot.is_academic?(domain)
|
135
138
|
reject(domain, 'academic')
|
@@ -140,11 +143,13 @@ class Gman
|
|
140
143
|
|
141
144
|
def ensure_resolves(domain)
|
142
145
|
return reject(domain, 'unresolvable') unless domain_resolves?(domain)
|
146
|
+
|
143
147
|
true
|
144
148
|
end
|
145
149
|
|
146
150
|
def ensure_not_dupe(domain)
|
147
151
|
return true unless dupe?(domain)
|
152
|
+
|
148
153
|
if current.domains.include?(domain)
|
149
154
|
reject(domain, 'duplicate')
|
150
155
|
else
|
@@ -158,22 +163,22 @@ class Gman
|
|
158
163
|
end
|
159
164
|
|
160
165
|
def normalize_domains!
|
161
|
-
|
166
|
+
domain_list.to_h.each_value do |domains|
|
162
167
|
domains.map! { |domain| normalize_domain(domain) }
|
163
168
|
domains.uniq!
|
164
169
|
end
|
165
170
|
end
|
166
171
|
|
167
172
|
def ensure_validity!(options = {})
|
168
|
-
|
173
|
+
domain_list.data.each_value do |domains|
|
169
174
|
domains.select! { |domain| valid_domain?(domain, options) }
|
170
175
|
end
|
171
176
|
end
|
172
177
|
|
173
178
|
def add_to_current
|
174
|
-
|
175
|
-
current.
|
176
|
-
current.
|
179
|
+
domain_list.data.each do |group, domains|
|
180
|
+
current.data[group] ||= []
|
181
|
+
current.data[group].concat domains
|
177
182
|
end
|
178
183
|
current.write
|
179
184
|
end
|
@@ -192,9 +197,3 @@ class Gman
|
|
192
197
|
end
|
193
198
|
end
|
194
199
|
end
|
195
|
-
|
196
|
-
class Gman
|
197
|
-
def self.import(hash, options = {})
|
198
|
-
Gman::Importer.new(hash).import(options)
|
199
|
-
end
|
200
|
-
end
|
data/lib/gman/locality.rb
CHANGED
@@ -1,17 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class Gman
|
4
|
+
# Second level .us domains for states and locality
|
5
|
+
# See http://en.wikipedia.org/wiki/.us
|
6
|
+
#
|
7
|
+
# Examples:
|
8
|
+
# * foo.state.il.us
|
9
|
+
# * ci.foo.il.us
|
10
|
+
#
|
11
|
+
# Not:
|
12
|
+
# * state.foo.il.us
|
13
|
+
# * foo.ci.il.us
|
14
|
+
# * k12.il.us
|
15
|
+
# * ci.foo.zx.us
|
2
16
|
class Locality
|
3
|
-
AFFINITY_NAMESPACES = %w
|
17
|
+
AFFINITY_NAMESPACES = %w[state dst cog].freeze
|
4
18
|
|
5
|
-
STATES = %w
|
19
|
+
STATES = %w[
|
6
20
|
ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
|
7
21
|
la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
|
8
22
|
ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
|
9
|
-
|
23
|
+
].freeze
|
10
24
|
|
11
|
-
LOCALITY_DOMAINS = %w
|
25
|
+
LOCALITY_DOMAINS = %w[
|
12
26
|
ci co borough boro city county
|
13
27
|
parish town twp vi vil village
|
14
|
-
|
28
|
+
].freeze
|
15
29
|
|
16
30
|
REGEX = /
|
17
31
|
(
|
@@ -19,22 +33,10 @@ class Gman
|
|
19
33
|
|
|
20
34
|
(#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
|
21
35
|
)\.(#{Regexp.union(STATES)})\.us
|
22
|
-
/x
|
23
|
-
end
|
36
|
+
/x.freeze
|
24
37
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
# Examples:
|
29
|
-
# * foo.state.il.us
|
30
|
-
# * ci.foo.il.us
|
31
|
-
#
|
32
|
-
# Not:
|
33
|
-
# * state.foo.il.us
|
34
|
-
# * foo.ci.il.us
|
35
|
-
# * k12.il.us
|
36
|
-
# * ci.foo.zx.us
|
37
|
-
def locality?
|
38
|
-
!domain.to_s.match(Locality::REGEX).nil?
|
38
|
+
def self.valid?(domain)
|
39
|
+
!domain.to_s.match(Locality::REGEX).nil?
|
40
|
+
end
|
39
41
|
end
|
40
42
|
end
|
data/lib/gman/version.rb
CHANGED
data/script/add
CHANGED
data/script/alphabetize
CHANGED
data/script/cibuild
CHANGED
data/script/dedupe
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'yaml'
|
4
5
|
require 'open-uri'
|
@@ -12,7 +13,7 @@ puts "Current list contains #{current.count} domains..."
|
|
12
13
|
|
13
14
|
dupe = current.count - current.domains.uniq.count
|
14
15
|
puts "Found #{dupe} duplicate domains"
|
15
|
-
exit 0 if dupe
|
16
|
+
exit 0 if dupe.zero?
|
16
17
|
|
17
18
|
dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
|
18
19
|
|