gman 6.0.1 → 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rake_tasks +0 -0
- data/.rubocop.yml +3 -0
- data/config/domains.txt +19 -32
- data/config/vendor/academic.txt +8039 -0
- data/config/vendor/dotgovs.csv +5560 -5560
- data/gman.gemspec +1 -1
- data/lib/gman.rb +22 -20
- data/lib/gman/domain_list.rb +107 -41
- data/lib/gman/identifier.rb +2 -2
- data/lib/gman/importer.rb +10 -21
- data/lib/gman/locality.rb +15 -15
- data/lib/gman/version.rb +1 -1
- data/script/vendor +1 -1
- data/script/vendor-federal-de +1 -2
- data/script/vendor-municipal-de +1 -2
- data/script/vendor-nl +2 -1
- data/script/vendor-public-suffix +1 -2
- data/script/vendor-se +1 -2
- data/script/vendor-swot +41 -0
- data/script/vendor-us +3 -2
- data/test/helper.rb +8 -0
- data/test/test_gman.rb +0 -6
- data/test/test_gman_domain_list.rb +112 -0
- data/test/test_gman_domains.rb +5 -6
- data/test/test_gman_importer.rb +26 -32
- metadata +20 -15
data/gman.gemspec
CHANGED
@@ -25,11 +25,11 @@ Gem::Specification.new do |s|
|
|
25
25
|
s.require_paths = ['lib']
|
26
26
|
s.required_ruby_version = '~> 2.0'
|
27
27
|
|
28
|
-
s.add_dependency('swot', '~> 1.0')
|
29
28
|
s.add_dependency('iso_country_codes', '~> 0.6')
|
30
29
|
s.add_dependency('naughty_or_nice', '~> 2.0')
|
31
30
|
s.add_dependency('colored', '~> 1.2')
|
32
31
|
|
32
|
+
s.add_development_dependency('swot', '~> 1.0')
|
33
33
|
s.add_development_dependency('rake', '~> 10.4')
|
34
34
|
s.add_development_dependency('shoulda', '~> 3.5')
|
35
35
|
s.add_development_dependency('rdoc', '~> 4.2')
|
data/lib/gman.rb
CHANGED
@@ -1,38 +1,40 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
+
|
1
3
|
require 'naughty_or_nice'
|
2
4
|
require 'swot'
|
3
5
|
require 'iso_country_codes'
|
4
6
|
require 'csv'
|
5
7
|
require_relative 'gman/version'
|
6
8
|
require_relative 'gman/country_codes'
|
7
|
-
require_relative 'gman/locality'
|
8
9
|
require_relative 'gman/identifier'
|
9
10
|
|
10
11
|
class Gman
|
11
12
|
include NaughtyOrNice
|
12
13
|
|
14
|
+
autoload :DomainList, 'gman/domain_list'
|
15
|
+
autoload :Importer, 'gman/importer'
|
16
|
+
autoload :Locality, 'gman/locality'
|
17
|
+
|
13
18
|
class << self
|
14
|
-
# returns an instance of our custom public suffix list
|
15
|
-
# list behaves like PublicSuffix::List
|
16
|
-
# but is limited to our whitelisted domains
|
17
19
|
def list
|
18
|
-
@list ||=
|
20
|
+
@list ||= DomainList.new(path: list_path)
|
21
|
+
end
|
22
|
+
|
23
|
+
def academic_list
|
24
|
+
@academic_list ||= DomainList.new(path: academic_list_path)
|
19
25
|
end
|
20
26
|
|
21
27
|
def config_path
|
22
|
-
File.expand_path '../config', File.dirname(__FILE__)
|
28
|
+
@config_path ||= File.expand_path '../config', File.dirname(__FILE__)
|
23
29
|
end
|
24
30
|
|
25
31
|
# Returns the absolute path to the domain list
|
26
32
|
def list_path
|
27
|
-
|
28
|
-
File.expand_path '../test/fixtures/domains.txt', File.dirname(__FILE__)
|
29
|
-
else
|
30
|
-
File.expand_path 'domains.txt', config_path
|
31
|
-
end
|
33
|
+
File.expand_path 'domains.txt', config_path
|
32
34
|
end
|
33
35
|
|
34
|
-
def
|
35
|
-
|
36
|
+
def academic_list_path
|
37
|
+
File.expand_path 'vendor/academic.txt', config_path
|
36
38
|
end
|
37
39
|
end
|
38
40
|
|
@@ -47,21 +49,21 @@ class Gman
|
|
47
49
|
end
|
48
50
|
end
|
49
51
|
|
52
|
+
def locality?
|
53
|
+
Locality.valid?(domain)
|
54
|
+
end
|
55
|
+
|
50
56
|
private
|
51
57
|
|
52
58
|
def valid_domain?
|
53
|
-
domain && domain.valid? && !academic?
|
59
|
+
@valid_domains ||= domain && domain.valid? && !academic?
|
54
60
|
end
|
55
61
|
|
56
62
|
def academic?
|
57
|
-
domain &&
|
63
|
+
@academic ||= domain && Gman.academic_list.valid?(to_s)
|
58
64
|
end
|
59
65
|
|
60
|
-
# domain is on the domain list and
|
61
|
-
# domain is not explicitly blacklisted and
|
62
|
-
# domain matches a standard public suffix list rule
|
63
66
|
def public_suffix_valid?
|
64
|
-
|
65
|
-
!rule.nil? && rule.type != :exception && rule.allow?(".#{domain}")
|
67
|
+
@public_suffix_valid ||= Gman.list.valid?(to_s)
|
66
68
|
end
|
67
69
|
end
|
data/lib/gman/domain_list.rb
CHANGED
@@ -1,39 +1,102 @@
|
|
1
1
|
class Gman
|
2
2
|
class DomainList
|
3
|
-
attr_accessor :list
|
4
|
-
alias to_h list
|
5
|
-
|
6
3
|
COMMENT_REGEX = %r{//[/\s]*(.*)$}i
|
7
4
|
|
8
|
-
|
9
|
-
|
5
|
+
attr_writer :data, :path, :contents
|
6
|
+
|
7
|
+
class << self
|
8
|
+
# The current, government domain list
|
9
|
+
def current
|
10
|
+
DomainList.new(path: Gman.list_path)
|
11
|
+
end
|
12
|
+
|
13
|
+
def from_file(path)
|
14
|
+
DomainList.new(path: path)
|
15
|
+
end
|
16
|
+
|
17
|
+
def from_hash(hash)
|
18
|
+
DomainList.new(data: hash)
|
19
|
+
end
|
20
|
+
|
21
|
+
def from_public_suffix(string)
|
22
|
+
DomainList.new(contents: string)
|
23
|
+
end
|
24
|
+
alias from_string from_public_suffix
|
25
|
+
end
|
26
|
+
|
27
|
+
def initialize(path: nil, contents: nil, data: nil)
|
28
|
+
@path = path
|
29
|
+
@contents = contents
|
30
|
+
@data = data.reject { |_, domains| domains.compact.empty? } if data
|
31
|
+
end
|
32
|
+
|
33
|
+
# Returns the raw content of the domain list as a string
|
34
|
+
def contents
|
35
|
+
@contents ||= if path
|
36
|
+
File.new(path, 'r:utf-8').read
|
37
|
+
else
|
38
|
+
to_s
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# Returns the parsed contents of the domain list as a hash
|
43
|
+
# in the form for group => domains
|
44
|
+
def data
|
45
|
+
@data ||= string_to_hash(contents)
|
46
|
+
end
|
47
|
+
alias to_h data
|
48
|
+
|
49
|
+
# Returns the path to the domain list on disk
|
50
|
+
def path
|
51
|
+
@path ||= Gman.list_path
|
52
|
+
end
|
53
|
+
|
54
|
+
# returns an instance of our custom public suffix list
|
55
|
+
# list behaves like PublicSuffix::List
|
56
|
+
# but is limited to our whitelisted domains
|
57
|
+
def public_suffix_list
|
58
|
+
@public_suffix_list ||= PublicSuffix::List.parse(contents)
|
59
|
+
end
|
60
|
+
|
61
|
+
# domain is on the domain list and
|
62
|
+
# domain is not explicitly blacklisted and
|
63
|
+
# domain matches a standard public suffix list rule
|
64
|
+
def valid?(domain)
|
65
|
+
rule = public_suffix_list.find(domain)
|
66
|
+
!rule.nil? && rule.type != :exception && rule.allow?(".#{domain}")
|
10
67
|
end
|
11
68
|
|
69
|
+
# Returns an array of strings representing the list groups
|
12
70
|
def groups
|
13
|
-
|
71
|
+
data.keys
|
14
72
|
end
|
15
73
|
|
74
|
+
# Return an array of strings representing all domains on the list
|
16
75
|
def domains
|
17
|
-
|
76
|
+
data.values.flatten.compact.sort.uniq
|
18
77
|
end
|
19
78
|
|
79
|
+
# Return the total number of domains in the list
|
20
80
|
def count
|
21
81
|
domains.count
|
22
82
|
end
|
23
83
|
|
84
|
+
# Alphabetize groups and domains within each group
|
24
85
|
def alphabetize
|
25
|
-
@
|
26
|
-
@
|
86
|
+
@data = data.sort_by { |k, _v| k.downcase }.to_h
|
87
|
+
@data.each { |_group, domains| domains.sort!.uniq! }
|
27
88
|
end
|
28
89
|
|
90
|
+
# Write the domain list to disk
|
29
91
|
def write
|
30
92
|
alphabetize
|
31
|
-
File.write(
|
93
|
+
File.write(path, to_public_suffix)
|
32
94
|
end
|
33
95
|
|
34
|
-
|
96
|
+
# The string representation of the domain list, in public suffix format
|
97
|
+
def to_s
|
35
98
|
current_group = output = ''
|
36
|
-
|
99
|
+
data.sort_by { |group, _| group.downcase }.each do |group, domains|
|
37
100
|
if group != current_group
|
38
101
|
output << "\n\n" unless current_group.empty? # first entry
|
39
102
|
output << "// #{group}\n"
|
@@ -43,45 +106,48 @@ class Gman
|
|
43
106
|
end
|
44
107
|
output
|
45
108
|
end
|
109
|
+
alias to_public_suffix to_s
|
46
110
|
|
47
|
-
|
48
|
-
|
49
|
-
|
111
|
+
# Given a domain, find any domain on the list that includes that domain
|
112
|
+
# E.g., `fcc.gov` would be the parent of `data.fcc.gov`
|
113
|
+
def parent_domain(domain)
|
114
|
+
domains.find { |c| domain =~ /\.#{Regexp.escape(c)}$/ }
|
50
115
|
end
|
51
116
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
117
|
+
private
|
118
|
+
|
119
|
+
# Parse a public-suffix formatted string into a hash of groups => [domains]
|
120
|
+
def string_to_hash(string)
|
121
|
+
return unless string
|
122
|
+
lines = string_to_array(string)
|
123
|
+
array_to_hash(lines)
|
56
124
|
end
|
57
125
|
|
58
|
-
def
|
59
|
-
|
126
|
+
def string_to_array(string)
|
127
|
+
string.gsub(/\r\n?/, "\n").split("\n")
|
60
128
|
end
|
61
129
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
domains.each do |line|
|
71
|
-
if line =~ COMMENT_REGEX
|
72
|
-
group = COMMENT_REGEX.match(line)[1]
|
73
|
-
else
|
74
|
-
safe_push(domain_hash, group, line.downcase)
|
75
|
-
end
|
130
|
+
def array_to_hash(lines)
|
131
|
+
domain_hash = {}
|
132
|
+
group = ''
|
133
|
+
lines.each do |line|
|
134
|
+
if line =~ COMMENT_REGEX
|
135
|
+
group = COMMENT_REGEX.match(line)[1]
|
136
|
+
else
|
137
|
+
safe_push(domain_hash, group, line.downcase)
|
76
138
|
end
|
77
|
-
domain_hash
|
78
139
|
end
|
140
|
+
domain_hash
|
141
|
+
end
|
79
142
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
143
|
+
# Add a value to an array in a hash, creating the array if necessary
|
144
|
+
# hash - the hash
|
145
|
+
# key - the key within that hash to add the value to
|
146
|
+
# value - the single value to push into the array at hash[key]
|
147
|
+
def safe_push(hash, key, value)
|
148
|
+
return if value.empty?
|
149
|
+
hash[key] ||= []
|
150
|
+
hash[key].push value
|
85
151
|
end
|
86
152
|
end
|
87
153
|
end
|
data/lib/gman/identifier.rb
CHANGED
@@ -74,10 +74,10 @@ class Gman
|
|
74
74
|
|
75
75
|
def list_category
|
76
76
|
@list_category ||= begin
|
77
|
-
match = Gman.list.find(domain.to_s)
|
77
|
+
match = Gman.list.public_suffix_list.find(domain.to_s)
|
78
78
|
return unless match
|
79
79
|
regex = %r{// ([^\n]+)\n?[^/]*\n#{Regexp.escape(match.name)}\n}im
|
80
|
-
matches = Gman.
|
80
|
+
matches = Gman.list.contents.match(regex)
|
81
81
|
matches[1] if matches
|
82
82
|
end
|
83
83
|
end
|
data/lib/gman/importer.rb
CHANGED
@@ -9,7 +9,7 @@ require_relative './domain_list'
|
|
9
9
|
|
10
10
|
class Gman
|
11
11
|
class Importer
|
12
|
-
attr_accessor :
|
12
|
+
attr_accessor :domain_list
|
13
13
|
|
14
14
|
# Known false positives from vendored lists
|
15
15
|
BLACKLIST = %w(
|
@@ -58,7 +58,7 @@ class Gman
|
|
58
58
|
}.freeze
|
59
59
|
|
60
60
|
def initialize(domains)
|
61
|
-
@
|
61
|
+
@domain_list = DomainList.new(data: domains)
|
62
62
|
end
|
63
63
|
|
64
64
|
def logger
|
@@ -71,8 +71,8 @@ class Gman
|
|
71
71
|
end
|
72
72
|
|
73
73
|
def valid_domain?(domain, options = {})
|
74
|
-
return false unless ensure_valid(domain)
|
75
74
|
return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
|
75
|
+
return false unless ensure_valid(domain)
|
76
76
|
return false if !options[:skip_resolve] && !ensure_resolves(domain)
|
77
77
|
true
|
78
78
|
end
|
@@ -89,18 +89,13 @@ class Gman
|
|
89
89
|
@current ||= DomainList.current
|
90
90
|
end
|
91
91
|
|
92
|
-
def import(options)
|
92
|
+
def import(options = {})
|
93
93
|
logger.info "Current: #{Gman::DomainList.current.count} domains"
|
94
|
-
logger.info "Adding: #{
|
94
|
+
logger.info "Adding: #{domain_list.count} domains"
|
95
95
|
|
96
96
|
normalize_domains!
|
97
97
|
ensure_validity!(options)
|
98
98
|
|
99
|
-
if domains.count == 0
|
100
|
-
logger.info 'Nothing to add. Aborting'
|
101
|
-
exit 0
|
102
|
-
end
|
103
|
-
|
104
99
|
add_to_current
|
105
100
|
logger.info "New: #{current.count} domains"
|
106
101
|
end
|
@@ -158,22 +153,22 @@ class Gman
|
|
158
153
|
end
|
159
154
|
|
160
155
|
def normalize_domains!
|
161
|
-
|
156
|
+
domain_list.to_h.each do |_group, domains|
|
162
157
|
domains.map! { |domain| normalize_domain(domain) }
|
163
158
|
domains.uniq!
|
164
159
|
end
|
165
160
|
end
|
166
161
|
|
167
162
|
def ensure_validity!(options = {})
|
168
|
-
|
163
|
+
domain_list.data.each do |_group, domains|
|
169
164
|
domains.select! { |domain| valid_domain?(domain, options) }
|
170
165
|
end
|
171
166
|
end
|
172
167
|
|
173
168
|
def add_to_current
|
174
|
-
|
175
|
-
current.
|
176
|
-
current.
|
169
|
+
domain_list.data.each do |group, domains|
|
170
|
+
current.data[group] ||= []
|
171
|
+
current.data[group].concat domains
|
177
172
|
end
|
178
173
|
current.write
|
179
174
|
end
|
@@ -192,9 +187,3 @@ class Gman
|
|
192
187
|
end
|
193
188
|
end
|
194
189
|
end
|
195
|
-
|
196
|
-
class Gman
|
197
|
-
def self.import(hash, options = {})
|
198
|
-
Gman::Importer.new(hash).import(options)
|
199
|
-
end
|
200
|
-
end
|
data/lib/gman/locality.rb
CHANGED
@@ -1,4 +1,16 @@
|
|
1
1
|
class Gman
|
2
|
+
# Second level .us domains for states and locality
|
3
|
+
# See http://en.wikipedia.org/wiki/.us
|
4
|
+
#
|
5
|
+
# Examples:
|
6
|
+
# * foo.state.il.us
|
7
|
+
# * ci.foo.il.us
|
8
|
+
#
|
9
|
+
# Not:
|
10
|
+
# * state.foo.il.us
|
11
|
+
# * foo.ci.il.us
|
12
|
+
# * k12.il.us
|
13
|
+
# * ci.foo.zx.us
|
2
14
|
class Locality
|
3
15
|
AFFINITY_NAMESPACES = %w(state dst cog).freeze
|
4
16
|
|
@@ -20,21 +32,9 @@ class Gman
|
|
20
32
|
(#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
|
21
33
|
)\.(#{Regexp.union(STATES)})\.us
|
22
34
|
/x
|
23
|
-
end
|
24
35
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
# Examples:
|
29
|
-
# * foo.state.il.us
|
30
|
-
# * ci.foo.il.us
|
31
|
-
#
|
32
|
-
# Not:
|
33
|
-
# * state.foo.il.us
|
34
|
-
# * foo.ci.il.us
|
35
|
-
# * k12.il.us
|
36
|
-
# * ci.foo.zx.us
|
37
|
-
def locality?
|
38
|
-
!domain.to_s.match(Locality::REGEX).nil?
|
36
|
+
def self.valid?(domain)
|
37
|
+
!domain.to_s.match(Locality::REGEX).nil?
|
38
|
+
end
|
39
39
|
end
|
40
40
|
end
|
data/lib/gman/version.rb
CHANGED
data/script/vendor
CHANGED
data/script/vendor-federal-de
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
require 'csv'
|
4
4
|
require 'open-uri'
|
5
5
|
require './lib/gman'
|
6
|
-
require './lib/gman/importer'
|
7
6
|
|
8
7
|
url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
|
9
8
|
|
@@ -11,4 +10,4 @@ domains = open(url).read.encode('UTF-8')
|
|
11
10
|
domains = CSV.parse(domains, headers: true)
|
12
11
|
domains = domains.map { |row| row['Domain Name'] }
|
13
12
|
|
14
|
-
Gman.
|
13
|
+
Gman::Importer.new('German Federal' => domains).import
|