gman 6.0.1 → 7.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rake_tasks +0 -0
- data/.rubocop.yml +3 -0
- data/config/domains.txt +19 -32
- data/config/vendor/academic.txt +8039 -0
- data/config/vendor/dotgovs.csv +5560 -5560
- data/gman.gemspec +1 -1
- data/lib/gman.rb +22 -20
- data/lib/gman/domain_list.rb +107 -41
- data/lib/gman/identifier.rb +2 -2
- data/lib/gman/importer.rb +10 -21
- data/lib/gman/locality.rb +15 -15
- data/lib/gman/version.rb +1 -1
- data/script/vendor +1 -1
- data/script/vendor-federal-de +1 -2
- data/script/vendor-municipal-de +1 -2
- data/script/vendor-nl +2 -1
- data/script/vendor-public-suffix +1 -2
- data/script/vendor-se +1 -2
- data/script/vendor-swot +41 -0
- data/script/vendor-us +3 -2
- data/test/helper.rb +8 -0
- data/test/test_gman.rb +0 -6
- data/test/test_gman_domain_list.rb +112 -0
- data/test/test_gman_domains.rb +5 -6
- data/test/test_gman_importer.rb +26 -32
- metadata +20 -15
data/gman.gemspec
CHANGED
@@ -25,11 +25,11 @@ Gem::Specification.new do |s|
|
|
25
25
|
s.require_paths = ['lib']
|
26
26
|
s.required_ruby_version = '~> 2.0'
|
27
27
|
|
28
|
-
s.add_dependency('swot', '~> 1.0')
|
29
28
|
s.add_dependency('iso_country_codes', '~> 0.6')
|
30
29
|
s.add_dependency('naughty_or_nice', '~> 2.0')
|
31
30
|
s.add_dependency('colored', '~> 1.2')
|
32
31
|
|
32
|
+
s.add_development_dependency('swot', '~> 1.0')
|
33
33
|
s.add_development_dependency('rake', '~> 10.4')
|
34
34
|
s.add_development_dependency('shoulda', '~> 3.5')
|
35
35
|
s.add_development_dependency('rdoc', '~> 4.2')
|
data/lib/gman.rb
CHANGED
@@ -1,38 +1,40 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
+
|
1
3
|
require 'naughty_or_nice'
|
2
4
|
require 'swot'
|
3
5
|
require 'iso_country_codes'
|
4
6
|
require 'csv'
|
5
7
|
require_relative 'gman/version'
|
6
8
|
require_relative 'gman/country_codes'
|
7
|
-
require_relative 'gman/locality'
|
8
9
|
require_relative 'gman/identifier'
|
9
10
|
|
10
11
|
class Gman
|
11
12
|
include NaughtyOrNice
|
12
13
|
|
14
|
+
autoload :DomainList, 'gman/domain_list'
|
15
|
+
autoload :Importer, 'gman/importer'
|
16
|
+
autoload :Locality, 'gman/locality'
|
17
|
+
|
13
18
|
class << self
|
14
|
-
# returns an instance of our custom public suffix list
|
15
|
-
# list behaves like PublicSuffix::List
|
16
|
-
# but is limited to our whitelisted domains
|
17
19
|
def list
|
18
|
-
@list ||=
|
20
|
+
@list ||= DomainList.new(path: list_path)
|
21
|
+
end
|
22
|
+
|
23
|
+
def academic_list
|
24
|
+
@academic_list ||= DomainList.new(path: academic_list_path)
|
19
25
|
end
|
20
26
|
|
21
27
|
def config_path
|
22
|
-
File.expand_path '../config', File.dirname(__FILE__)
|
28
|
+
@config_path ||= File.expand_path '../config', File.dirname(__FILE__)
|
23
29
|
end
|
24
30
|
|
25
31
|
# Returns the absolute path to the domain list
|
26
32
|
def list_path
|
27
|
-
|
28
|
-
File.expand_path '../test/fixtures/domains.txt', File.dirname(__FILE__)
|
29
|
-
else
|
30
|
-
File.expand_path 'domains.txt', config_path
|
31
|
-
end
|
33
|
+
File.expand_path 'domains.txt', config_path
|
32
34
|
end
|
33
35
|
|
34
|
-
def
|
35
|
-
|
36
|
+
def academic_list_path
|
37
|
+
File.expand_path 'vendor/academic.txt', config_path
|
36
38
|
end
|
37
39
|
end
|
38
40
|
|
@@ -47,21 +49,21 @@ class Gman
|
|
47
49
|
end
|
48
50
|
end
|
49
51
|
|
52
|
+
def locality?
|
53
|
+
Locality.valid?(domain)
|
54
|
+
end
|
55
|
+
|
50
56
|
private
|
51
57
|
|
52
58
|
def valid_domain?
|
53
|
-
domain && domain.valid? && !academic?
|
59
|
+
@valid_domains ||= domain && domain.valid? && !academic?
|
54
60
|
end
|
55
61
|
|
56
62
|
def academic?
|
57
|
-
domain &&
|
63
|
+
@academic ||= domain && Gman.academic_list.valid?(to_s)
|
58
64
|
end
|
59
65
|
|
60
|
-
# domain is on the domain list and
|
61
|
-
# domain is not explicitly blacklisted and
|
62
|
-
# domain matches a standard public suffix list rule
|
63
66
|
def public_suffix_valid?
|
64
|
-
|
65
|
-
!rule.nil? && rule.type != :exception && rule.allow?(".#{domain}")
|
67
|
+
@public_suffix_valid ||= Gman.list.valid?(to_s)
|
66
68
|
end
|
67
69
|
end
|
data/lib/gman/domain_list.rb
CHANGED
@@ -1,39 +1,102 @@
|
|
1
1
|
class Gman
|
2
2
|
class DomainList
|
3
|
-
attr_accessor :list
|
4
|
-
alias to_h list
|
5
|
-
|
6
3
|
COMMENT_REGEX = %r{//[/\s]*(.*)$}i
|
7
4
|
|
8
|
-
|
9
|
-
|
5
|
+
attr_writer :data, :path, :contents
|
6
|
+
|
7
|
+
class << self
|
8
|
+
# The current, government domain list
|
9
|
+
def current
|
10
|
+
DomainList.new(path: Gman.list_path)
|
11
|
+
end
|
12
|
+
|
13
|
+
def from_file(path)
|
14
|
+
DomainList.new(path: path)
|
15
|
+
end
|
16
|
+
|
17
|
+
def from_hash(hash)
|
18
|
+
DomainList.new(data: hash)
|
19
|
+
end
|
20
|
+
|
21
|
+
def from_public_suffix(string)
|
22
|
+
DomainList.new(contents: string)
|
23
|
+
end
|
24
|
+
alias from_string from_public_suffix
|
25
|
+
end
|
26
|
+
|
27
|
+
def initialize(path: nil, contents: nil, data: nil)
|
28
|
+
@path = path
|
29
|
+
@contents = contents
|
30
|
+
@data = data.reject { |_, domains| domains.compact.empty? } if data
|
31
|
+
end
|
32
|
+
|
33
|
+
# Returns the raw content of the domain list as a string
|
34
|
+
def contents
|
35
|
+
@contents ||= if path
|
36
|
+
File.new(path, 'r:utf-8').read
|
37
|
+
else
|
38
|
+
to_s
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# Returns the parsed contents of the domain list as a hash
|
43
|
+
# in the form for group => domains
|
44
|
+
def data
|
45
|
+
@data ||= string_to_hash(contents)
|
46
|
+
end
|
47
|
+
alias to_h data
|
48
|
+
|
49
|
+
# Returns the path to the domain list on disk
|
50
|
+
def path
|
51
|
+
@path ||= Gman.list_path
|
52
|
+
end
|
53
|
+
|
54
|
+
# returns an instance of our custom public suffix list
|
55
|
+
# list behaves like PublicSuffix::List
|
56
|
+
# but is limited to our whitelisted domains
|
57
|
+
def public_suffix_list
|
58
|
+
@public_suffix_list ||= PublicSuffix::List.parse(contents)
|
59
|
+
end
|
60
|
+
|
61
|
+
# domain is on the domain list and
|
62
|
+
# domain is not explicitly blacklisted and
|
63
|
+
# domain matches a standard public suffix list rule
|
64
|
+
def valid?(domain)
|
65
|
+
rule = public_suffix_list.find(domain)
|
66
|
+
!rule.nil? && rule.type != :exception && rule.allow?(".#{domain}")
|
10
67
|
end
|
11
68
|
|
69
|
+
# Returns an array of strings representing the list groups
|
12
70
|
def groups
|
13
|
-
|
71
|
+
data.keys
|
14
72
|
end
|
15
73
|
|
74
|
+
# Return an array of strings representing all domains on the list
|
16
75
|
def domains
|
17
|
-
|
76
|
+
data.values.flatten.compact.sort.uniq
|
18
77
|
end
|
19
78
|
|
79
|
+
# Return the total number of domains in the list
|
20
80
|
def count
|
21
81
|
domains.count
|
22
82
|
end
|
23
83
|
|
84
|
+
# Alphabetize groups and domains within each group
|
24
85
|
def alphabetize
|
25
|
-
@
|
26
|
-
@
|
86
|
+
@data = data.sort_by { |k, _v| k.downcase }.to_h
|
87
|
+
@data.each { |_group, domains| domains.sort!.uniq! }
|
27
88
|
end
|
28
89
|
|
90
|
+
# Write the domain list to disk
|
29
91
|
def write
|
30
92
|
alphabetize
|
31
|
-
File.write(
|
93
|
+
File.write(path, to_public_suffix)
|
32
94
|
end
|
33
95
|
|
34
|
-
|
96
|
+
# The string representation of the domain list, in public suffix format
|
97
|
+
def to_s
|
35
98
|
current_group = output = ''
|
36
|
-
|
99
|
+
data.sort_by { |group, _| group.downcase }.each do |group, domains|
|
37
100
|
if group != current_group
|
38
101
|
output << "\n\n" unless current_group.empty? # first entry
|
39
102
|
output << "// #{group}\n"
|
@@ -43,45 +106,48 @@ class Gman
|
|
43
106
|
end
|
44
107
|
output
|
45
108
|
end
|
109
|
+
alias to_public_suffix to_s
|
46
110
|
|
47
|
-
|
48
|
-
|
49
|
-
|
111
|
+
# Given a domain, find any domain on the list that includes that domain
|
112
|
+
# E.g., `fcc.gov` would be the parent of `data.fcc.gov`
|
113
|
+
def parent_domain(domain)
|
114
|
+
domains.find { |c| domain =~ /\.#{Regexp.escape(c)}$/ }
|
50
115
|
end
|
51
116
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
117
|
+
private
|
118
|
+
|
119
|
+
# Parse a public-suffix formatted string into a hash of groups => [domains]
|
120
|
+
def string_to_hash(string)
|
121
|
+
return unless string
|
122
|
+
lines = string_to_array(string)
|
123
|
+
array_to_hash(lines)
|
56
124
|
end
|
57
125
|
|
58
|
-
def
|
59
|
-
|
126
|
+
def string_to_array(string)
|
127
|
+
string.gsub(/\r\n?/, "\n").split("\n")
|
60
128
|
end
|
61
129
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
domains.each do |line|
|
71
|
-
if line =~ COMMENT_REGEX
|
72
|
-
group = COMMENT_REGEX.match(line)[1]
|
73
|
-
else
|
74
|
-
safe_push(domain_hash, group, line.downcase)
|
75
|
-
end
|
130
|
+
def array_to_hash(lines)
|
131
|
+
domain_hash = {}
|
132
|
+
group = ''
|
133
|
+
lines.each do |line|
|
134
|
+
if line =~ COMMENT_REGEX
|
135
|
+
group = COMMENT_REGEX.match(line)[1]
|
136
|
+
else
|
137
|
+
safe_push(domain_hash, group, line.downcase)
|
76
138
|
end
|
77
|
-
domain_hash
|
78
139
|
end
|
140
|
+
domain_hash
|
141
|
+
end
|
79
142
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
143
|
+
# Add a value to an array in a hash, creating the array if necessary
|
144
|
+
# hash - the hash
|
145
|
+
# key - the key within that hash to add the value to
|
146
|
+
# value - the single value to push into the array at hash[key]
|
147
|
+
def safe_push(hash, key, value)
|
148
|
+
return if value.empty?
|
149
|
+
hash[key] ||= []
|
150
|
+
hash[key].push value
|
85
151
|
end
|
86
152
|
end
|
87
153
|
end
|
data/lib/gman/identifier.rb
CHANGED
@@ -74,10 +74,10 @@ class Gman
|
|
74
74
|
|
75
75
|
def list_category
|
76
76
|
@list_category ||= begin
|
77
|
-
match = Gman.list.find(domain.to_s)
|
77
|
+
match = Gman.list.public_suffix_list.find(domain.to_s)
|
78
78
|
return unless match
|
79
79
|
regex = %r{// ([^\n]+)\n?[^/]*\n#{Regexp.escape(match.name)}\n}im
|
80
|
-
matches = Gman.
|
80
|
+
matches = Gman.list.contents.match(regex)
|
81
81
|
matches[1] if matches
|
82
82
|
end
|
83
83
|
end
|
data/lib/gman/importer.rb
CHANGED
@@ -9,7 +9,7 @@ require_relative './domain_list'
|
|
9
9
|
|
10
10
|
class Gman
|
11
11
|
class Importer
|
12
|
-
attr_accessor :
|
12
|
+
attr_accessor :domain_list
|
13
13
|
|
14
14
|
# Known false positives from vendored lists
|
15
15
|
BLACKLIST = %w(
|
@@ -58,7 +58,7 @@ class Gman
|
|
58
58
|
}.freeze
|
59
59
|
|
60
60
|
def initialize(domains)
|
61
|
-
@
|
61
|
+
@domain_list = DomainList.new(data: domains)
|
62
62
|
end
|
63
63
|
|
64
64
|
def logger
|
@@ -71,8 +71,8 @@ class Gman
|
|
71
71
|
end
|
72
72
|
|
73
73
|
def valid_domain?(domain, options = {})
|
74
|
-
return false unless ensure_valid(domain)
|
75
74
|
return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
|
75
|
+
return false unless ensure_valid(domain)
|
76
76
|
return false if !options[:skip_resolve] && !ensure_resolves(domain)
|
77
77
|
true
|
78
78
|
end
|
@@ -89,18 +89,13 @@ class Gman
|
|
89
89
|
@current ||= DomainList.current
|
90
90
|
end
|
91
91
|
|
92
|
-
def import(options)
|
92
|
+
def import(options = {})
|
93
93
|
logger.info "Current: #{Gman::DomainList.current.count} domains"
|
94
|
-
logger.info "Adding: #{
|
94
|
+
logger.info "Adding: #{domain_list.count} domains"
|
95
95
|
|
96
96
|
normalize_domains!
|
97
97
|
ensure_validity!(options)
|
98
98
|
|
99
|
-
if domains.count == 0
|
100
|
-
logger.info 'Nothing to add. Aborting'
|
101
|
-
exit 0
|
102
|
-
end
|
103
|
-
|
104
99
|
add_to_current
|
105
100
|
logger.info "New: #{current.count} domains"
|
106
101
|
end
|
@@ -158,22 +153,22 @@ class Gman
|
|
158
153
|
end
|
159
154
|
|
160
155
|
def normalize_domains!
|
161
|
-
|
156
|
+
domain_list.to_h.each do |_group, domains|
|
162
157
|
domains.map! { |domain| normalize_domain(domain) }
|
163
158
|
domains.uniq!
|
164
159
|
end
|
165
160
|
end
|
166
161
|
|
167
162
|
def ensure_validity!(options = {})
|
168
|
-
|
163
|
+
domain_list.data.each do |_group, domains|
|
169
164
|
domains.select! { |domain| valid_domain?(domain, options) }
|
170
165
|
end
|
171
166
|
end
|
172
167
|
|
173
168
|
def add_to_current
|
174
|
-
|
175
|
-
current.
|
176
|
-
current.
|
169
|
+
domain_list.data.each do |group, domains|
|
170
|
+
current.data[group] ||= []
|
171
|
+
current.data[group].concat domains
|
177
172
|
end
|
178
173
|
current.write
|
179
174
|
end
|
@@ -192,9 +187,3 @@ class Gman
|
|
192
187
|
end
|
193
188
|
end
|
194
189
|
end
|
195
|
-
|
196
|
-
class Gman
|
197
|
-
def self.import(hash, options = {})
|
198
|
-
Gman::Importer.new(hash).import(options)
|
199
|
-
end
|
200
|
-
end
|
data/lib/gman/locality.rb
CHANGED
@@ -1,4 +1,16 @@
|
|
1
1
|
class Gman
|
2
|
+
# Second level .us domains for states and locality
|
3
|
+
# See http://en.wikipedia.org/wiki/.us
|
4
|
+
#
|
5
|
+
# Examples:
|
6
|
+
# * foo.state.il.us
|
7
|
+
# * ci.foo.il.us
|
8
|
+
#
|
9
|
+
# Not:
|
10
|
+
# * state.foo.il.us
|
11
|
+
# * foo.ci.il.us
|
12
|
+
# * k12.il.us
|
13
|
+
# * ci.foo.zx.us
|
2
14
|
class Locality
|
3
15
|
AFFINITY_NAMESPACES = %w(state dst cog).freeze
|
4
16
|
|
@@ -20,21 +32,9 @@ class Gman
|
|
20
32
|
(#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
|
21
33
|
)\.(#{Regexp.union(STATES)})\.us
|
22
34
|
/x
|
23
|
-
end
|
24
35
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
# Examples:
|
29
|
-
# * foo.state.il.us
|
30
|
-
# * ci.foo.il.us
|
31
|
-
#
|
32
|
-
# Not:
|
33
|
-
# * state.foo.il.us
|
34
|
-
# * foo.ci.il.us
|
35
|
-
# * k12.il.us
|
36
|
-
# * ci.foo.zx.us
|
37
|
-
def locality?
|
38
|
-
!domain.to_s.match(Locality::REGEX).nil?
|
36
|
+
def self.valid?(domain)
|
37
|
+
!domain.to_s.match(Locality::REGEX).nil?
|
38
|
+
end
|
39
39
|
end
|
40
40
|
end
|
data/lib/gman/version.rb
CHANGED
data/script/vendor
CHANGED
data/script/vendor-federal-de
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
require 'csv'
|
4
4
|
require 'open-uri'
|
5
5
|
require './lib/gman'
|
6
|
-
require './lib/gman/importer'
|
7
6
|
|
8
7
|
url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
|
9
8
|
|
@@ -11,4 +10,4 @@ domains = open(url).read.encode('UTF-8')
|
|
11
10
|
domains = CSV.parse(domains, headers: true)
|
12
11
|
domains = domains.map { |row| row['Domain Name'] }
|
13
12
|
|
14
|
-
Gman.
|
13
|
+
Gman::Importer.new('German Federal' => domains).import
|