gman 6.0.1 → 7.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -25,11 +25,11 @@ Gem::Specification.new do |s|
25
25
  s.require_paths = ['lib']
26
26
  s.required_ruby_version = '~> 2.0'
27
27
 
28
- s.add_dependency('swot', '~> 1.0')
29
28
  s.add_dependency('iso_country_codes', '~> 0.6')
30
29
  s.add_dependency('naughty_or_nice', '~> 2.0')
31
30
  s.add_dependency('colored', '~> 1.2')
32
31
 
32
+ s.add_development_dependency('swot', '~> 1.0')
33
33
  s.add_development_dependency('rake', '~> 10.4')
34
34
  s.add_development_dependency('shoulda', '~> 3.5')
35
35
  s.add_development_dependency('rdoc', '~> 4.2')
@@ -1,38 +1,40 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+
1
3
  require 'naughty_or_nice'
2
4
  require 'swot'
3
5
  require 'iso_country_codes'
4
6
  require 'csv'
5
7
  require_relative 'gman/version'
6
8
  require_relative 'gman/country_codes'
7
- require_relative 'gman/locality'
8
9
  require_relative 'gman/identifier'
9
10
 
10
11
  class Gman
11
12
  include NaughtyOrNice
12
13
 
14
+ autoload :DomainList, 'gman/domain_list'
15
+ autoload :Importer, 'gman/importer'
16
+ autoload :Locality, 'gman/locality'
17
+
13
18
  class << self
14
- # returns an instance of our custom public suffix list
15
- # list behaves like PublicSuffix::List
16
- # but is limited to our whitelisted domains
17
19
  def list
18
- @list ||= PublicSuffix::List.parse(list_contents)
20
+ @list ||= DomainList.new(path: list_path)
21
+ end
22
+
23
+ def academic_list
24
+ @academic_list ||= DomainList.new(path: academic_list_path)
19
25
  end
20
26
 
21
27
  def config_path
22
- File.expand_path '../config', File.dirname(__FILE__)
28
+ @config_path ||= File.expand_path '../config', File.dirname(__FILE__)
23
29
  end
24
30
 
25
31
  # Returns the absolute path to the domain list
26
32
  def list_path
27
- if ENV['GMAN_STUB_DOMAINS']
28
- File.expand_path '../test/fixtures/domains.txt', File.dirname(__FILE__)
29
- else
30
- File.expand_path 'domains.txt', config_path
31
- end
33
+ File.expand_path 'domains.txt', config_path
32
34
  end
33
35
 
34
- def list_contents
35
- @list_contents ||= File.new(list_path, 'r:utf-8').read
36
+ def academic_list_path
37
+ File.expand_path 'vendor/academic.txt', config_path
36
38
  end
37
39
  end
38
40
 
@@ -47,21 +49,21 @@ class Gman
47
49
  end
48
50
  end
49
51
 
52
+ def locality?
53
+ Locality.valid?(domain)
54
+ end
55
+
50
56
  private
51
57
 
52
58
  def valid_domain?
53
- domain && domain.valid? && !academic?
59
+ @valid_domains ||= domain && domain.valid? && !academic?
54
60
  end
55
61
 
56
62
  def academic?
57
- domain && Swot.is_academic?(domain)
63
+ @academic ||= domain && Gman.academic_list.valid?(to_s)
58
64
  end
59
65
 
60
- # domain is on the domain list and
61
- # domain is not explicitly blacklisted and
62
- # domain matches a standard public suffix list rule
63
66
  def public_suffix_valid?
64
- rule = Gman.list.find(to_s)
65
- !rule.nil? && rule.type != :exception && rule.allow?(".#{domain}")
67
+ @public_suffix_valid ||= Gman.list.valid?(to_s)
66
68
  end
67
69
  end
@@ -1,39 +1,102 @@
1
1
  class Gman
2
2
  class DomainList
3
- attr_accessor :list
4
- alias to_h list
5
-
6
3
  COMMENT_REGEX = %r{//[/\s]*(.*)$}i
7
4
 
8
- def initialize(list)
9
- @list = list.reject { |_group, domains| domains.compact.empty? }
5
+ attr_writer :data, :path, :contents
6
+
7
+ class << self
8
+ # The current, government domain list
9
+ def current
10
+ DomainList.new(path: Gman.list_path)
11
+ end
12
+
13
+ def from_file(path)
14
+ DomainList.new(path: path)
15
+ end
16
+
17
+ def from_hash(hash)
18
+ DomainList.new(data: hash)
19
+ end
20
+
21
+ def from_public_suffix(string)
22
+ DomainList.new(contents: string)
23
+ end
24
+ alias from_string from_public_suffix
25
+ end
26
+
27
+ def initialize(path: nil, contents: nil, data: nil)
28
+ @path = path
29
+ @contents = contents
30
+ @data = data.reject { |_, domains| domains.compact.empty? } if data
31
+ end
32
+
33
+ # Returns the raw content of the domain list as a string
34
+ def contents
35
+ @contents ||= if path
36
+ File.new(path, 'r:utf-8').read
37
+ else
38
+ to_s
39
+ end
40
+ end
41
+
42
+ # Returns the parsed contents of the domain list as a hash
43
+ # in the form for group => domains
44
+ def data
45
+ @data ||= string_to_hash(contents)
46
+ end
47
+ alias to_h data
48
+
49
+ # Returns the path to the domain list on disk
50
+ def path
51
+ @path ||= Gman.list_path
52
+ end
53
+
54
+ # returns an instance of our custom public suffix list
55
+ # list behaves like PublicSuffix::List
56
+ # but is limited to our whitelisted domains
57
+ def public_suffix_list
58
+ @public_suffix_list ||= PublicSuffix::List.parse(contents)
59
+ end
60
+
61
+ # domain is on the domain list and
62
+ # domain is not explicitly blacklisted and
63
+ # domain matches a standard public suffix list rule
64
+ def valid?(domain)
65
+ rule = public_suffix_list.find(domain)
66
+ !rule.nil? && rule.type != :exception && rule.allow?(".#{domain}")
10
67
  end
11
68
 
69
+ # Returns an array of strings representing the list groups
12
70
  def groups
13
- list.keys
71
+ data.keys
14
72
  end
15
73
 
74
+ # Return an array of strings representing all domains on the list
16
75
  def domains
17
- list.values.flatten.compact.sort.uniq
76
+ data.values.flatten.compact.sort.uniq
18
77
  end
19
78
 
79
+ # Return the total number of domains in the list
20
80
  def count
21
81
  domains.count
22
82
  end
23
83
 
84
+ # Alphabetize groups and domains within each group
24
85
  def alphabetize
25
- @list = @list.sort_by { |k, _v| k.downcase }.to_h
26
- @list.each { |_group, domains| domains.sort!.uniq! }
86
+ @data = data.sort_by { |k, _v| k.downcase }.to_h
87
+ @data.each { |_group, domains| domains.sort!.uniq! }
27
88
  end
28
89
 
90
+ # Write the domain list to disk
29
91
  def write
30
92
  alphabetize
31
- File.write(Gman.list_path, to_public_suffix)
93
+ File.write(path, to_public_suffix)
32
94
  end
33
95
 
34
- def to_public_suffix
96
+ # The string representation of the domain list, in public suffix format
97
+ def to_s
35
98
  current_group = output = ''
36
- list.sort_by { |group, _domains| group.downcase }.each do |group, domains|
99
+ data.sort_by { |group, _| group.downcase }.each do |group, domains|
37
100
  if group != current_group
38
101
  output << "\n\n" unless current_group.empty? # first entry
39
102
  output << "// #{group}\n"
@@ -43,45 +106,48 @@ class Gman
43
106
  end
44
107
  output
45
108
  end
109
+ alias to_public_suffix to_s
46
110
 
47
- def self.current
48
- current = File.open(Gman.list_path).read
49
- DomainList.from_public_suffix(current)
111
+ # Given a domain, find any domain on the list that includes that domain
112
+ # E.g., `fcc.gov` would be the parent of `data.fcc.gov`
113
+ def parent_domain(domain)
114
+ domains.find { |c| domain =~ /\.#{Regexp.escape(c)}$/ }
50
115
  end
51
116
 
52
- def self.from_public_suffix(string)
53
- string = string.gsub(/\r\n?/, "\n").split("\n")
54
- hash = array_to_hash(string)
55
- DomainList.new(hash)
117
+ private
118
+
119
+ # Parse a public-suffix formatted string into a hash of groups => [domains]
120
+ def string_to_hash(string)
121
+ return unless string
122
+ lines = string_to_array(string)
123
+ array_to_hash(lines)
56
124
  end
57
125
 
58
- def parent_domain(domain)
59
- domains.find { |c| domain =~ /\.#{Regexp.escape(c)}$/ }
126
+ def string_to_array(string)
127
+ string.gsub(/\r\n?/, "\n").split("\n")
60
128
  end
61
129
 
62
- class << self
63
- private
64
-
65
- # Given an array of comments/domains in public suffix format
66
- # Converts to a hash in the form of :group => [domain1, domain2...]
67
- def array_to_hash(domains)
68
- domain_hash = {}
69
- group = ''
70
- domains.each do |line|
71
- if line =~ COMMENT_REGEX
72
- group = COMMENT_REGEX.match(line)[1]
73
- else
74
- safe_push(domain_hash, group, line.downcase)
75
- end
130
+ def array_to_hash(lines)
131
+ domain_hash = {}
132
+ group = ''
133
+ lines.each do |line|
134
+ if line =~ COMMENT_REGEX
135
+ group = COMMENT_REGEX.match(line)[1]
136
+ else
137
+ safe_push(domain_hash, group, line.downcase)
76
138
  end
77
- domain_hash
78
139
  end
140
+ domain_hash
141
+ end
79
142
 
80
- def safe_push(hash, key, value)
81
- return if value.empty?
82
- hash[key] ||= []
83
- hash[key].push value
84
- end
143
+ # Add a value to an array in a hash, creating the array if necessary
144
+ # hash - the hash
145
+ # key - the key within that hash to add the value to
146
+ # value - the single value to push into the array at hash[key]
147
+ def safe_push(hash, key, value)
148
+ return if value.empty?
149
+ hash[key] ||= []
150
+ hash[key].push value
85
151
  end
86
152
  end
87
153
  end
@@ -74,10 +74,10 @@ class Gman
74
74
 
75
75
  def list_category
76
76
  @list_category ||= begin
77
- match = Gman.list.find(domain.to_s)
77
+ match = Gman.list.public_suffix_list.find(domain.to_s)
78
78
  return unless match
79
79
  regex = %r{// ([^\n]+)\n?[^/]*\n#{Regexp.escape(match.name)}\n}im
80
- matches = Gman.list_contents.match(regex)
80
+ matches = Gman.list.contents.match(regex)
81
81
  matches[1] if matches
82
82
  end
83
83
  end
@@ -9,7 +9,7 @@ require_relative './domain_list'
9
9
 
10
10
  class Gman
11
11
  class Importer
12
- attr_accessor :domains
12
+ attr_accessor :domain_list
13
13
 
14
14
  # Known false positives from vendored lists
15
15
  BLACKLIST = %w(
@@ -58,7 +58,7 @@ class Gman
58
58
  }.freeze
59
59
 
60
60
  def initialize(domains)
61
- @domains = DomainList.new(domains)
61
+ @domain_list = DomainList.new(data: domains)
62
62
  end
63
63
 
64
64
  def logger
@@ -71,8 +71,8 @@ class Gman
71
71
  end
72
72
 
73
73
  def valid_domain?(domain, options = {})
74
- return false unless ensure_valid(domain)
75
74
  return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
75
+ return false unless ensure_valid(domain)
76
76
  return false if !options[:skip_resolve] && !ensure_resolves(domain)
77
77
  true
78
78
  end
@@ -89,18 +89,13 @@ class Gman
89
89
  @current ||= DomainList.current
90
90
  end
91
91
 
92
- def import(options)
92
+ def import(options = {})
93
93
  logger.info "Current: #{Gman::DomainList.current.count} domains"
94
- logger.info "Adding: #{domains.count} domains"
94
+ logger.info "Adding: #{domain_list.count} domains"
95
95
 
96
96
  normalize_domains!
97
97
  ensure_validity!(options)
98
98
 
99
- if domains.count == 0
100
- logger.info 'Nothing to add. Aborting'
101
- exit 0
102
- end
103
-
104
99
  add_to_current
105
100
  logger.info "New: #{current.count} domains"
106
101
  end
@@ -158,22 +153,22 @@ class Gman
158
153
  end
159
154
 
160
155
  def normalize_domains!
161
- domains.list.each do |_group, domains|
156
+ domain_list.to_h.each do |_group, domains|
162
157
  domains.map! { |domain| normalize_domain(domain) }
163
158
  domains.uniq!
164
159
  end
165
160
  end
166
161
 
167
162
  def ensure_validity!(options = {})
168
- domains.list.each do |_group, domains|
163
+ domain_list.data.each do |_group, domains|
169
164
  domains.select! { |domain| valid_domain?(domain, options) }
170
165
  end
171
166
  end
172
167
 
173
168
  def add_to_current
174
- domains.list.each do |group, domains|
175
- current.list[group] ||= []
176
- current.list[group].concat domains
169
+ domain_list.data.each do |group, domains|
170
+ current.data[group] ||= []
171
+ current.data[group].concat domains
177
172
  end
178
173
  current.write
179
174
  end
@@ -192,9 +187,3 @@ class Gman
192
187
  end
193
188
  end
194
189
  end
195
-
196
- class Gman
197
- def self.import(hash, options = {})
198
- Gman::Importer.new(hash).import(options)
199
- end
200
- end
@@ -1,4 +1,16 @@
1
1
  class Gman
2
+ # Second level .us domains for states and locality
3
+ # See http://en.wikipedia.org/wiki/.us
4
+ #
5
+ # Examples:
6
+ # * foo.state.il.us
7
+ # * ci.foo.il.us
8
+ #
9
+ # Not:
10
+ # * state.foo.il.us
11
+ # * foo.ci.il.us
12
+ # * k12.il.us
13
+ # * ci.foo.zx.us
2
14
  class Locality
3
15
  AFFINITY_NAMESPACES = %w(state dst cog).freeze
4
16
 
@@ -20,21 +32,9 @@ class Gman
20
32
  (#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
21
33
  )\.(#{Regexp.union(STATES)})\.us
22
34
  /x
23
- end
24
35
 
25
- # Second level .us domains for states and locality
26
- # See http://en.wikipedia.org/wiki/.us
27
- #
28
- # Examples:
29
- # * foo.state.il.us
30
- # * ci.foo.il.us
31
- #
32
- # Not:
33
- # * state.foo.il.us
34
- # * foo.ci.il.us
35
- # * k12.il.us
36
- # * ci.foo.zx.us
37
- def locality?
38
- !domain.to_s.match(Locality::REGEX).nil?
36
+ def self.valid?(domain)
37
+ !domain.to_s.match(Locality::REGEX).nil?
38
+ end
39
39
  end
40
40
  end
@@ -1,3 +1,3 @@
1
1
  class Gman
2
- VERSION = '6.0.1'.freeze
2
+ VERSION = '7.0.0'.freeze
3
3
  end
@@ -6,7 +6,7 @@ for file in script/vendor-*; do
6
6
  echo "*************************************"
7
7
  echo "Vendoring $file"
8
8
  echo "*************************************"
9
- "$file"
9
+ bundle exec "$file"
10
10
  fi
11
11
  done
12
12
 
@@ -3,7 +3,6 @@
3
3
  require 'csv'
4
4
  require 'open-uri'
5
5
  require './lib/gman'
6
- require './lib/gman/importer'
7
6
 
8
7
  url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
9
8
 
@@ -11,4 +10,4 @@ domains = open(url).read.encode('UTF-8')
11
10
  domains = CSV.parse(domains, headers: true)
12
11
  domains = domains.map { |row| row['Domain Name'] }
13
12
 
14
- Gman.import('German Federal' => domains)
13
+ Gman::Importer.new('German Federal' => domains).import