gman 6.0.1 → 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,11 +25,11 @@ Gem::Specification.new do |s|
25
25
  s.require_paths = ['lib']
26
26
  s.required_ruby_version = '~> 2.0'
27
27
 
28
- s.add_dependency('swot', '~> 1.0')
29
28
  s.add_dependency('iso_country_codes', '~> 0.6')
30
29
  s.add_dependency('naughty_or_nice', '~> 2.0')
31
30
  s.add_dependency('colored', '~> 1.2')
32
31
 
32
+ s.add_development_dependency('swot', '~> 1.0')
33
33
  s.add_development_dependency('rake', '~> 10.4')
34
34
  s.add_development_dependency('shoulda', '~> 3.5')
35
35
  s.add_development_dependency('rdoc', '~> 4.2')
@@ -1,38 +1,40 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+
1
3
  require 'naughty_or_nice'
2
4
  require 'swot'
3
5
  require 'iso_country_codes'
4
6
  require 'csv'
5
7
  require_relative 'gman/version'
6
8
  require_relative 'gman/country_codes'
7
- require_relative 'gman/locality'
8
9
  require_relative 'gman/identifier'
9
10
 
10
11
  class Gman
11
12
  include NaughtyOrNice
12
13
 
14
+ autoload :DomainList, 'gman/domain_list'
15
+ autoload :Importer, 'gman/importer'
16
+ autoload :Locality, 'gman/locality'
17
+
13
18
  class << self
14
- # returns an instance of our custom public suffix list
15
- # list behaves like PublicSuffix::List
16
- # but is limited to our whitelisted domains
17
19
  def list
18
- @list ||= PublicSuffix::List.parse(list_contents)
20
+ @list ||= DomainList.new(path: list_path)
21
+ end
22
+
23
+ def academic_list
24
+ @academic_list ||= DomainList.new(path: academic_list_path)
19
25
  end
20
26
 
21
27
  def config_path
22
- File.expand_path '../config', File.dirname(__FILE__)
28
+ @config_path ||= File.expand_path '../config', File.dirname(__FILE__)
23
29
  end
24
30
 
25
31
  # Returns the absolute path to the domain list
26
32
  def list_path
27
- if ENV['GMAN_STUB_DOMAINS']
28
- File.expand_path '../test/fixtures/domains.txt', File.dirname(__FILE__)
29
- else
30
- File.expand_path 'domains.txt', config_path
31
- end
33
+ File.expand_path 'domains.txt', config_path
32
34
  end
33
35
 
34
- def list_contents
35
- @list_contents ||= File.new(list_path, 'r:utf-8').read
36
+ def academic_list_path
37
+ File.expand_path 'vendor/academic.txt', config_path
36
38
  end
37
39
  end
38
40
 
@@ -47,21 +49,21 @@ class Gman
47
49
  end
48
50
  end
49
51
 
52
+ def locality?
53
+ Locality.valid?(domain)
54
+ end
55
+
50
56
  private
51
57
 
52
58
  def valid_domain?
53
- domain && domain.valid? && !academic?
59
+ @valid_domains ||= domain && domain.valid? && !academic?
54
60
  end
55
61
 
56
62
  def academic?
57
- domain && Swot.is_academic?(domain)
63
+ @academic ||= domain && Gman.academic_list.valid?(to_s)
58
64
  end
59
65
 
60
- # domain is on the domain list and
61
- # domain is not explicitly blacklisted and
62
- # domain matches a standard public suffix list rule
63
66
  def public_suffix_valid?
64
- rule = Gman.list.find(to_s)
65
- !rule.nil? && rule.type != :exception && rule.allow?(".#{domain}")
67
+ @public_suffix_valid ||= Gman.list.valid?(to_s)
66
68
  end
67
69
  end
@@ -1,39 +1,102 @@
1
1
  class Gman
2
2
  class DomainList
3
- attr_accessor :list
4
- alias to_h list
5
-
6
3
  COMMENT_REGEX = %r{//[/\s]*(.*)$}i
7
4
 
8
- def initialize(list)
9
- @list = list.reject { |_group, domains| domains.compact.empty? }
5
+ attr_writer :data, :path, :contents
6
+
7
+ class << self
8
+ # The current, government domain list
9
+ def current
10
+ DomainList.new(path: Gman.list_path)
11
+ end
12
+
13
+ def from_file(path)
14
+ DomainList.new(path: path)
15
+ end
16
+
17
+ def from_hash(hash)
18
+ DomainList.new(data: hash)
19
+ end
20
+
21
+ def from_public_suffix(string)
22
+ DomainList.new(contents: string)
23
+ end
24
+ alias from_string from_public_suffix
25
+ end
26
+
27
+ def initialize(path: nil, contents: nil, data: nil)
28
+ @path = path
29
+ @contents = contents
30
+ @data = data.reject { |_, domains| domains.compact.empty? } if data
31
+ end
32
+
33
+ # Returns the raw content of the domain list as a string
34
+ def contents
35
+ @contents ||= if path
36
+ File.new(path, 'r:utf-8').read
37
+ else
38
+ to_s
39
+ end
40
+ end
41
+
42
+ # Returns the parsed contents of the domain list as a hash
43
+ # in the form for group => domains
44
+ def data
45
+ @data ||= string_to_hash(contents)
46
+ end
47
+ alias to_h data
48
+
49
+ # Returns the path to the domain list on disk
50
+ def path
51
+ @path ||= Gman.list_path
52
+ end
53
+
54
+ # returns an instance of our custom public suffix list
55
+ # list behaves like PublicSuffix::List
56
+ # but is limited to our whitelisted domains
57
+ def public_suffix_list
58
+ @public_suffix_list ||= PublicSuffix::List.parse(contents)
59
+ end
60
+
61
+ # domain is on the domain list and
62
+ # domain is not explicitly blacklisted and
63
+ # domain matches a standard public suffix list rule
64
+ def valid?(domain)
65
+ rule = public_suffix_list.find(domain)
66
+ !rule.nil? && rule.type != :exception && rule.allow?(".#{domain}")
10
67
  end
11
68
 
69
+ # Returns an array of strings representing the list groups
12
70
  def groups
13
- list.keys
71
+ data.keys
14
72
  end
15
73
 
74
+ # Return an array of strings representing all domains on the list
16
75
  def domains
17
- list.values.flatten.compact.sort.uniq
76
+ data.values.flatten.compact.sort.uniq
18
77
  end
19
78
 
79
+ # Return the total number of domains in the list
20
80
  def count
21
81
  domains.count
22
82
  end
23
83
 
84
+ # Alphabetize groups and domains within each group
24
85
  def alphabetize
25
- @list = @list.sort_by { |k, _v| k.downcase }.to_h
26
- @list.each { |_group, domains| domains.sort!.uniq! }
86
+ @data = data.sort_by { |k, _v| k.downcase }.to_h
87
+ @data.each { |_group, domains| domains.sort!.uniq! }
27
88
  end
28
89
 
90
+ # Write the domain list to disk
29
91
  def write
30
92
  alphabetize
31
- File.write(Gman.list_path, to_public_suffix)
93
+ File.write(path, to_public_suffix)
32
94
  end
33
95
 
34
- def to_public_suffix
96
+ # The string representation of the domain list, in public suffix format
97
+ def to_s
35
98
  current_group = output = ''
36
- list.sort_by { |group, _domains| group.downcase }.each do |group, domains|
99
+ data.sort_by { |group, _| group.downcase }.each do |group, domains|
37
100
  if group != current_group
38
101
  output << "\n\n" unless current_group.empty? # first entry
39
102
  output << "// #{group}\n"
@@ -43,45 +106,48 @@ class Gman
43
106
  end
44
107
  output
45
108
  end
109
+ alias to_public_suffix to_s
46
110
 
47
- def self.current
48
- current = File.open(Gman.list_path).read
49
- DomainList.from_public_suffix(current)
111
+ # Given a domain, find any domain on the list that includes that domain
112
+ # E.g., `fcc.gov` would be the parent of `data.fcc.gov`
113
+ def parent_domain(domain)
114
+ domains.find { |c| domain =~ /\.#{Regexp.escape(c)}$/ }
50
115
  end
51
116
 
52
- def self.from_public_suffix(string)
53
- string = string.gsub(/\r\n?/, "\n").split("\n")
54
- hash = array_to_hash(string)
55
- DomainList.new(hash)
117
+ private
118
+
119
+ # Parse a public-suffix formatted string into a hash of groups => [domains]
120
+ def string_to_hash(string)
121
+ return unless string
122
+ lines = string_to_array(string)
123
+ array_to_hash(lines)
56
124
  end
57
125
 
58
- def parent_domain(domain)
59
- domains.find { |c| domain =~ /\.#{Regexp.escape(c)}$/ }
126
+ def string_to_array(string)
127
+ string.gsub(/\r\n?/, "\n").split("\n")
60
128
  end
61
129
 
62
- class << self
63
- private
64
-
65
- # Given an array of comments/domains in public suffix format
66
- # Converts to a hash in the form of :group => [domain1, domain2...]
67
- def array_to_hash(domains)
68
- domain_hash = {}
69
- group = ''
70
- domains.each do |line|
71
- if line =~ COMMENT_REGEX
72
- group = COMMENT_REGEX.match(line)[1]
73
- else
74
- safe_push(domain_hash, group, line.downcase)
75
- end
130
+ def array_to_hash(lines)
131
+ domain_hash = {}
132
+ group = ''
133
+ lines.each do |line|
134
+ if line =~ COMMENT_REGEX
135
+ group = COMMENT_REGEX.match(line)[1]
136
+ else
137
+ safe_push(domain_hash, group, line.downcase)
76
138
  end
77
- domain_hash
78
139
  end
140
+ domain_hash
141
+ end
79
142
 
80
- def safe_push(hash, key, value)
81
- return if value.empty?
82
- hash[key] ||= []
83
- hash[key].push value
84
- end
143
+ # Add a value to an array in a hash, creating the array if necessary
144
+ # hash - the hash
145
+ # key - the key within that hash to add the value to
146
+ # value - the single value to push into the array at hash[key]
147
+ def safe_push(hash, key, value)
148
+ return if value.empty?
149
+ hash[key] ||= []
150
+ hash[key].push value
85
151
  end
86
152
  end
87
153
  end
@@ -74,10 +74,10 @@ class Gman
74
74
 
75
75
  def list_category
76
76
  @list_category ||= begin
77
- match = Gman.list.find(domain.to_s)
77
+ match = Gman.list.public_suffix_list.find(domain.to_s)
78
78
  return unless match
79
79
  regex = %r{// ([^\n]+)\n?[^/]*\n#{Regexp.escape(match.name)}\n}im
80
- matches = Gman.list_contents.match(regex)
80
+ matches = Gman.list.contents.match(regex)
81
81
  matches[1] if matches
82
82
  end
83
83
  end
@@ -9,7 +9,7 @@ require_relative './domain_list'
9
9
 
10
10
  class Gman
11
11
  class Importer
12
- attr_accessor :domains
12
+ attr_accessor :domain_list
13
13
 
14
14
  # Known false positives from vendored lists
15
15
  BLACKLIST = %w(
@@ -58,7 +58,7 @@ class Gman
58
58
  }.freeze
59
59
 
60
60
  def initialize(domains)
61
- @domains = DomainList.new(domains)
61
+ @domain_list = DomainList.new(data: domains)
62
62
  end
63
63
 
64
64
  def logger
@@ -71,8 +71,8 @@ class Gman
71
71
  end
72
72
 
73
73
  def valid_domain?(domain, options = {})
74
- return false unless ensure_valid(domain)
75
74
  return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
75
+ return false unless ensure_valid(domain)
76
76
  return false if !options[:skip_resolve] && !ensure_resolves(domain)
77
77
  true
78
78
  end
@@ -89,18 +89,13 @@ class Gman
89
89
  @current ||= DomainList.current
90
90
  end
91
91
 
92
- def import(options)
92
+ def import(options = {})
93
93
  logger.info "Current: #{Gman::DomainList.current.count} domains"
94
- logger.info "Adding: #{domains.count} domains"
94
+ logger.info "Adding: #{domain_list.count} domains"
95
95
 
96
96
  normalize_domains!
97
97
  ensure_validity!(options)
98
98
 
99
- if domains.count == 0
100
- logger.info 'Nothing to add. Aborting'
101
- exit 0
102
- end
103
-
104
99
  add_to_current
105
100
  logger.info "New: #{current.count} domains"
106
101
  end
@@ -158,22 +153,22 @@ class Gman
158
153
  end
159
154
 
160
155
  def normalize_domains!
161
- domains.list.each do |_group, domains|
156
+ domain_list.to_h.each do |_group, domains|
162
157
  domains.map! { |domain| normalize_domain(domain) }
163
158
  domains.uniq!
164
159
  end
165
160
  end
166
161
 
167
162
  def ensure_validity!(options = {})
168
- domains.list.each do |_group, domains|
163
+ domain_list.data.each do |_group, domains|
169
164
  domains.select! { |domain| valid_domain?(domain, options) }
170
165
  end
171
166
  end
172
167
 
173
168
  def add_to_current
174
- domains.list.each do |group, domains|
175
- current.list[group] ||= []
176
- current.list[group].concat domains
169
+ domain_list.data.each do |group, domains|
170
+ current.data[group] ||= []
171
+ current.data[group].concat domains
177
172
  end
178
173
  current.write
179
174
  end
@@ -192,9 +187,3 @@ class Gman
192
187
  end
193
188
  end
194
189
  end
195
-
196
- class Gman
197
- def self.import(hash, options = {})
198
- Gman::Importer.new(hash).import(options)
199
- end
200
- end
@@ -1,4 +1,16 @@
1
1
  class Gman
2
+ # Second level .us domains for states and locality
3
+ # See http://en.wikipedia.org/wiki/.us
4
+ #
5
+ # Examples:
6
+ # * foo.state.il.us
7
+ # * ci.foo.il.us
8
+ #
9
+ # Not:
10
+ # * state.foo.il.us
11
+ # * foo.ci.il.us
12
+ # * k12.il.us
13
+ # * ci.foo.zx.us
2
14
  class Locality
3
15
  AFFINITY_NAMESPACES = %w(state dst cog).freeze
4
16
 
@@ -20,21 +32,9 @@ class Gman
20
32
  (#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
21
33
  )\.(#{Regexp.union(STATES)})\.us
22
34
  /x
23
- end
24
35
 
25
- # Second level .us domains for states and locality
26
- # See http://en.wikipedia.org/wiki/.us
27
- #
28
- # Examples:
29
- # * foo.state.il.us
30
- # * ci.foo.il.us
31
- #
32
- # Not:
33
- # * state.foo.il.us
34
- # * foo.ci.il.us
35
- # * k12.il.us
36
- # * ci.foo.zx.us
37
- def locality?
38
- !domain.to_s.match(Locality::REGEX).nil?
36
+ def self.valid?(domain)
37
+ !domain.to_s.match(Locality::REGEX).nil?
38
+ end
39
39
  end
40
40
  end
@@ -1,3 +1,3 @@
1
1
  class Gman
2
- VERSION = '6.0.1'.freeze
2
+ VERSION = '7.0.0'.freeze
3
3
  end
@@ -6,7 +6,7 @@ for file in script/vendor-*; do
6
6
  echo "*************************************"
7
7
  echo "Vendoring $file"
8
8
  echo "*************************************"
9
- "$file"
9
+ bundle exec "$file"
10
10
  fi
11
11
  done
12
12
 
@@ -3,7 +3,6 @@
3
3
  require 'csv'
4
4
  require 'open-uri'
5
5
  require './lib/gman'
6
- require './lib/gman/importer'
7
6
 
8
7
  url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
9
8
 
@@ -11,4 +10,4 @@ domains = open(url).read.encode('UTF-8')
11
10
  domains = CSV.parse(domains, headers: true)
12
11
  domains = domains.map { |row| row['Domain Name'] }
13
12
 
14
- Gman.import('German Federal' => domains)
13
+ Gman::Importer.new('German Federal' => domains).import