gman 4.6.5 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/script/vendor-us CHANGED
@@ -3,9 +3,6 @@
3
3
  # Vendors the USA.gov-maintained list of US domains into domains.txt
4
4
  # Source: https://github.com/GSA-OCSIT/govt-urls
5
5
  #
6
- # Normalizes and cleans inputs, validates domains, rejects academic domains, and
7
- # sorts, ensures uniqueness, and merges into the existing lib/domains.txt list
8
- #
9
6
  # Usage: script/vendor-us
10
7
  #
11
8
  # Will automatically fetch latest version of the list and merge
@@ -13,70 +10,13 @@
13
10
  #
14
11
  # It's also probably a good idea to run `script/ci-build` for good measure
15
12
 
16
- require 'rubygems'
17
- require 'public_suffix'
18
- require 'swot'
19
- require 'yaml'
20
- require 'open-uri'
21
- require './lib/gman'
22
- require './lib/gman/parser'
23
-
24
- SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
25
- BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"]
26
- domain_hash = {}
27
-
28
- domain_hash = YAML.load(open(SOURCE).read)
29
- puts "found #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains..."
30
-
31
- # Normalize ALL THE THINGS
32
- domain_hash.each do |group, domains|
33
- domains.map! { |domain| domain.strip } # Strip leading/trailing whitespace
34
- domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes
35
- domains.map! { |domain| domain.downcase } # make lower case
36
- domains.reject! { |domain| domain.empty? } # Reject empty strings
37
- end
38
-
39
- # filter
40
- domain_hash.reject! { |group,domain| BLACKLIST.include?(group) } # Group blacklist
41
- domain_hash.each do |group, domains|
42
- puts "Filtering #{group}..."
43
- domains.reject! { |domain| domain.match /\// } # Reject URLs
44
- domains.select! { |domain| PublicSuffix.valid?(domain) } # Validate domain
45
- domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains
46
- end
47
- puts "Filtered down to #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains"
48
-
49
- # Grab existing list
50
- current = Gman::Parser.file_to_array( Gman::list_path )
51
- current_hash = Gman::Parser.array_to_hash(current)
52
- puts "Current list contains #{current.size} domains... merging"
53
-
54
- # Lazy deep merge
55
- domain_hash.each do |group,domains|
56
- current_hash[group] = [] if current_hash[group].nil?
57
- current_hash[group].concat domains
58
- current_hash[group].sort! # Alphabetize
59
- current_hash[group].uniq! # Ensure uniqueness
60
- end
61
-
62
- # Sort by group
63
- current_hash = current_hash.sort_by { |group, domains| group.downcase }
64
-
65
- # PublicSuffix Formatted Output
66
- current_group = ""
67
- output = ""
68
- current_hash.each do |group, domains|
69
- if group != current_group
70
- output << "\n\n" unless current_group.empty? # first entry
71
- output << "// #{group}\n"
72
- current_group = group
73
- end
74
- output << domains.join("\n")
75
- end
13
+ require './lib/gman/importer'
76
14
 
77
- puts "merged. Writing..."
15
+ source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
16
+ blacklist = ["usagovQUASI", "usagovFED", "usagovPW"]
78
17
 
79
- File.open(Gman.list_path, "w") { |file| file.write output }
18
+ data = open(source).read
19
+ domains = YAML.load(data)
20
+ domains.reject! { |group,domain| blacklist.include?(group) }
80
21
 
81
- result = Gman::Parser.file_to_array( Gman::list_path )
82
- puts "New list contains #{result.size} domains. Fin."
22
+ Gman.import(domains)
data/test/helper.rb CHANGED
@@ -16,10 +16,12 @@ require 'shoulda'
16
16
 
17
17
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
18
18
  $LOAD_PATH.unshift(File.dirname(__FILE__))
19
- require 'gman'
19
+ require_relative "../lib/gman"
20
+ require_relative "../lib/gman/domain_list"
21
+
20
22
  require 'net/dns'
21
23
  require 'net/dns/resolver'
22
- require './lib/gman/parser'
24
+ require './lib/gman/importer'
23
25
 
24
26
  def test_bin(*args)
25
27
  output, status = Open3.capture2e("bundle", "exec", "gman", *args)
@@ -2,9 +2,9 @@ require File.join(File.dirname(__FILE__), 'helper')
2
2
 
3
3
  class TestGmanCountryCodes < Minitest::Test
4
4
  should "determine a domain's country" do
5
- assert_equal "United States", Gman.new("whitehouse.gov").country.name
6
- assert_equal "United States", Gman.new("army.mil").country.name
7
- assert_equal "United Kingdom", Gman.new("foo.gov.uk").country.name
5
+ assert_equal "United States of America", Gman.new("whitehouse.gov").country.name
6
+ assert_equal "United States of America", Gman.new("army.mil").country.name
7
+ assert_equal "United Kingdom of Great Britain and Northern Ireland", Gman.new("foo.gov.uk").country.name
8
8
  assert_equal "Canada", Gman.new("foo.gc.ca").country.name
9
9
  end
10
10
  end
@@ -0,0 +1,28 @@
1
+ require File.join(File.dirname(__FILE__), 'helper')
2
+
3
+ class TestDomains < Minitest::Test
4
+
5
+ WHITELIST = [ "non-us gov", "non-us mil", "US Federal"]
6
+
7
+ def resolve_domains?
8
+ ENV["GMAN_RESOLVE_DOMAINS"] == "true"
9
+ end
10
+
11
+ should "only contains valid domains" do
12
+ importer = Gman::Importer.new({})
13
+ if resolve_domains?
14
+ importer.logger.info "Validating that all domains resolve. This may take a while..."
15
+ else
16
+ importer.logger.info "Skipping domain resolution. Run `GMAN_RESOLVE_DOMAINS=true rake test` to validate that domains resolve."
17
+ end
18
+
19
+ invalid = []
20
+ Parallel.each(Gman::DomainList.current.list, :in_threads => 2) do |group, domains|
21
+ next if WHITELIST.include?(group)
22
+ invalid.push domains.reject { |domain|
23
+ importer.valid_domain?(domain, :skip_dupe => true, :skip_resolve => !resolve_domains?)
24
+ }
25
+ end
26
+ assert_equal [], invalid.flatten.reject { |e| e.empty? }
27
+ end
28
+ end
@@ -98,7 +98,6 @@ class TestGmanIdentifier < Minitest::Test
98
98
  end
99
99
 
100
100
  should "detect the state" do
101
- assert_equal "PR", Gman.new("sanjuan.pr").state
102
101
  assert_equal "OR", Gman.new("ashland.or.us").state
103
102
  refute Gman.new("canada.ca").state
104
103
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gman
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.6.5
4
+ version: 4.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-15 00:00:00.000000000 Z
11
+ date: 2015-06-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: swot
@@ -188,29 +188,30 @@ files:
188
188
  - gman.gemspec
189
189
  - lib/gman.rb
190
190
  - lib/gman/country_codes.rb
191
+ - lib/gman/domain_list.rb
191
192
  - lib/gman/identifier.rb
193
+ - lib/gman/importer.rb
192
194
  - lib/gman/locality.rb
193
- - lib/gman/parser.rb
194
195
  - lib/gman/sanctions.rb
195
196
  - script/alphabetize
196
- - script/build
197
197
  - script/cibuild
198
198
  - script/console
199
199
  - script/dedupe
200
200
  - script/prune
201
201
  - script/release
202
- - script/state-domains
203
- - script/vendor-de
202
+ - script/vendor-federal-de
204
203
  - script/vendor-gov-list
204
+ - script/vendor-municipal-de
205
205
  - script/vendor-nl
206
+ - script/vendor-public-suffix
206
207
  - script/vendor-se
207
208
  - script/vendor-us
208
209
  - test/helper.rb
209
210
  - test/obama.txt
210
- - test/test_domains.rb
211
211
  - test/test_gman.rb
212
212
  - test/test_gman_bin.rb
213
213
  - test/test_gman_country_codes.rb
214
+ - test/test_gman_domains.rb
214
215
  - test/test_gman_filter.rb
215
216
  - test/test_gman_identifier.rb
216
217
  - test/test_gman_locality.rb
@@ -242,10 +243,10 @@ summary: Check if a given domain or email address belong to a governemnt entity
242
243
  test_files:
243
244
  - test/helper.rb
244
245
  - test/obama.txt
245
- - test/test_domains.rb
246
246
  - test/test_gman.rb
247
247
  - test/test_gman_bin.rb
248
248
  - test/test_gman_country_codes.rb
249
+ - test/test_gman_domains.rb
249
250
  - test/test_gman_filter.rb
250
251
  - test/test_gman_identifier.rb
251
252
  - test/test_gman_locality.rb
data/lib/gman/parser.rb DELETED
@@ -1,59 +0,0 @@
1
- # Utility functions for parsing and manipulating public-suffix formatted domain lists
2
- require 'net/dns'
3
- require 'net/dns/resolver'
4
-
5
- class Gman < NaughtyOrNice
6
- class Parser
7
-
8
- COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i
9
-
10
- class << self
11
-
12
- # Given a public-suffix list formatted file
13
- # Converts to a hash in the form of :group => [domain1, domain2...]
14
- def file_to_hash(file)
15
- array_to_hash(file_to_array(file))
16
- end
17
-
18
- # Given a public-suffix list formatted file
19
- # Convert it into an array of comments and domains representing each line
20
- def file_to_array(file)
21
- domains = File.open(file).read
22
- domains.gsub! /\r\n?/, "\n" # Normalize line endings
23
- domains = domains.split("\n")
24
- end
25
-
26
- # Given an array of comments/domains in public suffix format
27
- # Converts to a hash in the form of :group => [domain1, domain2...]
28
- def array_to_hash(domains)
29
- group = ""
30
- domain_hash = {}
31
- domains.each do |line|
32
- next if line.empty?
33
- if match = COMMENT_REGEX.match(line)
34
- group = match[1]
35
- else
36
- domain_hash[group] = [] if domain_hash[group].nil?
37
- domain_hash[group].push line.downcase
38
- end
39
- end
40
- domain_hash
41
- end
42
-
43
- def resolver
44
- @resolver ||= begin
45
- resolver = Net::DNS::Resolver.new
46
- resolver.nameservers = ["8.8.8.8","8.8.4.4", "208.67.222.222", "208.67.220.220"]
47
- resolver
48
- end
49
- end
50
-
51
- # Verifies that the given domain resolves, i.e. a default (A), NS, or MX lookup returns at least one answer
52
- def domain_resolves?(domain)
53
- resolver.search(domain).header.anCount > 0 ||
54
- resolver.search(domain, Net::DNS::NS).header.anCount > 0 ||
55
- resolver.search(domain, Net::DNS::MX).header.anCount > 0
56
- end
57
- end
58
- end
59
- end
data/script/state-domains DELETED
@@ -1,38 +0,0 @@
1
- #! /usr/bin/env ruby
2
- # Strips domains in the form of e.g., city.<locality>.<state>.us from the domain list
3
-
4
- require './lib/gman'
5
- require './lib/gman/parser'
6
-
7
- current = Gman::Parser.file_to_array( Gman::list_path )
8
- domain_hash = Gman::Parser.array_to_hash(current)
9
-
10
- puts "Checking for state gov regex'd domains in the list..."
11
- puts "Starting with #{current.size} domains..."
12
-
13
- domain_hash.each do |group, domains|
14
- next unless group =~ /usagov[A-Z]{2}/
15
- state = group[-2,2].downcase
16
- domain_hash[group].reject! { |d| d =~ Gman::LOCALITY_REGEX }
17
- domain_hash[group].uniq!
18
- domain_hash[group].sort!
19
- end
20
-
21
- # PublicSuffix Formatted Output
22
- current_group = ""
23
- output = ""
24
- domain_hash.each do |group, domains|
25
- if group != current_group
26
- output << "\n\n" unless current_group.empty? # first entry
27
- output << "// #{group}\n"
28
- current_group = group
29
- end
30
- output << domains.join("\n")
31
- end
32
-
33
- File.open(Gman.list_path, "w") { |file| file.write output }
34
-
35
- result = Gman::Parser.file_to_array( Gman::list_path )
36
- puts "New list contains #{result.size} domains. Fin."
37
-
38
- exit 1 if current.size != result.size
data/script/vendor-de DELETED
@@ -1,44 +0,0 @@
1
- #! /usr/bin/env ruby
2
-
3
- require 'csv'
4
- require 'open-uri'
5
- require './lib/gman'
6
- require './lib/gman/parser'
7
-
8
- source = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"
9
-
10
- csv = open(source).read.force_encoding("iso-8859-1").encode("UTF-8")
11
-
12
- # For some reason, the header row is actually the last row
13
- # Pop the last line off the file and prepend it at the beginning
14
- # So that when we pass it to CSV it detects the headers properly
15
- lines = csv.split("\n")
16
- lines.unshift lines.pop
17
- csv = lines.join("\n")
18
-
19
- data = CSV.parse(csv, :headers => true, :col_sep => ";")
20
- domains = data.map { |row| row["Internet"].to_s.downcase.strip.gsub /^www./, "" }
21
-
22
- domains.reject! { |domain| domain.empty? }
23
- domains.select! { |domain| PublicSuffix.valid?(".#{domain}") } # Validate domain
24
- domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains
25
-
26
- current = Gman::Parser.file_to_array( Gman::list_path )
27
- current_hash = Gman::Parser.array_to_hash(current)
28
-
29
- current_hash["German Municipalities"] = domains
30
- current_hash = current_hash.sort_by { |group, domains| group.downcase }
31
-
32
- # PublicSuffix Formatted Output
33
- current_group = ""
34
- output = ""
35
- current_hash.each do |group, domains|
36
- if group != current_group
37
- output << "\n\n" unless current_group.empty? # first entry
38
- output << "// #{group}\n"
39
- current_group = group
40
- end
41
- output << domains.join("\n")
42
- end
43
-
44
- File.open(Gman.list_path, "w") { |file| file.write output }
data/test/test_domains.rb DELETED
@@ -1,54 +0,0 @@
1
- require File.join(File.dirname(__FILE__), 'helper')
2
-
3
- class TestDomains < Minitest::Test
4
-
5
- WHITELIST = [ "non-us gov", "non-us mil", "US Federal"]
6
- DOMAINS = Gman::Parser.file_to_hash(Gman.list_path)
7
-
8
- def whitelisted?(domain)
9
- WHITELIST.each do |group|
10
- return true if DOMAINS[group].include? domain
11
- end
12
- false
13
- end
14
-
15
- should "only contain resolvable domains" do
16
- unresolvables = []
17
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
18
- next if whitelisted? entry.name
19
- resolves = Gman::Parser.domain_resolves?(entry.name)
20
- unresolvables.push entry.name unless resolves
21
- end
22
- assert_equal [], unresolvables
23
- end
24
-
25
- should "not contain any educational domains" do
26
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
27
- assert_equal false, Swot::is_academic?(entry.name), "#{entry.name} is an academic domain"
28
- end
29
- end
30
-
31
- should "not contain any invalid domains" do
32
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
33
- assert_equal true, PublicSuffix.valid?("foo.#{entry.name}"), "#{entry.name} is not a valid domain"
34
- end
35
- end
36
-
37
- should "pass any url on the list" do
38
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
39
- assert_equal true, Gman.valid?("http://foo.#{entry.name}/bar"), "http://foo.#{entry.name}/bar is not a valid"
40
- end
41
- end
42
-
43
- should "pass any email on the list" do
44
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
45
- assert_equal true, Gman.valid?("foo@bar.#{entry.name}"), "foo@bar.#{entry.name} is not a valid"
46
- end
47
- end
48
-
49
- should "pass any domain on the list" do
50
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
51
- assert_equal true, Gman.valid?("foo.#{entry.name}"), "foo.#{entry.name} is not a valid domain"
52
- end
53
- end
54
- end