gman 4.6.5 → 4.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/script/vendor-us CHANGED
@@ -3,9 +3,6 @@
3
3
  # Vendors the USA.gov-maintained list of US domains into domains.txt
4
4
  # Source: https://github.com/GSA-OCSIT/govt-urls
5
5
  #
6
- # Normalizes and cleans inputs, validates domains, rejects academic domains, and
7
- # sorts, ensures uniqueness, and merges into the existing lib/domains.txt list
8
- #
9
6
  # Usage: script/vendor-us
10
7
  #
11
8
  # Will automatically fetch latest version of the list and merge
@@ -13,70 +10,13 @@
13
10
  #
14
11
  # It's also probably a good idea to run `script/cibuild` for good measure
15
12
 
16
- require 'rubygems'
17
- require 'public_suffix'
18
- require 'swot'
19
- require 'yaml'
20
- require 'open-uri'
21
- require './lib/gman'
22
- require './lib/gman/parser'
23
-
24
- SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
25
- BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"]
26
- domain_hash = {}
27
-
28
- domain_hash = YAML.load(open(SOURCE).read)
29
- puts "found #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains..."
30
-
31
- # Normalize ALL THE THINGS
32
- domain_hash.each do |group, domains|
33
- domains.map! { |domain| domain.strip } # Strip leading and trailing whitespace
34
- domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes
35
- domains.map! { |domain| domain.downcase } # make lower case
36
- domains.reject! { |domain| domain.empty? } # Reject empty strings
37
- end
38
-
39
- # filter
40
- domain_hash.reject! { |group,domain| BLACKLIST.include?(group) } # Group blacklist
41
- domain_hash.each do |group, domains|
42
- puts "Filtering #{group}..."
43
- domains.reject! { |domain| domain.match /\// } # Reject URLs
44
- domains.select! { |domain| PublicSuffix.valid?(domain) } # Validate domain
45
- domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains
46
- end
47
- puts "Filtered down to #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains"
48
-
49
- # Grab existing list
50
- current = Gman::Parser.file_to_array( Gman::list_path )
51
- current_hash = Gman::Parser.array_to_hash(current)
52
- puts "Current list contains #{current.size} domains... merging"
53
-
54
- # Lazy deep merge
55
- domain_hash.each do |group,domains|
56
- current_hash[group] = [] if current_hash[group].nil?
57
- current_hash[group].concat domains
58
- current_hash[group].sort! # Alphabetize
59
- current_hash[group].uniq! # Ensure uniqueness
60
- end
61
-
62
- # Sort by group
63
- current_hash = current_hash.sort_by { |group, domains| group.downcase }
64
-
65
- # PublicSuffix Formatted Output
66
- current_group = ""
67
- output = ""
68
- current_hash.each do |group, domains|
69
- if group != current_group
70
- output << "\n\n" unless current_group.empty? # first entry
71
- output << "// #{group}\n"
72
- current_group = group
73
- end
74
- output << domains.join("\n")
75
- end
13
+ require './lib/gman/importer'
76
14
 
77
- puts "merged. Writing..."
15
+ source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
16
+ blacklist = ["usagovQUASI", "usagovFED", "usagovPW"]
78
17
 
79
- File.open(Gman.list_path, "w") { |file| file.write output }
18
+ data = open(source).read
19
+ domains = YAML.load(data)
20
+ domains.reject! { |group,domain| blacklist.include?(group) }
80
21
 
81
- result = Gman::Parser.file_to_array( Gman::list_path )
82
- puts "New list contains #{result.size} domains. Fin."
22
+ Gman.import(domains)
data/test/helper.rb CHANGED
@@ -16,10 +16,12 @@ require 'shoulda'
16
16
 
17
17
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
18
18
  $LOAD_PATH.unshift(File.dirname(__FILE__))
19
- require 'gman'
19
+ require_relative "../lib/gman"
20
+ require_relative "../lib/gman/domain_list"
21
+
20
22
  require 'net/dns'
21
23
  require 'net/dns/resolver'
22
- require './lib/gman/parser'
24
+ require './lib/gman/importer'
23
25
 
24
26
  def test_bin(*args)
25
27
  output, status = Open3.capture2e("bundle", "exec", "gman", *args)
@@ -2,9 +2,9 @@ require File.join(File.dirname(__FILE__), 'helper')
2
2
 
3
3
  class TestGmanCountryCodes < Minitest::Test
4
4
  should "determine a domain's country" do
5
- assert_equal "United States", Gman.new("whitehouse.gov").country.name
6
- assert_equal "United States", Gman.new("army.mil").country.name
7
- assert_equal "United Kingdom", Gman.new("foo.gov.uk").country.name
5
+ assert_equal "United States of America", Gman.new("whitehouse.gov").country.name
6
+ assert_equal "United States of America", Gman.new("army.mil").country.name
7
+ assert_equal "United Kingdom of Great Britain and Northern Ireland", Gman.new("foo.gov.uk").country.name
8
8
  assert_equal "Canada", Gman.new("foo.gc.ca").country.name
9
9
  end
10
10
  end
@@ -0,0 +1,28 @@
1
+ require File.join(File.dirname(__FILE__), 'helper')
2
+
3
+ class TestDomains < Minitest::Test
4
+
5
+ WHITELIST = [ "non-us gov", "non-us mil", "US Federal"]
6
+
7
+ def resolve_domains?
8
+ ENV["GMAN_RESOLVE_DOMAINS"] == "true"
9
+ end
10
+
11
+ should "only contains valid domains" do
12
+ importer = Gman::Importer.new({})
13
+ if resolve_domains?
14
+ importer.logger.info "Validating that all domains resolve. This may take a while..."
15
+ else
16
+ importer.logger.info "Skipping domain resolution. Run `GMAN_RESOLVE_DOMAINS=true rake test` to validate that domains resolve."
17
+ end
18
+
19
+ invalid = []
20
+ Parallel.each(Gman::DomainList.current.list, :in_threads => 2) do |group, domains|
21
+ next if WHITELIST.include?(group)
22
+ invalid.push domains.reject { |domain|
23
+ importer.valid_domain?(domain, :skip_dupe => true, :skip_resolve => !resolve_domains?)
24
+ }
25
+ end
26
+ assert_equal [], invalid.flatten.reject { |e| e.empty? }
27
+ end
28
+ end
@@ -98,7 +98,6 @@ class TestGmanIdentifier < Minitest::Test
98
98
  end
99
99
 
100
100
  should "detect the state" do
101
- assert_equal "PR", Gman.new("sanjuan.pr").state
102
101
  assert_equal "OR", Gman.new("ashland.or.us").state
103
102
  refute Gman.new("canada.ca").state
104
103
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gman
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.6.5
4
+ version: 4.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-15 00:00:00.000000000 Z
11
+ date: 2015-06-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: swot
@@ -188,29 +188,30 @@ files:
188
188
  - gman.gemspec
189
189
  - lib/gman.rb
190
190
  - lib/gman/country_codes.rb
191
+ - lib/gman/domain_list.rb
191
192
  - lib/gman/identifier.rb
193
+ - lib/gman/importer.rb
192
194
  - lib/gman/locality.rb
193
- - lib/gman/parser.rb
194
195
  - lib/gman/sanctions.rb
195
196
  - script/alphabetize
196
- - script/build
197
197
  - script/cibuild
198
198
  - script/console
199
199
  - script/dedupe
200
200
  - script/prune
201
201
  - script/release
202
- - script/state-domains
203
- - script/vendor-de
202
+ - script/vendor-federal-de
204
203
  - script/vendor-gov-list
204
+ - script/vendor-municipal-de
205
205
  - script/vendor-nl
206
+ - script/vendor-public-suffix
206
207
  - script/vendor-se
207
208
  - script/vendor-us
208
209
  - test/helper.rb
209
210
  - test/obama.txt
210
- - test/test_domains.rb
211
211
  - test/test_gman.rb
212
212
  - test/test_gman_bin.rb
213
213
  - test/test_gman_country_codes.rb
214
+ - test/test_gman_domains.rb
214
215
  - test/test_gman_filter.rb
215
216
  - test/test_gman_identifier.rb
216
217
  - test/test_gman_locality.rb
@@ -242,10 +243,10 @@ summary: Check if a given domain or email address belong to a governemnt entity
242
243
  test_files:
243
244
  - test/helper.rb
244
245
  - test/obama.txt
245
- - test/test_domains.rb
246
246
  - test/test_gman.rb
247
247
  - test/test_gman_bin.rb
248
248
  - test/test_gman_country_codes.rb
249
+ - test/test_gman_domains.rb
249
250
  - test/test_gman_filter.rb
250
251
  - test/test_gman_identifier.rb
251
252
  - test/test_gman_locality.rb
data/lib/gman/parser.rb DELETED
@@ -1,59 +0,0 @@
1
- # Utility functions for parsing and manipulating public-suffix formatted domain lists
2
- require 'net/dns'
3
- require 'net/dns/resolver'
4
-
5
- class Gman < NaughtyOrNice
6
- class Parser
7
-
8
- COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i
9
-
10
- class << self
11
-
12
- # Given a public-suffix list formatted file
13
- # Converts to a hash in the form of :group => [domain1, domain2...]
14
- def file_to_hash(file)
15
- array_to_hash(file_to_array(file))
16
- end
17
-
18
- # Given a public-suffix list formatted file
19
- # Convert it into an array of comments and domains representing each line
20
- def file_to_array(file)
21
- domains = File.open(file).read
22
- domains.gsub! /\r\n?/, "\n" # Normalize line endings
23
- domains = domains.split("\n")
24
- end
25
-
26
- # Given an array of comments/domains in public suffix format
27
- # Converts to a hash in the form of :group => [domain1, domain2...]
28
- def array_to_hash(domains)
29
- group = ""
30
- domain_hash = {}
31
- domains.each do |line|
32
- next if line.empty?
33
- if match = COMMENT_REGEX.match(line)
34
- group = match[1]
35
- else
36
- domain_hash[group] = [] if domain_hash[group].nil?
37
- domain_hash[group].push line.downcase
38
- end
39
- end
40
- domain_hash
41
- end
42
-
43
- def resolver
44
- @resolver ||= begin
45
- resolver = Net::DNS::Resolver.new
46
- resolver.nameservers = ["8.8.8.8","8.8.4.4", "208.67.222.222", "208.67.220.220"]
47
- resolver
48
- end
49
- end
50
-
51
- # Verifies that the given domain resolves, i.e. a default, NS, or MX lookup returns at least one answer
52
- def domain_resolves?(domain)
53
- resolver.search(domain).header.anCount > 0 ||
54
- resolver.search(domain, Net::DNS::NS).header.anCount > 0 ||
55
- resolver.search(domain, Net::DNS::MX).header.anCount > 0
56
- end
57
- end
58
- end
59
- end
data/script/state-domains DELETED
@@ -1,38 +0,0 @@
1
- #! /usr/bin/env ruby
2
- # Strips domains in the form of e.g., city.<locality>.<state>.us from the domain list
3
-
4
- require './lib/gman'
5
- require './lib/gman/parser'
6
-
7
- current = Gman::Parser.file_to_array( Gman::list_path )
8
- domain_hash = Gman::Parser.array_to_hash(current)
9
-
10
- puts "Checking for state gov regex'd domains in the list..."
11
- puts "Starting with #{current.size} domains..."
12
-
13
- domain_hash.each do |group, domains|
14
- next unless group =~ /usagov[A-Z]{2}/
15
- state = group[-2,2].downcase
16
- domain_hash[group].reject! { |d| d =~ Gman::LOCALITY_REGEX }
17
- domain_hash[group].uniq!
18
- domain_hash[group].sort!
19
- end
20
-
21
- # PublicSuffix Formatted Output
22
- current_group = ""
23
- output = ""
24
- domain_hash.each do |group, domains|
25
- if group != current_group
26
- output << "\n\n" unless current_group.empty? # first entry
27
- output << "// #{group}\n"
28
- current_group = group
29
- end
30
- output << domains.join("\n")
31
- end
32
-
33
- File.open(Gman.list_path, "w") { |file| file.write output }
34
-
35
- result = Gman::Parser.file_to_array( Gman::list_path )
36
- puts "New list contains #{result.size} domains. Fin."
37
-
38
- exit 1 if current.size != result.size
data/script/vendor-de DELETED
@@ -1,44 +0,0 @@
1
- #! /usr/bin/env ruby
2
-
3
- require 'csv'
4
- require 'open-uri'
5
- require './lib/gman'
6
- require './lib/gman/parser'
7
-
8
- source = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"
9
-
10
- csv = open(source).read.force_encoding("iso-8859-1").encode("UTF-8")
11
-
12
- # For some reason, the header row is actually the last row
13
- # Pop the last line off the file and prepend it at the begining
14
- # So that when we pass it to CSV it detects the headers properly
15
- lines = csv.split("\n")
16
- lines.unshift lines.pop
17
- csv = lines.join("\n")
18
-
19
- data = CSV.parse(csv, :headers => true, :col_sep => ";")
20
- domains = data.map { |row| row["Internet"].to_s.downcase.strip.gsub /^www./, "" }
21
-
22
- domains.reject! { |domain| domain.empty? }
23
- domains.select! { |domain| PublicSuffix.valid?(".#{domain}") } # Validate domain
24
- domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains
25
-
26
- current = Gman::Parser.file_to_array( Gman::list_path )
27
- current_hash = Gman::Parser.array_to_hash(current)
28
-
29
- current_hash["German Municipalities"] = domains
30
- current_hash = current_hash.sort_by { |group, domains| group.downcase }
31
-
32
- # PublicSuffix Formatted Output
33
- current_group = ""
34
- output = ""
35
- current_hash.each do |group, domains|
36
- if group != current_group
37
- output << "\n\n" unless current_group.empty? # first entry
38
- output << "// #{group}\n"
39
- current_group = group
40
- end
41
- output << domains.join("\n")
42
- end
43
-
44
- File.open(Gman.list_path, "w") { |file| file.write output }
data/test/test_domains.rb DELETED
@@ -1,54 +0,0 @@
1
- require File.join(File.dirname(__FILE__), 'helper')
2
-
3
- class TestDomains < Minitest::Test
4
-
5
- WHITELIST = [ "non-us gov", "non-us mil", "US Federal"]
6
- DOMAINS = Gman::Parser.file_to_hash(Gman.list_path)
7
-
8
- def whitelisted?(domain)
9
- WHITELIST.each do |group|
10
- return true if DOMAINS[group].include? domain
11
- end
12
- false
13
- end
14
-
15
- should "only contain resolvable domains" do
16
- unresolvables = []
17
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
18
- next if whitelisted? entry.name
19
- resolves = Gman::Parser.domain_resolves?(entry.name)
20
- unresolvables.push entry.name unless resolves
21
- end
22
- assert_equal [], unresolvables
23
- end
24
-
25
- should "not contain any educational domains" do
26
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
27
- assert_equal false, Swot::is_academic?(entry.name), "#{entry.name} is an academic domain"
28
- end
29
- end
30
-
31
- should "not contain any invalid domains" do
32
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
33
- assert_equal true, PublicSuffix.valid?("foo.#{entry.name}"), "#{entry.name} is not a valid domain"
34
- end
35
- end
36
-
37
- should "pass any url on the list" do
38
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
39
- assert_equal true, Gman.valid?("http://foo.#{entry.name}/bar"), "http://foo.#{entry.name}/bar is not a valid"
40
- end
41
- end
42
-
43
- should "pass any email on the list" do
44
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
45
- assert_equal true, Gman.valid?("foo@bar.#{entry.name}"), "foo@bar.#{entry.name} is not a valid"
46
- end
47
- end
48
-
49
- should "pass any domain on the list" do
50
- Parallel.each(Gman.list, :in_threads => 2) do |entry|
51
- assert_equal true, Gman.valid?("foo.#{entry.name}"), "foo.#{entry.name} is not a valid domain"
52
- end
53
- end
54
- end