gman 4.6.5 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +0 -7
- data/config/domains.txt +283 -367
- data/gman.gemspec +1 -1
- data/lib/gman/domain_list.rb +78 -0
- data/lib/gman/importer.rb +135 -0
- data/script/alphabetize +4 -34
- data/script/cibuild +0 -1
- data/script/dedupe +10 -27
- data/script/prune +5 -2
- data/script/vendor-federal-de +14 -0
- data/script/vendor-municipal-de +23 -0
- data/script/vendor-nl +9 -4
- data/script/{build → vendor-public-suffix} +4 -6
- data/script/vendor-se +4 -38
- data/script/vendor-us +7 -67
- data/test/helper.rb +4 -2
- data/test/test_gman_country_codes.rb +3 -3
- data/test/test_gman_domains.rb +28 -0
- data/test/test_gman_identifier.rb +0 -1
- metadata +9 -8
- data/lib/gman/parser.rb +0 -59
- data/script/state-domains +0 -38
- data/script/vendor-de +0 -44
- data/test/test_domains.rb +0 -54
data/script/vendor-us
CHANGED
@@ -3,9 +3,6 @@
|
|
3
3
|
# Vendors the USA.gov-maintained list of US domains into domains.txt
|
4
4
|
# Source: https://github.com/GSA-OCSIT/govt-urls
|
5
5
|
#
|
6
|
-
# Normalizes and cleans inputs, validates domains, rejects academic domains, and
|
7
|
-
# sorts, ensures uniqueness, and merges into the existing lib/domains.txt list
|
8
|
-
#
|
9
6
|
# Usage: script/vendor-us
|
10
7
|
#
|
11
8
|
# Will automatically fetch latest version of the list and merge
|
@@ -13,70 +10,13 @@
|
|
13
10
|
#
|
14
11
|
# It's also probably a good idea to run `script/ci-build` for good measure
|
15
12
|
|
16
|
-
require '
|
17
|
-
require 'public_suffix'
|
18
|
-
require 'swot'
|
19
|
-
require 'yaml'
|
20
|
-
require 'open-uri'
|
21
|
-
require './lib/gman'
|
22
|
-
require './lib/gman/parser'
|
23
|
-
|
24
|
-
SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
|
25
|
-
BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"]
|
26
|
-
domain_hash = {}
|
27
|
-
|
28
|
-
domain_hash = YAML.load(open(SOURCE).read)
|
29
|
-
puts "found #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains..."
|
30
|
-
|
31
|
-
# Normalize ALL THE THINGS
|
32
|
-
domain_hash.each do |group, domains|
|
33
|
-
domains.map! { |domain| domain.strip } # Strip trailing slashes
|
34
|
-
domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes
|
35
|
-
domains.map! { |domain| domain.downcase } # make lower case
|
36
|
-
domains.reject! { |domain| domain.empty? } # Reject empty strings
|
37
|
-
end
|
38
|
-
|
39
|
-
# filter
|
40
|
-
domain_hash.reject! { |group,domain| BLACKLIST.include?(group) } # Group blacklist
|
41
|
-
domain_hash.each do |group, domains|
|
42
|
-
puts "Filtering #{group}..."
|
43
|
-
domains.reject! { |domain| domain.match /\// } # Reject URLs
|
44
|
-
domains.select! { |domain| PublicSuffix.valid?(domain) } # Validate domain
|
45
|
-
domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains
|
46
|
-
end
|
47
|
-
puts "Filtered down to #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains"
|
48
|
-
|
49
|
-
# Grab existing list
|
50
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
51
|
-
current_hash = Gman::Parser.array_to_hash(current)
|
52
|
-
puts "Current list contains #{current.size} domains... merging"
|
53
|
-
|
54
|
-
# Lazy deep merge
|
55
|
-
domain_hash.each do |group,domains|
|
56
|
-
current_hash[group] = [] if current_hash[group].nil?
|
57
|
-
current_hash[group].concat domains
|
58
|
-
current_hash[group].sort! # Alphabetize
|
59
|
-
current_hash[group].uniq! # Ensure uniqueness
|
60
|
-
end
|
61
|
-
|
62
|
-
# Sort by group
|
63
|
-
current_hash = current_hash.sort_by { |group, domains| group.downcase }
|
64
|
-
|
65
|
-
# PublicSuffix Formatted Output
|
66
|
-
current_group = ""
|
67
|
-
output = ""
|
68
|
-
current_hash.each do |group, domains|
|
69
|
-
if group != current_group
|
70
|
-
output << "\n\n" unless current_group.empty? # first entry
|
71
|
-
output << "// #{group}\n"
|
72
|
-
current_group = group
|
73
|
-
end
|
74
|
-
output << domains.join("\n")
|
75
|
-
end
|
13
|
+
require './lib/gman/importer'
|
76
14
|
|
77
|
-
|
15
|
+
source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
|
16
|
+
blacklist = ["usagovQUASI", "usagovFED", "usagovPW"]
|
78
17
|
|
79
|
-
|
18
|
+
data = open(source).read
|
19
|
+
domains = YAML.load(data)
|
20
|
+
domains.reject! { |group,domain| blacklist.include?(group) }
|
80
21
|
|
81
|
-
|
82
|
-
puts "New list contains #{result.size} domains. Fin."
|
22
|
+
Gman.import(domains)
|
data/test/helper.rb
CHANGED
@@ -16,10 +16,12 @@ require 'shoulda'
|
|
16
16
|
|
17
17
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
18
18
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
19
|
-
|
19
|
+
require_relative "../lib/gman"
|
20
|
+
require_relative "../lib/gman/domain_list"
|
21
|
+
|
20
22
|
require 'net/dns'
|
21
23
|
require 'net/dns/resolver'
|
22
|
-
require './lib/gman/parser'
|
24
|
+
require './lib/gman/importer'
|
23
25
|
|
24
26
|
def test_bin(*args)
|
25
27
|
output, status = Open3.capture2e("bundle", "exec", "gman", *args)
|
@@ -2,9 +2,9 @@ require File.join(File.dirname(__FILE__), 'helper')
|
|
2
2
|
|
3
3
|
class TestGmanCountryCodes < Minitest::Test
|
4
4
|
should "determine a domain's country" do
|
5
|
-
assert_equal "United States", Gman.new("whitehouse.gov").country.name
|
6
|
-
assert_equal "United States", Gman.new("army.mil").country.name
|
7
|
-
assert_equal "United Kingdom", Gman.new("foo.gov.uk").country.name
|
5
|
+
assert_equal "United States of America", Gman.new("whitehouse.gov").country.name
|
6
|
+
assert_equal "United States of America", Gman.new("army.mil").country.name
|
7
|
+
assert_equal "United Kingdom of Great Britain and Northern Ireland", Gman.new("foo.gov.uk").country.name
|
8
8
|
assert_equal "Canada", Gman.new("foo.gc.ca").country.name
|
9
9
|
end
|
10
10
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'helper')
|
2
|
+
|
3
|
+
class TestDomains < Minitest::Test
|
4
|
+
|
5
|
+
WHITELIST = [ "non-us gov", "non-us mil", "US Federal"]
|
6
|
+
|
7
|
+
def resolve_domains?
|
8
|
+
ENV["GMAN_RESOLVE_DOMAINS"] == "true"
|
9
|
+
end
|
10
|
+
|
11
|
+
should "only contains valid domains" do
|
12
|
+
importer = Gman::Importer.new({})
|
13
|
+
if resolve_domains?
|
14
|
+
importer.logger.info "Validating that all domains resolve. This may take a while..."
|
15
|
+
else
|
16
|
+
importer.logger.info "Skipping domain resolution. Run `GMAN_RESOLVE_DOMAINS=true rake test` to validate that domains resolve."
|
17
|
+
end
|
18
|
+
|
19
|
+
invalid = []
|
20
|
+
Parallel.each(Gman::DomainList.current.list, :in_threads => 2) do |group, domains|
|
21
|
+
next if WHITELIST.include?(group)
|
22
|
+
invalid.push domains.reject { |domain|
|
23
|
+
importer.valid_domain?(domain, :skip_dupe => true, :skip_resolve => !resolve_domains?)
|
24
|
+
}
|
25
|
+
end
|
26
|
+
assert_equal [], invalid.flatten.reject { |e| e.empty? }
|
27
|
+
end
|
28
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gman
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.6.5
|
4
|
+
version: 4.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: swot
|
@@ -188,29 +188,30 @@ files:
|
|
188
188
|
- gman.gemspec
|
189
189
|
- lib/gman.rb
|
190
190
|
- lib/gman/country_codes.rb
|
191
|
+
- lib/gman/domain_list.rb
|
191
192
|
- lib/gman/identifier.rb
|
193
|
+
- lib/gman/importer.rb
|
192
194
|
- lib/gman/locality.rb
|
193
|
-
- lib/gman/parser.rb
|
194
195
|
- lib/gman/sanctions.rb
|
195
196
|
- script/alphabetize
|
196
|
-
- script/build
|
197
197
|
- script/cibuild
|
198
198
|
- script/console
|
199
199
|
- script/dedupe
|
200
200
|
- script/prune
|
201
201
|
- script/release
|
202
|
-
- script/state-domains
|
203
|
-
- script/vendor-de
|
202
|
+
- script/vendor-federal-de
|
204
203
|
- script/vendor-gov-list
|
204
|
+
- script/vendor-municipal-de
|
205
205
|
- script/vendor-nl
|
206
|
+
- script/vendor-public-suffix
|
206
207
|
- script/vendor-se
|
207
208
|
- script/vendor-us
|
208
209
|
- test/helper.rb
|
209
210
|
- test/obama.txt
|
210
|
-
- test/test_domains.rb
|
211
211
|
- test/test_gman.rb
|
212
212
|
- test/test_gman_bin.rb
|
213
213
|
- test/test_gman_country_codes.rb
|
214
|
+
- test/test_gman_domains.rb
|
214
215
|
- test/test_gman_filter.rb
|
215
216
|
- test/test_gman_identifier.rb
|
216
217
|
- test/test_gman_locality.rb
|
@@ -242,10 +243,10 @@ summary: Check if a given domain or email address belong to a governemnt entity
|
|
242
243
|
test_files:
|
243
244
|
- test/helper.rb
|
244
245
|
- test/obama.txt
|
245
|
-
- test/test_domains.rb
|
246
246
|
- test/test_gman.rb
|
247
247
|
- test/test_gman_bin.rb
|
248
248
|
- test/test_gman_country_codes.rb
|
249
|
+
- test/test_gman_domains.rb
|
249
250
|
- test/test_gman_filter.rb
|
250
251
|
- test/test_gman_identifier.rb
|
251
252
|
- test/test_gman_locality.rb
|
data/lib/gman/parser.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
# Utility functions for parsing and manipulating public-suffix formatted domain lists
|
2
|
-
require 'net/dns'
|
3
|
-
require 'net/dns/resolver'
|
4
|
-
|
5
|
-
class Gman < NaughtyOrNice
|
6
|
-
class Parser
|
7
|
-
|
8
|
-
COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i
|
9
|
-
|
10
|
-
class << self
|
11
|
-
|
12
|
-
# Given a public-suffix list formatted file
|
13
|
-
# Converts to a hash in the form of :group => [domain1, domain2...]
|
14
|
-
def file_to_hash(file)
|
15
|
-
array_to_hash(file_to_array(file))
|
16
|
-
end
|
17
|
-
|
18
|
-
# Given a public-suffix list formatted file
|
19
|
-
# Convert it into an array of comments and domains representing each line
|
20
|
-
def file_to_array(file)
|
21
|
-
domains = File.open(file).read
|
22
|
-
domains.gsub! /\r\n?/, "\n" # Normalize line endings
|
23
|
-
domains = domains.split("\n")
|
24
|
-
end
|
25
|
-
|
26
|
-
# Given an array of comments/domains in public suffix format
|
27
|
-
# Converts to a hash in the form of :group => [domain1, domain2...]
|
28
|
-
def array_to_hash(domains)
|
29
|
-
group = ""
|
30
|
-
domain_hash = {}
|
31
|
-
domains.each do |line|
|
32
|
-
next if line.empty?
|
33
|
-
if match = COMMENT_REGEX.match(line)
|
34
|
-
group = match[1]
|
35
|
-
else
|
36
|
-
domain_hash[group] = [] if domain_hash[group].nil?
|
37
|
-
domain_hash[group].push line.downcase
|
38
|
-
end
|
39
|
-
end
|
40
|
-
domain_hash
|
41
|
-
end
|
42
|
-
|
43
|
-
def resolver
|
44
|
-
@resolver ||= begin
|
45
|
-
resolver = Net::DNS::Resolver.new
|
46
|
-
resolver.nameservers = ["8.8.8.8","8.8.4.4", "208.67.222.222", "208.67.220.220"]
|
47
|
-
resolver
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
# Verifies that the given domain has an MX record, and thus is valid
|
52
|
-
def domain_resolves?(domain)
|
53
|
-
resolver.search(domain).header.anCount > 0 ||
|
54
|
-
resolver.search(domain, Net::DNS::NS).header.anCount > 0 ||
|
55
|
-
resolver.search(domain, Net::DNS::MX).header.anCount > 0
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
data/script/state-domains
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
#! /usr/bin/env ruby
|
2
|
-
# Strips domains in the form of e.g., city.<locality>.<state>.us from the domain list
|
3
|
-
|
4
|
-
require './lib/gman'
|
5
|
-
require './lib/gman/parser'
|
6
|
-
|
7
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
8
|
-
domain_hash = Gman::Parser.array_to_hash(current)
|
9
|
-
|
10
|
-
puts "Checking for state gov regex'd domains in the list..."
|
11
|
-
puts "Starting with #{current.size} domains..."
|
12
|
-
|
13
|
-
domain_hash.each do |group, domains|
|
14
|
-
next unless group =~ /usagov[A-Z]{2}/
|
15
|
-
state = group[-2,2].downcase
|
16
|
-
domain_hash[group].reject! { |d| d =~ Gman::LOCALITY_REGEX }
|
17
|
-
domain_hash[group].uniq!
|
18
|
-
domain_hash[group].sort!
|
19
|
-
end
|
20
|
-
|
21
|
-
# PublicSuffix Formatted Output
|
22
|
-
current_group = ""
|
23
|
-
output = ""
|
24
|
-
domain_hash.each do |group, domains|
|
25
|
-
if group != current_group
|
26
|
-
output << "\n\n" unless current_group.empty? # first entry
|
27
|
-
output << "// #{group}\n"
|
28
|
-
current_group = group
|
29
|
-
end
|
30
|
-
output << domains.join("\n")
|
31
|
-
end
|
32
|
-
|
33
|
-
File.open(Gman.list_path, "w") { |file| file.write output }
|
34
|
-
|
35
|
-
result = Gman::Parser.file_to_array( Gman::list_path )
|
36
|
-
puts "New list contains #{result.size} domains. Fin."
|
37
|
-
|
38
|
-
exit 1 if current.size != result.size
|
data/script/vendor-de
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
#! /usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'csv'
|
4
|
-
require 'open-uri'
|
5
|
-
require './lib/gman'
|
6
|
-
require './lib/gman/parser'
|
7
|
-
|
8
|
-
source = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"
|
9
|
-
|
10
|
-
csv = open(source).read.force_encoding("iso-8859-1").encode("UTF-8")
|
11
|
-
|
12
|
-
# For some reason, the header row is actually the last row
|
13
|
-
# Pop the last line off the file and prepend it at the begining
|
14
|
-
# So that when we pass it to CSV it detects the headers properly
|
15
|
-
lines = csv.split("\n")
|
16
|
-
lines.unshift lines.pop
|
17
|
-
csv = lines.join("\n")
|
18
|
-
|
19
|
-
data = CSV.parse(csv, :headers => true, :col_sep => ";")
|
20
|
-
domains = data.map { |row| row["Internet"].to_s.downcase.strip.gsub /^www./, "" }
|
21
|
-
|
22
|
-
domains.reject! { |domain| domain.empty? }
|
23
|
-
domains.select! { |domain| PublicSuffix.valid?(".#{domain}") } # Validate domain
|
24
|
-
domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains
|
25
|
-
|
26
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
27
|
-
current_hash = Gman::Parser.array_to_hash(current)
|
28
|
-
|
29
|
-
current_hash["German Municipalities"] = domains
|
30
|
-
current_hash = current_hash.sort_by { |group, domains| group.downcase }
|
31
|
-
|
32
|
-
# PublicSuffix Formatted Output
|
33
|
-
current_group = ""
|
34
|
-
output = ""
|
35
|
-
current_hash.each do |group, domains|
|
36
|
-
if group != current_group
|
37
|
-
output << "\n\n" unless current_group.empty? # first entry
|
38
|
-
output << "// #{group}\n"
|
39
|
-
current_group = group
|
40
|
-
end
|
41
|
-
output << domains.join("\n")
|
42
|
-
end
|
43
|
-
|
44
|
-
File.open(Gman.list_path, "w") { |file| file.write output }
|
data/test/test_domains.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'helper')
|
2
|
-
|
3
|
-
class TestDomains < Minitest::Test
|
4
|
-
|
5
|
-
WHITELIST = [ "non-us gov", "non-us mil", "US Federal"]
|
6
|
-
DOMAINS = Gman::Parser.file_to_hash(Gman.list_path)
|
7
|
-
|
8
|
-
def whitelisted?(domain)
|
9
|
-
WHITELIST.each do |group|
|
10
|
-
return true if DOMAINS[group].include? domain
|
11
|
-
end
|
12
|
-
false
|
13
|
-
end
|
14
|
-
|
15
|
-
should "only contain resolvable domains" do
|
16
|
-
unresolvables = []
|
17
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
18
|
-
next if whitelisted? entry.name
|
19
|
-
resolves = Gman::Parser.domain_resolves?(entry.name)
|
20
|
-
unresolvables.push entry.name unless resolves
|
21
|
-
end
|
22
|
-
assert_equal [], unresolvables
|
23
|
-
end
|
24
|
-
|
25
|
-
should "not contain any educational domains" do
|
26
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
27
|
-
assert_equal false, Swot::is_academic?(entry.name), "#{entry.name} is an academic domain"
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
should "not contain any invalid domains" do
|
32
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
33
|
-
assert_equal true, PublicSuffix.valid?("foo.#{entry.name}"), "#{entry.name} is not a valid domain"
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
should "pass any url on the list" do
|
38
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
39
|
-
assert_equal true, Gman.valid?("http://foo.#{entry.name}/bar"), "http://foo.#{entry.name}/bar is not a valid"
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
should "pass any email on the list" do
|
44
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
45
|
-
assert_equal true, Gman.valid?("foo@bar.#{entry.name}"), "foo@bar.#{entry.name} is not a valid"
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
should "pass any domain on the list" do
|
50
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
51
|
-
assert_equal true, Gman.valid?("foo.#{entry.name}"), "foo.#{entry.name} is not a valid domain"
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|