gman 4.6.5 → 4.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +0 -7
- data/config/domains.txt +283 -367
- data/gman.gemspec +1 -1
- data/lib/gman/domain_list.rb +78 -0
- data/lib/gman/importer.rb +135 -0
- data/script/alphabetize +4 -34
- data/script/cibuild +0 -1
- data/script/dedupe +10 -27
- data/script/prune +5 -2
- data/script/vendor-federal-de +14 -0
- data/script/vendor-municipal-de +23 -0
- data/script/vendor-nl +9 -4
- data/script/{build → vendor-public-suffix} +4 -6
- data/script/vendor-se +4 -38
- data/script/vendor-us +7 -67
- data/test/helper.rb +4 -2
- data/test/test_gman_country_codes.rb +3 -3
- data/test/test_gman_domains.rb +28 -0
- data/test/test_gman_identifier.rb +0 -1
- metadata +9 -8
- data/lib/gman/parser.rb +0 -59
- data/script/state-domains +0 -38
- data/script/vendor-de +0 -44
- data/test/test_domains.rb +0 -54
data/script/vendor-us
CHANGED
@@ -3,9 +3,6 @@
|
|
3
3
|
# Vendors the USA.gov-maintained list of US domains into domains.txt
|
4
4
|
# Source: https://github.com/GSA-OCSIT/govt-urls
|
5
5
|
#
|
6
|
-
# Normalizes and cleans inputs, validates domains, rejects academic domains, and
|
7
|
-
# sorts, ensures uniqueness, and merges into the existing lib/domains.txt list
|
8
|
-
#
|
9
6
|
# Usage: script/vendor-us
|
10
7
|
#
|
11
8
|
# Will automatically fetch latest version of the list and merge
|
@@ -13,70 +10,13 @@
|
|
13
10
|
#
|
14
11
|
# It's also probably a good idea to run `script/ci-build` for good measure
|
15
12
|
|
16
|
-
require '
|
17
|
-
require 'public_suffix'
|
18
|
-
require 'swot'
|
19
|
-
require 'yaml'
|
20
|
-
require 'open-uri'
|
21
|
-
require './lib/gman'
|
22
|
-
require './lib/gman/parser'
|
23
|
-
|
24
|
-
SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
|
25
|
-
BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"]
|
26
|
-
domain_hash = {}
|
27
|
-
|
28
|
-
domain_hash = YAML.load(open(SOURCE).read)
|
29
|
-
puts "found #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains..."
|
30
|
-
|
31
|
-
# Normalize ALL THE THINGS
|
32
|
-
domain_hash.each do |group, domains|
|
33
|
-
domains.map! { |domain| domain.strip } # Strip trailing slashes
|
34
|
-
domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes
|
35
|
-
domains.map! { |domain| domain.downcase } # make lower case
|
36
|
-
domains.reject! { |domain| domain.empty? } # Reject empty strings
|
37
|
-
end
|
38
|
-
|
39
|
-
# filter
|
40
|
-
domain_hash.reject! { |group,domain| BLACKLIST.include?(group) } # Group blacklist
|
41
|
-
domain_hash.each do |group, domains|
|
42
|
-
puts "Filtering #{group}..."
|
43
|
-
domains.reject! { |domain| domain.match /\// } # Reject URLs
|
44
|
-
domains.select! { |domain| PublicSuffix.valid?(domain) } # Validate domain
|
45
|
-
domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains
|
46
|
-
end
|
47
|
-
puts "Filtered down to #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains"
|
48
|
-
|
49
|
-
# Grab existing list
|
50
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
51
|
-
current_hash = Gman::Parser.array_to_hash(current)
|
52
|
-
puts "Current list contains #{current.size} domains... merging"
|
53
|
-
|
54
|
-
# Lazy deep merge
|
55
|
-
domain_hash.each do |group,domains|
|
56
|
-
current_hash[group] = [] if current_hash[group].nil?
|
57
|
-
current_hash[group].concat domains
|
58
|
-
current_hash[group].sort! # Alphabetize
|
59
|
-
current_hash[group].uniq! # Ensure uniqueness
|
60
|
-
end
|
61
|
-
|
62
|
-
# Sort by group
|
63
|
-
current_hash = current_hash.sort_by { |group, domains| group.downcase }
|
64
|
-
|
65
|
-
# PublicSuffix Formatted Output
|
66
|
-
current_group = ""
|
67
|
-
output = ""
|
68
|
-
current_hash.each do |group, domains|
|
69
|
-
if group != current_group
|
70
|
-
output << "\n\n" unless current_group.empty? # first entry
|
71
|
-
output << "// #{group}\n"
|
72
|
-
current_group = group
|
73
|
-
end
|
74
|
-
output << domains.join("\n")
|
75
|
-
end
|
13
|
+
require './lib/gman/importer'
|
76
14
|
|
77
|
-
|
15
|
+
source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
|
16
|
+
blacklist = ["usagovQUASI", "usagovFED", "usagovPW"]
|
78
17
|
|
79
|
-
|
18
|
+
data = open(source).read
|
19
|
+
domains = YAML.load(data)
|
20
|
+
domains.reject! { |group,domain| blacklist.include?(group) }
|
80
21
|
|
81
|
-
|
82
|
-
puts "New list contains #{result.size} domains. Fin."
|
22
|
+
Gman.import(domains)
|
data/test/helper.rb
CHANGED
@@ -16,10 +16,12 @@ require 'shoulda'
|
|
16
16
|
|
17
17
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
18
18
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
19
|
-
|
19
|
+
require_relative "../lib/gman"
|
20
|
+
require_relative "../lib/gman/domain_list"
|
21
|
+
|
20
22
|
require 'net/dns'
|
21
23
|
require 'net/dns/resolver'
|
22
|
-
require './lib/gman/parser'
|
24
|
+
require './lib/gman/importer'
|
23
25
|
|
24
26
|
def test_bin(*args)
|
25
27
|
output, status = Open3.capture2e("bundle", "exec", "gman", *args)
|
@@ -2,9 +2,9 @@ require File.join(File.dirname(__FILE__), 'helper')
|
|
2
2
|
|
3
3
|
class TestGmanCountryCodes < Minitest::Test
|
4
4
|
should "determine a domain's country" do
|
5
|
-
assert_equal "United States", Gman.new("whitehouse.gov").country.name
|
6
|
-
assert_equal "United States", Gman.new("army.mil").country.name
|
7
|
-
assert_equal "United Kingdom", Gman.new("foo.gov.uk").country.name
|
5
|
+
assert_equal "United States of America", Gman.new("whitehouse.gov").country.name
|
6
|
+
assert_equal "United States of America", Gman.new("army.mil").country.name
|
7
|
+
assert_equal "United Kingdom of Great Britain and Northern Ireland", Gman.new("foo.gov.uk").country.name
|
8
8
|
assert_equal "Canada", Gman.new("foo.gc.ca").country.name
|
9
9
|
end
|
10
10
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'helper')
|
2
|
+
|
3
|
+
class TestDomains < Minitest::Test
|
4
|
+
|
5
|
+
WHITELIST = [ "non-us gov", "non-us mil", "US Federal"]
|
6
|
+
|
7
|
+
def resolve_domains?
|
8
|
+
ENV["GMAN_RESOLVE_DOMAINS"] == "true"
|
9
|
+
end
|
10
|
+
|
11
|
+
should "only contains valid domains" do
|
12
|
+
importer = Gman::Importer.new({})
|
13
|
+
if resolve_domains?
|
14
|
+
importer.logger.info "Validating that all domains resolve. This may take a while..."
|
15
|
+
else
|
16
|
+
importer.logger.info "Skipping domain resolution. Run `GMAN_RESOLVE_DOMAINS=true rake test` to validate that domains resolve."
|
17
|
+
end
|
18
|
+
|
19
|
+
invalid = []
|
20
|
+
Parallel.each(Gman::DomainList.current.list, :in_threads => 2) do |group, domains|
|
21
|
+
next if WHITELIST.include?(group)
|
22
|
+
invalid.push domains.reject { |domain|
|
23
|
+
importer.valid_domain?(domain, :skip_dupe => true, :skip_resolve => !resolve_domains?)
|
24
|
+
}
|
25
|
+
end
|
26
|
+
assert_equal [], invalid.flatten.reject { |e| e.empty? }
|
27
|
+
end
|
28
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gman
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.6.5
|
4
|
+
version: 4.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: swot
|
@@ -188,29 +188,30 @@ files:
|
|
188
188
|
- gman.gemspec
|
189
189
|
- lib/gman.rb
|
190
190
|
- lib/gman/country_codes.rb
|
191
|
+
- lib/gman/domain_list.rb
|
191
192
|
- lib/gman/identifier.rb
|
193
|
+
- lib/gman/importer.rb
|
192
194
|
- lib/gman/locality.rb
|
193
|
-
- lib/gman/parser.rb
|
194
195
|
- lib/gman/sanctions.rb
|
195
196
|
- script/alphabetize
|
196
|
-
- script/build
|
197
197
|
- script/cibuild
|
198
198
|
- script/console
|
199
199
|
- script/dedupe
|
200
200
|
- script/prune
|
201
201
|
- script/release
|
202
|
-
- script/state-domains
|
203
|
-
- script/vendor-de
|
202
|
+
- script/vendor-federal-de
|
204
203
|
- script/vendor-gov-list
|
204
|
+
- script/vendor-municipal-de
|
205
205
|
- script/vendor-nl
|
206
|
+
- script/vendor-public-suffix
|
206
207
|
- script/vendor-se
|
207
208
|
- script/vendor-us
|
208
209
|
- test/helper.rb
|
209
210
|
- test/obama.txt
|
210
|
-
- test/test_domains.rb
|
211
211
|
- test/test_gman.rb
|
212
212
|
- test/test_gman_bin.rb
|
213
213
|
- test/test_gman_country_codes.rb
|
214
|
+
- test/test_gman_domains.rb
|
214
215
|
- test/test_gman_filter.rb
|
215
216
|
- test/test_gman_identifier.rb
|
216
217
|
- test/test_gman_locality.rb
|
@@ -242,10 +243,10 @@ summary: Check if a given domain or email address belong to a governemnt entity
|
|
242
243
|
test_files:
|
243
244
|
- test/helper.rb
|
244
245
|
- test/obama.txt
|
245
|
-
- test/test_domains.rb
|
246
246
|
- test/test_gman.rb
|
247
247
|
- test/test_gman_bin.rb
|
248
248
|
- test/test_gman_country_codes.rb
|
249
|
+
- test/test_gman_domains.rb
|
249
250
|
- test/test_gman_filter.rb
|
250
251
|
- test/test_gman_identifier.rb
|
251
252
|
- test/test_gman_locality.rb
|
data/lib/gman/parser.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
# Utility functions for parsing and manipulating public-suffix formatted domain lists
|
2
|
-
require 'net/dns'
|
3
|
-
require 'net/dns/resolver'
|
4
|
-
|
5
|
-
class Gman < NaughtyOrNice
|
6
|
-
class Parser
|
7
|
-
|
8
|
-
COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i
|
9
|
-
|
10
|
-
class << self
|
11
|
-
|
12
|
-
# Given a public-suffix list formatted file
|
13
|
-
# Converts to a hash in the form of :group => [domain1, domain2...]
|
14
|
-
def file_to_hash(file)
|
15
|
-
array_to_hash(file_to_array(file))
|
16
|
-
end
|
17
|
-
|
18
|
-
# Given a public-suffix list formatted file
|
19
|
-
# Convert it into an array of comments and domains representing each line
|
20
|
-
def file_to_array(file)
|
21
|
-
domains = File.open(file).read
|
22
|
-
domains.gsub! /\r\n?/, "\n" # Normalize line endings
|
23
|
-
domains = domains.split("\n")
|
24
|
-
end
|
25
|
-
|
26
|
-
# Given an array of comments/domains in public suffix format
|
27
|
-
# Converts to a hash in the form of :group => [domain1, domain2...]
|
28
|
-
def array_to_hash(domains)
|
29
|
-
group = ""
|
30
|
-
domain_hash = {}
|
31
|
-
domains.each do |line|
|
32
|
-
next if line.empty?
|
33
|
-
if match = COMMENT_REGEX.match(line)
|
34
|
-
group = match[1]
|
35
|
-
else
|
36
|
-
domain_hash[group] = [] if domain_hash[group].nil?
|
37
|
-
domain_hash[group].push line.downcase
|
38
|
-
end
|
39
|
-
end
|
40
|
-
domain_hash
|
41
|
-
end
|
42
|
-
|
43
|
-
def resolver
|
44
|
-
@resolver ||= begin
|
45
|
-
resolver = Net::DNS::Resolver.new
|
46
|
-
resolver.nameservers = ["8.8.8.8","8.8.4.4", "208.67.222.222", "208.67.220.220"]
|
47
|
-
resolver
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
# Verifies that the given domain has an MX record, and thus is valid
|
52
|
-
def domain_resolves?(domain)
|
53
|
-
resolver.search(domain).header.anCount > 0 ||
|
54
|
-
resolver.search(domain, Net::DNS::NS).header.anCount > 0 ||
|
55
|
-
resolver.search(domain, Net::DNS::MX).header.anCount > 0
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
data/script/state-domains
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
#! /usr/bin/env ruby
|
2
|
-
# Strips domains in the form of e.g., city.<locality>.<state>.us from the domain list
|
3
|
-
|
4
|
-
require './lib/gman'
|
5
|
-
require './lib/gman/parser'
|
6
|
-
|
7
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
8
|
-
domain_hash = Gman::Parser.array_to_hash(current)
|
9
|
-
|
10
|
-
puts "Checking for state gov regex'd domains in the list..."
|
11
|
-
puts "Starting with #{current.size} domains..."
|
12
|
-
|
13
|
-
domain_hash.each do |group, domains|
|
14
|
-
next unless group =~ /usagov[A-Z]{2}/
|
15
|
-
state = group[-2,2].downcase
|
16
|
-
domain_hash[group].reject! { |d| d =~ Gman::LOCALITY_REGEX }
|
17
|
-
domain_hash[group].uniq!
|
18
|
-
domain_hash[group].sort!
|
19
|
-
end
|
20
|
-
|
21
|
-
# PublicSuffix Formatted Output
|
22
|
-
current_group = ""
|
23
|
-
output = ""
|
24
|
-
domain_hash.each do |group, domains|
|
25
|
-
if group != current_group
|
26
|
-
output << "\n\n" unless current_group.empty? # first entry
|
27
|
-
output << "// #{group}\n"
|
28
|
-
current_group = group
|
29
|
-
end
|
30
|
-
output << domains.join("\n")
|
31
|
-
end
|
32
|
-
|
33
|
-
File.open(Gman.list_path, "w") { |file| file.write output }
|
34
|
-
|
35
|
-
result = Gman::Parser.file_to_array( Gman::list_path )
|
36
|
-
puts "New list contains #{result.size} domains. Fin."
|
37
|
-
|
38
|
-
exit 1 if current.size != result.size
|
data/script/vendor-de
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
#! /usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'csv'
|
4
|
-
require 'open-uri'
|
5
|
-
require './lib/gman'
|
6
|
-
require './lib/gman/parser'
|
7
|
-
|
8
|
-
source = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"
|
9
|
-
|
10
|
-
csv = open(source).read.force_encoding("iso-8859-1").encode("UTF-8")
|
11
|
-
|
12
|
-
# For some reason, the header row is actually the last row
|
13
|
-
# Pop the last line off the file and prepend it at the begining
|
14
|
-
# So that when we pass it to CSV it detects the headers properly
|
15
|
-
lines = csv.split("\n")
|
16
|
-
lines.unshift lines.pop
|
17
|
-
csv = lines.join("\n")
|
18
|
-
|
19
|
-
data = CSV.parse(csv, :headers => true, :col_sep => ";")
|
20
|
-
domains = data.map { |row| row["Internet"].to_s.downcase.strip.gsub /^www./, "" }
|
21
|
-
|
22
|
-
domains.reject! { |domain| domain.empty? }
|
23
|
-
domains.select! { |domain| PublicSuffix.valid?(".#{domain}") } # Validate domain
|
24
|
-
domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains
|
25
|
-
|
26
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
27
|
-
current_hash = Gman::Parser.array_to_hash(current)
|
28
|
-
|
29
|
-
current_hash["German Municipalities"] = domains
|
30
|
-
current_hash = current_hash.sort_by { |group, domains| group.downcase }
|
31
|
-
|
32
|
-
# PublicSuffix Formatted Output
|
33
|
-
current_group = ""
|
34
|
-
output = ""
|
35
|
-
current_hash.each do |group, domains|
|
36
|
-
if group != current_group
|
37
|
-
output << "\n\n" unless current_group.empty? # first entry
|
38
|
-
output << "// #{group}\n"
|
39
|
-
current_group = group
|
40
|
-
end
|
41
|
-
output << domains.join("\n")
|
42
|
-
end
|
43
|
-
|
44
|
-
File.open(Gman.list_path, "w") { |file| file.write output }
|
data/test/test_domains.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'helper')
|
2
|
-
|
3
|
-
class TestDomains < Minitest::Test
|
4
|
-
|
5
|
-
WHITELIST = [ "non-us gov", "non-us mil", "US Federal"]
|
6
|
-
DOMAINS = Gman::Parser.file_to_hash(Gman.list_path)
|
7
|
-
|
8
|
-
def whitelisted?(domain)
|
9
|
-
WHITELIST.each do |group|
|
10
|
-
return true if DOMAINS[group].include? domain
|
11
|
-
end
|
12
|
-
false
|
13
|
-
end
|
14
|
-
|
15
|
-
should "only contain resolvable domains" do
|
16
|
-
unresolvables = []
|
17
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
18
|
-
next if whitelisted? entry.name
|
19
|
-
resolves = Gman::Parser.domain_resolves?(entry.name)
|
20
|
-
unresolvables.push entry.name unless resolves
|
21
|
-
end
|
22
|
-
assert_equal [], unresolvables
|
23
|
-
end
|
24
|
-
|
25
|
-
should "not contain any educational domains" do
|
26
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
27
|
-
assert_equal false, Swot::is_academic?(entry.name), "#{entry.name} is an academic domain"
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
should "not contain any invalid domains" do
|
32
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
33
|
-
assert_equal true, PublicSuffix.valid?("foo.#{entry.name}"), "#{entry.name} is not a valid domain"
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
should "pass any url on the list" do
|
38
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
39
|
-
assert_equal true, Gman.valid?("http://foo.#{entry.name}/bar"), "http://foo.#{entry.name}/bar is not a valid"
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
should "pass any email on the list" do
|
44
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
45
|
-
assert_equal true, Gman.valid?("foo@bar.#{entry.name}"), "foo@bar.#{entry.name} is not a valid"
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
should "pass any domain on the list" do
|
50
|
-
Parallel.each(Gman.list, :in_threads => 2) do |entry|
|
51
|
-
assert_equal true, Gman.valid?("foo.#{entry.name}"), "foo.#{entry.name} is not a valid domain"
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|