gman 5.0.9 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -1
- data/Gemfile +1 -0
- data/README.md +16 -22
- data/Rakefile +3 -3
- data/bin/gman +10 -11
- data/bin/gman_filter +7 -7
- data/config/domains.txt +19 -19
- data/config/vendor/dotgovs.csv +398 -355
- data/gman.gemspec +34 -27
- data/lib/gman.rb +29 -23
- data/lib/gman/country_codes.rb +14 -15
- data/lib/gman/domain_list.rb +34 -25
- data/lib/gman/identifier.rb +39 -43
- data/lib/gman/importer.rb +111 -61
- data/lib/gman/locality.rb +22 -10
- data/lib/gman/version.rb +1 -1
- data/script/add +2 -2
- data/script/alphabetize +2 -2
- data/script/cibuild +2 -0
- data/script/dedupe +2 -2
- data/script/profile +5 -2
- data/script/prune +7 -7
- data/script/reconcile-us +26 -21
- data/script/vendor-federal-de +5 -5
- data/script/vendor-municipal-de +5 -5
- data/script/vendor-nl +12 -4
- data/script/vendor-public-suffix +8 -8
- data/script/vendor-se +8 -6
- data/script/vendor-us +7 -7
- data/test/fixtures/domains.txt +2 -0
- data/test/{obama.txt → fixtures/obama.txt} +0 -0
- data/test/helper.rb +19 -5
- data/test/test_gman.rb +43 -38
- data/test/test_gman_bin.rb +37 -43
- data/test/test_gman_country_codes.rb +10 -6
- data/test/test_gman_domains.rb +15 -10
- data/test/test_gman_filter.rb +5 -7
- data/test/test_gman_identifier.rb +36 -35
- data/test/test_gman_importer.rb +250 -0
- data/test/test_gman_locality.rb +5 -5
- metadata +28 -10
- data/lib/gman/sanctions.rb +0 -29
- data/test/test_gman_sanctions.rb +0 -20
data/lib/gman/locality.rb
CHANGED
@@ -1,14 +1,26 @@
|
|
1
1
|
class Gman
|
2
|
+
class Locality
|
3
|
+
AFFINITY_NAMESPACES = %w(state dst cog).freeze
|
2
4
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
5
|
+
STATES = %w(
|
6
|
+
ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
|
7
|
+
la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
|
8
|
+
ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
|
9
|
+
).freeze
|
10
|
+
|
11
|
+
LOCALITY_DOMAINS = %w(
|
12
|
+
ci co borough boro city county
|
13
|
+
parish town twp vi vil village
|
14
|
+
).freeze
|
15
|
+
|
16
|
+
REGEX = /
|
17
|
+
(
|
18
|
+
(#{Regexp.union(AFFINITY_NAMESPACES)})
|
19
|
+
|
|
20
|
+
(#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
|
21
|
+
)\.(#{Regexp.union(STATES)})\.us
|
22
|
+
/x
|
23
|
+
end
|
12
24
|
|
13
25
|
# Second level .us domains for states and locality
|
14
26
|
# See http://en.wikipedia.org/wiki/.us
|
@@ -23,6 +35,6 @@ class Gman
|
|
23
35
|
# * k12.il.us
|
24
36
|
# * ci.foo.zx.us
|
25
37
|
def locality?
|
26
|
-
|
38
|
+
!domain.to_s.match(Locality::REGEX).nil?
|
27
39
|
end
|
28
40
|
end
|
data/lib/gman/version.rb
CHANGED
data/script/add
CHANGED
@@ -7,11 +7,11 @@
|
|
7
7
|
require './lib/gman/importer'
|
8
8
|
|
9
9
|
if ARGV.length < 2
|
10
|
-
puts
|
10
|
+
puts 'Usage: script/add [GROUP] [DOMAIN(S)]'
|
11
11
|
exit 1
|
12
12
|
end
|
13
13
|
|
14
14
|
group = ARGV[0]
|
15
15
|
domains = ARGV.drop(1)
|
16
16
|
|
17
|
-
Gman.import(
|
17
|
+
Gman.import(group => domains)
|
data/script/alphabetize
CHANGED
data/script/cibuild
CHANGED
data/script/dedupe
CHANGED
@@ -7,7 +7,7 @@ require './lib/gman/importer'
|
|
7
7
|
|
8
8
|
current = Gman::DomainList.current
|
9
9
|
|
10
|
-
puts
|
10
|
+
puts 'Checking for duplicate domains in the domain list...'
|
11
11
|
puts "Current list contains #{current.count} domains..."
|
12
12
|
|
13
13
|
dupe = current.count - current.domains.uniq.count
|
@@ -16,6 +16,6 @@ exit 0 if dupe == 0
|
|
16
16
|
|
17
17
|
dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
|
18
18
|
|
19
|
-
puts
|
19
|
+
puts 'Duplicate domains:'
|
20
20
|
puts dupes
|
21
21
|
exit 1
|
data/script/profile
CHANGED
@@ -3,9 +3,12 @@
|
|
3
3
|
require 'ruby-prof'
|
4
4
|
require './lib/gman'
|
5
5
|
|
6
|
-
# Pick N random domains directly,
|
6
|
+
# Pick N random domains directly,
|
7
|
+
# without pre-loading the Gman list for an accurate benchmark
|
7
8
|
count = (ARGV[0] || 100).to_i
|
8
|
-
domains = File.readlines(
|
9
|
+
domains = File.readlines('./config/domains.txt')
|
10
|
+
domains = domains.select { |l| l =~ /^[a-z0-9]/i }
|
11
|
+
domains = domains.sample(count)
|
9
12
|
|
10
13
|
RubyProf.start
|
11
14
|
domains.each do |domain|
|
data/script/prune
CHANGED
@@ -1,20 +1,20 @@
|
|
1
|
-
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
# Given an array of domains, removes them from the list
|
3
3
|
# Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
|
4
4
|
|
5
|
-
require_relative
|
6
|
-
require_relative
|
5
|
+
require_relative '../lib/gman'
|
6
|
+
require_relative '../lib/gman/domain_list'
|
7
7
|
|
8
8
|
domains = ARGV
|
9
|
-
domains = domains.clone.map { |d| d.
|
9
|
+
domains = domains.clone.map { |d| d.delete ',' }
|
10
10
|
|
11
|
-
list = File.open(
|
11
|
+
list = File.open('./config/domains.txt').read
|
12
12
|
puts "Starting list: #{Gman::DomainList.current.count} domains"
|
13
13
|
|
14
14
|
domains.each do |domain|
|
15
|
-
list.gsub!
|
15
|
+
list.gsub!(/^#{domain}$\n/, '')
|
16
16
|
end
|
17
17
|
|
18
18
|
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
19
19
|
|
20
|
-
File.write
|
20
|
+
File.write './config/domains.txt', list
|
data/script/reconcile-us
CHANGED
@@ -8,57 +8,62 @@
|
|
8
8
|
require './lib/gman/importer'
|
9
9
|
require 'yaml'
|
10
10
|
|
11
|
-
ENV[
|
12
|
-
blacklist = [
|
13
|
-
source =
|
11
|
+
ENV['RECONCILING'] = 'true'
|
12
|
+
blacklist = ['usagovQUASI']
|
13
|
+
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
14
14
|
|
15
15
|
data = open(source).read
|
16
|
-
data = data.split(
|
16
|
+
data = data.split('_' * 74)
|
17
17
|
data = data.last.strip
|
18
|
-
data = data.split(/\r?\n/).reject
|
18
|
+
data = data.split(/\r?\n/).reject(&:empty?)
|
19
19
|
|
20
20
|
domains = {}
|
21
|
-
group =
|
21
|
+
group = ''
|
22
22
|
data.each do |row|
|
23
23
|
if row =~ /^\w/
|
24
24
|
group = row
|
25
25
|
domains[group] = []
|
26
26
|
else
|
27
|
-
domains[group].push row.sub("\.\t",
|
27
|
+
domains[group].push row.sub("\.\t", '').strip
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
-
domains.reject! { |
|
31
|
+
domains.reject! { |g, _domain| blacklist.include?(g) }
|
32
32
|
importer = Gman::Importer.new(domains)
|
33
33
|
|
34
34
|
importer.logger.info "Starting with #{importer.domains.count} domains"
|
35
35
|
|
36
|
-
importer.domains.list.each do |
|
37
|
-
|
38
|
-
|
36
|
+
importer.domains.list.each do |_group, d|
|
37
|
+
d.map! { |domain| Gman.new(domain).to_s }
|
38
|
+
d.map! { |domain| importer.normalize_domain(domain) }
|
39
39
|
end
|
40
40
|
|
41
|
-
|
41
|
+
count = importer.domains.domains.count
|
42
|
+
importer.logger.info "Filtered down to #{count} normalized domains"
|
42
43
|
|
43
44
|
missing = {}
|
44
|
-
importer.domains.list.each do |
|
45
|
-
next unless importer.current.list[
|
46
|
-
missing[
|
45
|
+
importer.domains.list.each do |g, usagovdomains|
|
46
|
+
next unless importer.current.list[g]
|
47
|
+
missing[g] = importer.current.list[g] - usagovdomains
|
47
48
|
end
|
48
49
|
|
49
|
-
missing.reject! { |
|
50
|
+
missing.reject! { |_key, value| value.empty? }
|
50
51
|
|
51
|
-
|
52
|
+
count = missing.values.count
|
53
|
+
importer.logger.info "Found #{count} domains not on the USA.gov list"
|
52
54
|
puts "Here's the list of missing domains:"
|
53
55
|
puts YAML.dump(missing)
|
54
56
|
|
55
57
|
domains = importer.domains.domains
|
56
|
-
domains = domains.group_by
|
58
|
+
domains = domains.group_by do |domain|
|
59
|
+
importer.valid_domain?(domain, skip_dupe: true)
|
60
|
+
end
|
57
61
|
domains.delete(true)
|
58
62
|
domains.delete(false)
|
59
|
-
domains.delete(
|
63
|
+
domains.delete('locality')
|
60
64
|
|
61
|
-
|
65
|
+
count = domains.values.flatten.count
|
66
|
+
importer.logger.info "Calling out #{count} rejected domains"
|
62
67
|
|
63
|
-
puts
|
68
|
+
puts 'Here are the rejected domains and why they were rejected:'
|
64
69
|
puts YAML.dump(domains)
|
data/script/vendor-federal-de
CHANGED
@@ -5,10 +5,10 @@ require 'open-uri'
|
|
5
5
|
require './lib/gman'
|
6
6
|
require './lib/gman/importer'
|
7
7
|
|
8
|
-
url =
|
8
|
+
url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
|
9
9
|
|
10
|
-
domains = open(url).read.encode(
|
11
|
-
domains = CSV.parse(domains, :
|
12
|
-
domains = domains.map { |row| row[
|
10
|
+
domains = open(url).read.encode('UTF-8')
|
11
|
+
domains = CSV.parse(domains, headers: true)
|
12
|
+
domains = domains.map { |row| row['Domain Name'] }
|
13
13
|
|
14
|
-
Gman.import(
|
14
|
+
Gman.import('German Federal' => domains)
|
data/script/vendor-municipal-de
CHANGED
@@ -5,9 +5,9 @@ require 'open-uri'
|
|
5
5
|
require './lib/gman'
|
6
6
|
require './lib/gman/importer'
|
7
7
|
|
8
|
-
url =
|
8
|
+
url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
|
9
9
|
|
10
|
-
csv = open(url).read.force_encoding(
|
10
|
+
csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
|
11
11
|
|
12
12
|
# For some reason, the header row is actually the last row
|
13
13
|
# Pop the last line off the file and prepend it at the beginning
|
@@ -17,7 +17,7 @@ lines.unshift lines.pop
|
|
17
17
|
csv = lines.join("\n")
|
18
18
|
|
19
19
|
# Load municipal domains
|
20
|
-
data = CSV.parse(csv, :
|
21
|
-
domains = data.map { |row| row[
|
20
|
+
data = CSV.parse(csv, headers: true, col_sep: ';')
|
21
|
+
domains = data.map { |row| row['Internet'] }
|
22
22
|
|
23
|
-
Gman.import(
|
23
|
+
Gman.import('German Municipalities' => domains)
|
data/script/vendor-nl
CHANGED
@@ -3,8 +3,16 @@
|
|
3
3
|
|
4
4
|
require 'fileutils'
|
5
5
|
|
6
|
-
FileUtils.rm_rf(
|
7
|
-
|
8
|
-
|
6
|
+
FileUtils.rm_rf('almanak.overheid.nl')
|
7
|
+
commands = [
|
8
|
+
"wget -q -r -nc -np https://almanak.overheid.nl/
|
9
|
+
grep @ -rI almanak.overheid.nl/",
|
10
|
+
'cut -f 2 -d @',
|
11
|
+
"cut -f 1 -d '\"'",
|
12
|
+
'grep \\.nl$',
|
13
|
+
'sort',
|
14
|
+
'uniq'
|
15
|
+
]
|
16
|
+
domains = system commands.join('|')
|
9
17
|
|
10
|
-
Gman.import(
|
18
|
+
Gman.import('Netherlands' => domains.split("\n"))
|
data/script/vendor-public-suffix
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# Propagates an initial list of best-guess government domains
|
3
3
|
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require_relative
|
7
|
-
require_relative
|
4
|
+
require 'public_suffix'
|
5
|
+
require 'yaml'
|
6
|
+
require_relative '../lib/gman'
|
7
|
+
require_relative '../lib/gman/importer'
|
8
8
|
|
9
9
|
# https://gist.github.com/benbalter/6147066
|
10
10
|
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
|
@@ -15,13 +15,13 @@ PublicSuffix::List.default.each do |rule|
|
|
15
15
|
|
16
16
|
if rule.parts.length == 1
|
17
17
|
domain = rule.parts.first if ".#{rule.value}" =~ REGEX
|
18
|
-
|
19
|
-
domain = rule.parts.pop(2).join(
|
18
|
+
elsif ".#{rule.value}" =~ REGEX
|
19
|
+
domain = rule.parts.pop(2).join('.')
|
20
20
|
end
|
21
21
|
|
22
|
-
domains.push domain unless domain.nil?
|
22
|
+
domains.push domain unless domain.nil? || domains.include?(domain)
|
23
23
|
end
|
24
24
|
|
25
25
|
# Note: We want to skip resolution here, because a domain like `gov.sv` may be
|
26
26
|
# a valid TLD, not have any top-level sites, and we'd still want it listed
|
27
|
-
Gman.import({
|
27
|
+
Gman.import({ 'non-us gov' => domains }, skip_resolve: true)
|
data/script/vendor-se
CHANGED
@@ -5,15 +5,17 @@ require 'csv'
|
|
5
5
|
require './lib/gman'
|
6
6
|
require './lib/gman/importer'
|
7
7
|
|
8
|
-
url =
|
8
|
+
url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
|
9
9
|
agent = Mechanize.new
|
10
10
|
page = agent.get(url)
|
11
11
|
form = page.forms.first
|
12
|
-
form.radiobuttons.find { |r| r.value =
|
13
|
-
submit_button = form.buttons.find { |b| b.type ==
|
12
|
+
form.radiobuttons.find { |r| r.value = 'Textfil' }.check
|
13
|
+
submit_button = form.buttons.find { |b| b.type == 'submit' }
|
14
14
|
response = agent.submit(form, submit_button)
|
15
15
|
|
16
|
-
rows = CSV.parse(response.content, :
|
17
|
-
domains = rows.map
|
16
|
+
rows = CSV.parse(response.content, headers: true, col_sep: "\t")
|
17
|
+
domains = rows.map do |row|
|
18
|
+
row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
|
19
|
+
end
|
18
20
|
|
19
|
-
Gman.import(
|
21
|
+
Gman.import('Swedish Administrative Authorities' => domains)
|
data/script/vendor-us
CHANGED
@@ -12,24 +12,24 @@
|
|
12
12
|
|
13
13
|
require './lib/gman/importer'
|
14
14
|
|
15
|
-
blacklist =
|
16
|
-
source =
|
15
|
+
blacklist = %w(usagovQUASI usagovFEDgov)
|
16
|
+
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
17
17
|
|
18
18
|
data = open(source).read
|
19
|
-
data = data.split(
|
19
|
+
data = data.split('_' * 74)
|
20
20
|
data = data.last.strip
|
21
|
-
data = data.split(/\r?\n/).reject
|
21
|
+
data = data.split(/\r?\n/).reject(&:empty?)
|
22
22
|
|
23
23
|
domains = {}
|
24
|
-
group =
|
24
|
+
group = ''
|
25
25
|
data.each do |row|
|
26
26
|
if row =~ /^\w/
|
27
27
|
group = row
|
28
28
|
domains[group] = []
|
29
29
|
else
|
30
|
-
domains[group].push row.sub("\.\t",
|
30
|
+
domains[group].push row.sub("\.\t", '').strip
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
|
-
domains.reject! { |
|
34
|
+
domains.reject! { |g, _| blacklist.include?(g) }
|
35
35
|
Gman.import(domains)
|
File without changes
|
data/test/helper.rb
CHANGED
@@ -8,7 +8,7 @@ begin
|
|
8
8
|
Bundler.setup(:default, :development)
|
9
9
|
rescue Bundler::BundlerError => e
|
10
10
|
$stderr.puts e.message
|
11
|
-
$stderr.puts
|
11
|
+
$stderr.puts 'Run `bundle install` to install missing gems'
|
12
12
|
exit e.status_code
|
13
13
|
end
|
14
14
|
|
@@ -16,11 +16,25 @@ require 'shoulda'
|
|
16
16
|
|
17
17
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
18
18
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
19
|
-
require_relative
|
20
|
-
require_relative
|
19
|
+
require_relative '../lib/gman'
|
20
|
+
require_relative '../lib/gman/domain_list'
|
21
|
+
require_relative '../lib/gman/importer'
|
21
22
|
|
22
|
-
|
23
|
+
def bin_path(cmd = 'gman')
|
24
|
+
File.expand_path "../bin/#{cmd}", File.dirname(__FILE__)
|
25
|
+
end
|
23
26
|
|
24
27
|
def test_bin(*args)
|
25
|
-
|
28
|
+
Open3.capture2e('bundle', 'exec', bin_path, *args)
|
29
|
+
end
|
30
|
+
|
31
|
+
def fixture_path(fixture)
|
32
|
+
File.expand_path "./fixtures/#{fixture}", File.dirname(__FILE__)
|
33
|
+
end
|
34
|
+
|
35
|
+
def with_env(key, value)
|
36
|
+
old_env = ENV[key]
|
37
|
+
ENV[key] = value
|
38
|
+
yield
|
39
|
+
ENV[key] = old_env
|
26
40
|
end
|
data/test/test_gman.rb
CHANGED
@@ -1,57 +1,62 @@
|
|
1
1
|
require File.join(File.dirname(__FILE__), 'helper')
|
2
2
|
|
3
|
-
VALID = [
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
]
|
3
|
+
VALID = ['foo.gov',
|
4
|
+
'http://foo.mil',
|
5
|
+
'foo@bar.gc.ca',
|
6
|
+
'foo.gov.au',
|
7
|
+
'https://www.foo.gouv.fr',
|
8
|
+
'foo@ci.champaign.il.us',
|
9
|
+
'foo.bar.baz.gov.au',
|
10
|
+
'foo@bar.gov.uk',
|
11
|
+
'foo.gov',
|
12
|
+
'foo.fed.us',
|
13
|
+
'foo.state.il.us',
|
14
|
+
'state.il.us',
|
15
|
+
'foo@af.mil',
|
16
|
+
'foo.gov.in'
|
17
|
+
].freeze
|
18
18
|
|
19
|
-
INVALID = [
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
]
|
19
|
+
INVALID = ['foo.bar.com',
|
20
|
+
'bar@foo.biz',
|
21
|
+
'http://www.foo.biz',
|
22
|
+
'foo.uk',
|
23
|
+
'gov',
|
24
|
+
'foo@k12.champaign.il.us',
|
25
|
+
'foo@kii.gov.by',
|
26
|
+
'foo',
|
27
|
+
'',
|
28
|
+
nil,
|
29
|
+
' ',
|
30
|
+
'foo.city.il.us',
|
31
|
+
'foo.ci.il.us',
|
32
|
+
'foo.zx.us',
|
33
|
+
'foo@mail.gov.ua'
|
34
|
+
].freeze
|
35
35
|
|
36
36
|
class TestGman < Minitest::Test
|
37
|
-
|
38
37
|
VALID.each do |domain|
|
39
38
|
should "recognize #{domain} as a government domain" do
|
40
|
-
assert Gman
|
39
|
+
assert Gman.valid?(domain)
|
41
40
|
end
|
42
41
|
end
|
43
42
|
|
44
43
|
INVALID.each do |domain|
|
45
44
|
should "recognize #{domain} as a non-government domain" do
|
46
|
-
refute Gman
|
45
|
+
refute Gman.valid?(domain)
|
47
46
|
end
|
48
47
|
end
|
49
48
|
|
50
|
-
should
|
51
|
-
assert_equal false, Gman
|
49
|
+
should 'not allow educational domains' do
|
50
|
+
assert_equal false, Gman.valid?('foo@gwu.edu')
|
51
|
+
end
|
52
|
+
|
53
|
+
should 'returns the path to domains.txt' do
|
54
|
+
assert_equal true, File.exist?(Gman.list_path)
|
52
55
|
end
|
53
56
|
|
54
|
-
should
|
55
|
-
|
57
|
+
should 'stub domains when asked' do
|
58
|
+
with_env 'GMAN_STUB_DOMAINS', 'true' do
|
59
|
+
assert_equal fixture_path('domains.txt'), Gman.list_path
|
60
|
+
end
|
56
61
|
end
|
57
62
|
end
|