gman 5.0.9 → 6.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -1
- data/Gemfile +1 -0
- data/README.md +16 -22
- data/Rakefile +3 -3
- data/bin/gman +10 -11
- data/bin/gman_filter +7 -7
- data/config/domains.txt +19 -19
- data/config/vendor/dotgovs.csv +398 -355
- data/gman.gemspec +34 -27
- data/lib/gman.rb +29 -23
- data/lib/gman/country_codes.rb +14 -15
- data/lib/gman/domain_list.rb +34 -25
- data/lib/gman/identifier.rb +39 -43
- data/lib/gman/importer.rb +111 -61
- data/lib/gman/locality.rb +22 -10
- data/lib/gman/version.rb +1 -1
- data/script/add +2 -2
- data/script/alphabetize +2 -2
- data/script/cibuild +2 -0
- data/script/dedupe +2 -2
- data/script/profile +5 -2
- data/script/prune +7 -7
- data/script/reconcile-us +26 -21
- data/script/vendor-federal-de +5 -5
- data/script/vendor-municipal-de +5 -5
- data/script/vendor-nl +12 -4
- data/script/vendor-public-suffix +8 -8
- data/script/vendor-se +8 -6
- data/script/vendor-us +7 -7
- data/test/fixtures/domains.txt +2 -0
- data/test/{obama.txt → fixtures/obama.txt} +0 -0
- data/test/helper.rb +19 -5
- data/test/test_gman.rb +43 -38
- data/test/test_gman_bin.rb +37 -43
- data/test/test_gman_country_codes.rb +10 -6
- data/test/test_gman_domains.rb +15 -10
- data/test/test_gman_filter.rb +5 -7
- data/test/test_gman_identifier.rb +36 -35
- data/test/test_gman_importer.rb +250 -0
- data/test/test_gman_locality.rb +5 -5
- metadata +28 -10
- data/lib/gman/sanctions.rb +0 -29
- data/test/test_gman_sanctions.rb +0 -20
data/lib/gman/locality.rb
CHANGED
@@ -1,14 +1,26 @@
|
|
1
1
|
class Gman
|
2
|
+
class Locality
|
3
|
+
AFFINITY_NAMESPACES = %w(state dst cog).freeze
|
2
4
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
5
|
+
STATES = %w(
|
6
|
+
ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
|
7
|
+
la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
|
8
|
+
ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
|
9
|
+
).freeze
|
10
|
+
|
11
|
+
LOCALITY_DOMAINS = %w(
|
12
|
+
ci co borough boro city county
|
13
|
+
parish town twp vi vil village
|
14
|
+
).freeze
|
15
|
+
|
16
|
+
REGEX = /
|
17
|
+
(
|
18
|
+
(#{Regexp.union(AFFINITY_NAMESPACES)})
|
19
|
+
|
|
20
|
+
(#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
|
21
|
+
)\.(#{Regexp.union(STATES)})\.us
|
22
|
+
/x
|
23
|
+
end
|
12
24
|
|
13
25
|
# Second level .us domains for states and locality
|
14
26
|
# See http://en.wikipedia.org/wiki/.us
|
@@ -23,6 +35,6 @@ class Gman
|
|
23
35
|
# * k12.il.us
|
24
36
|
# * ci.foo.zx.us
|
25
37
|
def locality?
|
26
|
-
|
38
|
+
!domain.to_s.match(Locality::REGEX).nil?
|
27
39
|
end
|
28
40
|
end
|
data/lib/gman/version.rb
CHANGED
data/script/add
CHANGED
@@ -7,11 +7,11 @@
|
|
7
7
|
require './lib/gman/importer'
|
8
8
|
|
9
9
|
if ARGV.length < 2
|
10
|
-
puts
|
10
|
+
puts 'Usage: script/add [GROUP] [DOMAIN(S)]'
|
11
11
|
exit 1
|
12
12
|
end
|
13
13
|
|
14
14
|
group = ARGV[0]
|
15
15
|
domains = ARGV.drop(1)
|
16
16
|
|
17
|
-
Gman.import(
|
17
|
+
Gman.import(group => domains)
|
data/script/alphabetize
CHANGED
data/script/cibuild
CHANGED
data/script/dedupe
CHANGED
@@ -7,7 +7,7 @@ require './lib/gman/importer'
|
|
7
7
|
|
8
8
|
current = Gman::DomainList.current
|
9
9
|
|
10
|
-
puts
|
10
|
+
puts 'Checking for duplicate domains in the domain list...'
|
11
11
|
puts "Current list contains #{current.count} domains..."
|
12
12
|
|
13
13
|
dupe = current.count - current.domains.uniq.count
|
@@ -16,6 +16,6 @@ exit 0 if dupe == 0
|
|
16
16
|
|
17
17
|
dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
|
18
18
|
|
19
|
-
puts
|
19
|
+
puts 'Duplicate domains:'
|
20
20
|
puts dupes
|
21
21
|
exit 1
|
data/script/profile
CHANGED
@@ -3,9 +3,12 @@
|
|
3
3
|
require 'ruby-prof'
|
4
4
|
require './lib/gman'
|
5
5
|
|
6
|
-
# Pick N random domains directly,
|
6
|
+
# Pick N random domains directly,
|
7
|
+
# without pre-loading the Gman list for an accurate benchmark
|
7
8
|
count = (ARGV[0] || 100).to_i
|
8
|
-
domains = File.readlines(
|
9
|
+
domains = File.readlines('./config/domains.txt')
|
10
|
+
domains = domains.select { |l| l =~ /^[a-z0-9]/i }
|
11
|
+
domains = domains.sample(count)
|
9
12
|
|
10
13
|
RubyProf.start
|
11
14
|
domains.each do |domain|
|
data/script/prune
CHANGED
@@ -1,20 +1,20 @@
|
|
1
|
-
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
# Given an array of domains, removes them from the list
|
3
3
|
# Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
|
4
4
|
|
5
|
-
require_relative
|
6
|
-
require_relative
|
5
|
+
require_relative '../lib/gman'
|
6
|
+
require_relative '../lib/gman/domain_list'
|
7
7
|
|
8
8
|
domains = ARGV
|
9
|
-
domains = domains.clone.map { |d| d.
|
9
|
+
domains = domains.clone.map { |d| d.delete ',' }
|
10
10
|
|
11
|
-
list = File.open(
|
11
|
+
list = File.open('./config/domains.txt').read
|
12
12
|
puts "Starting list: #{Gman::DomainList.current.count} domains"
|
13
13
|
|
14
14
|
domains.each do |domain|
|
15
|
-
list.gsub!
|
15
|
+
# Escape the domain so "." matches a literal dot rather than any character;
# otherwise pruning "foo.invalid" could also remove e.g. "fooXinvalid".
list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
|
16
16
|
end
|
17
17
|
|
18
18
|
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
19
19
|
|
20
|
-
File.write
|
20
|
+
File.write './config/domains.txt', list
|
data/script/reconcile-us
CHANGED
@@ -8,57 +8,62 @@
|
|
8
8
|
require './lib/gman/importer'
|
9
9
|
require 'yaml'
|
10
10
|
|
11
|
-
ENV[
|
12
|
-
blacklist = [
|
13
|
-
source =
|
11
|
+
ENV['RECONCILING'] = 'true'
|
12
|
+
blacklist = ['usagovQUASI']
|
13
|
+
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
14
14
|
|
15
15
|
data = open(source).read
|
16
|
-
data = data.split(
|
16
|
+
data = data.split('_' * 74)
|
17
17
|
data = data.last.strip
|
18
|
-
data = data.split(/\r?\n/).reject
|
18
|
+
data = data.split(/\r?\n/).reject(&:empty?)
|
19
19
|
|
20
20
|
domains = {}
|
21
|
-
group =
|
21
|
+
group = ''
|
22
22
|
data.each do |row|
|
23
23
|
if row =~ /^\w/
|
24
24
|
group = row
|
25
25
|
domains[group] = []
|
26
26
|
else
|
27
|
-
domains[group].push row.sub("\.\t",
|
27
|
+
domains[group].push row.sub("\.\t", '').strip
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
-
domains.reject! { |
|
31
|
+
domains.reject! { |g, _domain| blacklist.include?(g) }
|
32
32
|
importer = Gman::Importer.new(domains)
|
33
33
|
|
34
34
|
importer.logger.info "Starting with #{importer.domains.count} domains"
|
35
35
|
|
36
|
-
importer.domains.list.each do |
|
37
|
-
|
38
|
-
|
36
|
+
importer.domains.list.each do |_group, d|
|
37
|
+
d.map! { |domain| Gman.new(domain).to_s }
|
38
|
+
d.map! { |domain| importer.normalize_domain(domain) }
|
39
39
|
end
|
40
40
|
|
41
|
-
|
41
|
+
count = importer.domains.domains.count
|
42
|
+
importer.logger.info "Filtered down to #{count} normalized domains"
|
42
43
|
|
43
44
|
missing = {}
|
44
|
-
importer.domains.list.each do |
|
45
|
-
next unless importer.current.list[
|
46
|
-
missing[
|
45
|
+
importer.domains.list.each do |g, usagovdomains|
|
46
|
+
next unless importer.current.list[g]
|
47
|
+
missing[g] = importer.current.list[g] - usagovdomains
|
47
48
|
end
|
48
49
|
|
49
|
-
missing.reject! { |
|
50
|
+
missing.reject! { |_key, value| value.empty? }
|
50
51
|
|
51
|
-
|
52
|
+
# missing maps group => [domains]; flatten so the log line reports the number
# of domains rather than the number of groups.
count = missing.values.flatten.count
|
53
|
+
importer.logger.info "Found #{count} domains not on the USA.gov list"
|
52
54
|
puts "Here's the list of missing domains:"
|
53
55
|
puts YAML.dump(missing)
|
54
56
|
|
55
57
|
domains = importer.domains.domains
|
56
|
-
domains = domains.group_by
|
58
|
+
domains = domains.group_by do |domain|
|
59
|
+
importer.valid_domain?(domain, skip_dupe: true)
|
60
|
+
end
|
57
61
|
domains.delete(true)
|
58
62
|
domains.delete(false)
|
59
|
-
domains.delete(
|
63
|
+
domains.delete('locality')
|
60
64
|
|
61
|
-
|
65
|
+
count = domains.values.flatten.count
|
66
|
+
importer.logger.info "Calling out #{count} rejected domains"
|
62
67
|
|
63
|
-
puts
|
68
|
+
puts 'Here are the rejected domains and why they were rejected:'
|
64
69
|
puts YAML.dump(domains)
|
data/script/vendor-federal-de
CHANGED
@@ -5,10 +5,10 @@ require 'open-uri'
|
|
5
5
|
require './lib/gman'
|
6
6
|
require './lib/gman/importer'
|
7
7
|
|
8
|
-
url =
|
8
|
+
url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
|
9
9
|
|
10
|
-
domains = open(url).read.encode(
|
11
|
-
domains = CSV.parse(domains, :
|
12
|
-
domains = domains.map { |row| row[
|
10
|
+
domains = open(url).read.encode('UTF-8')
|
11
|
+
domains = CSV.parse(domains, headers: true)
|
12
|
+
domains = domains.map { |row| row['Domain Name'] }
|
13
13
|
|
14
|
-
Gman.import(
|
14
|
+
Gman.import('German Federal' => domains)
|
data/script/vendor-municipal-de
CHANGED
@@ -5,9 +5,9 @@ require 'open-uri'
|
|
5
5
|
require './lib/gman'
|
6
6
|
require './lib/gman/importer'
|
7
7
|
|
8
|
-
url =
|
8
|
+
url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
|
9
9
|
|
10
|
-
csv = open(url).read.force_encoding(
|
10
|
+
csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
|
11
11
|
|
12
12
|
# For some reason, the header row is actually the last row
|
13
13
|
# Pop the last line off the file and prepend it at the beginning
|
@@ -17,7 +17,7 @@ lines.unshift lines.pop
|
|
17
17
|
csv = lines.join("\n")
|
18
18
|
|
19
19
|
# Load municipal domains
|
20
|
-
data = CSV.parse(csv, :
|
21
|
-
domains = data.map { |row| row[
|
20
|
+
data = CSV.parse(csv, headers: true, col_sep: ';')
|
21
|
+
domains = data.map { |row| row['Internet'] }
|
22
22
|
|
23
|
-
Gman.import(
|
23
|
+
Gman.import('German Municipalities' => domains)
|
data/script/vendor-nl
CHANGED
@@ -3,8 +3,16 @@
|
|
3
3
|
|
4
4
|
require 'fileutils'
|
5
5
|
|
6
|
-
FileUtils.rm_rf(
|
7
|
-
|
8
|
-
|
6
|
+
FileUtils.rm_rf('almanak.overheid.nl')
|
7
|
+
commands = [
|
8
|
+
"wget -q -r -nc -np https://almanak.overheid.nl/
|
9
|
+
grep @ -rI almanak.overheid.nl/",
|
10
|
+
'cut -f 2 -d @',
|
11
|
+
"cut -f 1 -d '\"'",
|
12
|
+
'grep \\.nl$',
|
13
|
+
'sort',
|
14
|
+
'uniq'
|
15
|
+
]
|
16
|
+
# Kernel#system only returns true/false/nil; use backticks to capture the
# pipeline's stdout as a String so it can be split into domains afterwards.
domains = `#{commands.join('|')}`
|
9
17
|
|
10
|
-
Gman.import(
|
18
|
+
Gman.import('Netherlands' => domains.split("\n"))
|
data/script/vendor-public-suffix
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# Propagates an initial list of best-guess government domains
|
3
3
|
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require_relative
|
7
|
-
require_relative
|
4
|
+
require 'public_suffix'
|
5
|
+
require 'yaml'
|
6
|
+
require_relative '../lib/gman'
|
7
|
+
require_relative '../lib/gman/importer'
|
8
8
|
|
9
9
|
# https://gist.github.com/benbalter/6147066
|
10
10
|
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
|
@@ -15,13 +15,13 @@ PublicSuffix::List.default.each do |rule|
|
|
15
15
|
|
16
16
|
if rule.parts.length == 1
|
17
17
|
domain = rule.parts.first if ".#{rule.value}" =~ REGEX
|
18
|
-
|
19
|
-
domain = rule.parts.pop(2).join(
|
18
|
+
elsif ".#{rule.value}" =~ REGEX
|
19
|
+
domain = rule.parts.pop(2).join('.')
|
20
20
|
end
|
21
21
|
|
22
|
-
domains.push domain unless domain.nil?
|
22
|
+
domains.push domain unless domain.nil? || domains.include?(domain)
|
23
23
|
end
|
24
24
|
|
25
25
|
# Note: We want to skip resolution here, because a domain like `gov.sv` may be
|
26
26
|
# a valid TLD, not have any top-level sites, and we'd still want it listed
|
27
|
-
Gman.import({
|
27
|
+
Gman.import({ 'non-us gov' => domains }, skip_resolve: true)
|
data/script/vendor-se
CHANGED
@@ -5,15 +5,17 @@ require 'csv'
|
|
5
5
|
require './lib/gman'
|
6
6
|
require './lib/gman/importer'
|
7
7
|
|
8
|
-
url =
|
8
|
+
url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
|
9
9
|
agent = Mechanize.new
|
10
10
|
page = agent.get(url)
|
11
11
|
form = page.forms.first
|
12
|
-
form.radiobuttons.find { |r| r.value =
|
13
|
-
submit_button = form.buttons.find { |b| b.type ==
|
12
|
+
# Compare (==) rather than assign (=): select the radio button whose value is
# already 'Textfil' instead of overwriting the first button's value.
form.radiobuttons.find { |r| r.value == 'Textfil' }.check
|
13
|
+
submit_button = form.buttons.find { |b| b.type == 'submit' }
|
14
14
|
response = agent.submit(form, submit_button)
|
15
15
|
|
16
|
-
rows = CSV.parse(response.content, :
|
17
|
-
domains = rows.map
|
16
|
+
rows = CSV.parse(response.content, headers: true, col_sep: "\t")
|
17
|
+
domains = rows.map do |row|
|
18
|
+
row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
|
19
|
+
end
|
18
20
|
|
19
|
-
Gman.import(
|
21
|
+
Gman.import('Swedish Administrative Authorities' => domains)
|
data/script/vendor-us
CHANGED
@@ -12,24 +12,24 @@
|
|
12
12
|
|
13
13
|
require './lib/gman/importer'
|
14
14
|
|
15
|
-
blacklist =
|
16
|
-
source =
|
15
|
+
blacklist = %w(usagovQUASI usagovFEDgov)
|
16
|
+
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
17
17
|
|
18
18
|
data = open(source).read
|
19
|
-
data = data.split(
|
19
|
+
data = data.split('_' * 74)
|
20
20
|
data = data.last.strip
|
21
|
-
data = data.split(/\r?\n/).reject
|
21
|
+
data = data.split(/\r?\n/).reject(&:empty?)
|
22
22
|
|
23
23
|
domains = {}
|
24
|
-
group =
|
24
|
+
group = ''
|
25
25
|
data.each do |row|
|
26
26
|
if row =~ /^\w/
|
27
27
|
group = row
|
28
28
|
domains[group] = []
|
29
29
|
else
|
30
|
-
domains[group].push row.sub("\.\t",
|
30
|
+
domains[group].push row.sub("\.\t", '').strip
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
|
-
domains.reject! { |
|
34
|
+
domains.reject! { |g, _| blacklist.include?(g) }
|
35
35
|
Gman.import(domains)
|
File without changes
|
data/test/helper.rb
CHANGED
@@ -8,7 +8,7 @@ begin
|
|
8
8
|
Bundler.setup(:default, :development)
|
9
9
|
rescue Bundler::BundlerError => e
|
10
10
|
$stderr.puts e.message
|
11
|
-
$stderr.puts
|
11
|
+
$stderr.puts 'Run `bundle install` to install missing gems'
|
12
12
|
exit e.status_code
|
13
13
|
end
|
14
14
|
|
@@ -16,11 +16,25 @@ require 'shoulda'
|
|
16
16
|
|
17
17
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
18
18
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
19
|
-
require_relative
|
20
|
-
require_relative
|
19
|
+
require_relative '../lib/gman'
|
20
|
+
require_relative '../lib/gman/domain_list'
|
21
|
+
require_relative '../lib/gman/importer'
|
21
22
|
|
22
|
-
|
23
|
+
def bin_path(cmd = 'gman')
|
24
|
+
File.expand_path "../bin/#{cmd}", File.dirname(__FILE__)
|
25
|
+
end
|
23
26
|
|
24
27
|
def test_bin(*args)
|
25
|
-
|
28
|
+
Open3.capture2e('bundle', 'exec', bin_path, *args)
|
29
|
+
end
|
30
|
+
|
31
|
+
def fixture_path(fixture)
|
32
|
+
File.expand_path "./fixtures/#{fixture}", File.dirname(__FILE__)
|
33
|
+
end
|
34
|
+
|
35
|
+
# Temporarily sets the environment variable +key+ to +value+ while the given
# block runs, then restores the previous value.
#
# key   - String name of the environment variable.
# value - String value to set for the duration of the block (nil unsets it).
#
# The restore happens in an +ensure+ clause so that a raising block (for
# example a failed assertion) cannot leak the temporary value into later tests.
# Returns whatever the block returns.
def with_env(key, value)
  old_env = ENV[key]
  ENV[key] = value
  yield
ensure
  # Assigning nil removes the variable, which restores a previously-unset key.
  ENV[key] = old_env
end
|
data/test/test_gman.rb
CHANGED
@@ -1,57 +1,62 @@
|
|
1
1
|
require File.join(File.dirname(__FILE__), 'helper')
|
2
2
|
|
3
|
-
VALID = [
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
]
|
3
|
+
VALID = ['foo.gov',
|
4
|
+
'http://foo.mil',
|
5
|
+
'foo@bar.gc.ca',
|
6
|
+
'foo.gov.au',
|
7
|
+
'https://www.foo.gouv.fr',
|
8
|
+
'foo@ci.champaign.il.us',
|
9
|
+
'foo.bar.baz.gov.au',
|
10
|
+
'foo@bar.gov.uk',
|
11
|
+
'foo.gov',
|
12
|
+
'foo.fed.us',
|
13
|
+
'foo.state.il.us',
|
14
|
+
'state.il.us',
|
15
|
+
'foo@af.mil',
|
16
|
+
'foo.gov.in'
|
17
|
+
].freeze
|
18
18
|
|
19
|
-
INVALID = [
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
]
|
19
|
+
INVALID = ['foo.bar.com',
|
20
|
+
'bar@foo.biz',
|
21
|
+
'http://www.foo.biz',
|
22
|
+
'foo.uk',
|
23
|
+
'gov',
|
24
|
+
'foo@k12.champaign.il.us',
|
25
|
+
'foo@kii.gov.by',
|
26
|
+
'foo',
|
27
|
+
'',
|
28
|
+
nil,
|
29
|
+
' ',
|
30
|
+
'foo.city.il.us',
|
31
|
+
'foo.ci.il.us',
|
32
|
+
'foo.zx.us',
|
33
|
+
'foo@mail.gov.ua'
|
34
|
+
].freeze
|
35
35
|
|
36
36
|
class TestGman < Minitest::Test
|
37
|
-
|
38
37
|
VALID.each do |domain|
|
39
38
|
should "recognize #{domain} as a government domain" do
|
40
|
-
assert Gman
|
39
|
+
assert Gman.valid?(domain)
|
41
40
|
end
|
42
41
|
end
|
43
42
|
|
44
43
|
INVALID.each do |domain|
|
45
44
|
should "recognize #{domain} as a non-government domain" do
|
46
|
-
refute Gman
|
45
|
+
refute Gman.valid?(domain)
|
47
46
|
end
|
48
47
|
end
|
49
48
|
|
50
|
-
should
|
51
|
-
assert_equal false, Gman
|
49
|
+
should 'not allow educational domains' do
|
50
|
+
assert_equal false, Gman.valid?('foo@gwu.edu')
|
51
|
+
end
|
52
|
+
|
53
|
+
should 'returns the path to domains.txt' do
|
54
|
+
assert_equal true, File.exist?(Gman.list_path)
|
52
55
|
end
|
53
56
|
|
54
|
-
should
|
55
|
-
|
57
|
+
should 'stub domains when asked' do
|
58
|
+
with_env 'GMAN_STUB_DOMAINS', 'true' do
|
59
|
+
assert_equal fixture_path('domains.txt'), Gman.list_path
|
60
|
+
end
|
56
61
|
end
|
57
62
|
end
|