gman 6.0.1 → 7.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rake_tasks +0 -0
- data/.rubocop.yml +3 -0
- data/config/domains.txt +19 -32
- data/config/vendor/academic.txt +8039 -0
- data/config/vendor/dotgovs.csv +5560 -5560
- data/gman.gemspec +1 -1
- data/lib/gman.rb +22 -20
- data/lib/gman/domain_list.rb +107 -41
- data/lib/gman/identifier.rb +2 -2
- data/lib/gman/importer.rb +10 -21
- data/lib/gman/locality.rb +15 -15
- data/lib/gman/version.rb +1 -1
- data/script/vendor +1 -1
- data/script/vendor-federal-de +1 -2
- data/script/vendor-municipal-de +1 -2
- data/script/vendor-nl +2 -1
- data/script/vendor-public-suffix +1 -2
- data/script/vendor-se +1 -2
- data/script/vendor-swot +41 -0
- data/script/vendor-us +3 -2
- data/test/helper.rb +8 -0
- data/test/test_gman.rb +0 -6
- data/test/test_gman_domain_list.rb +112 -0
- data/test/test_gman_domains.rb +5 -6
- data/test/test_gman_importer.rb +26 -32
- metadata +20 -15
data/script/vendor-municipal-de
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
require 'csv'
|
4
4
|
require 'open-uri'
|
5
5
|
require './lib/gman'
|
6
|
-
require './lib/gman/importer'
|
7
6
|
|
8
7
|
url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
|
9
8
|
|
@@ -20,4 +19,4 @@ csv = lines.join("\n")
|
|
20
19
|
data = CSV.parse(csv, headers: true, col_sep: ';')
|
21
20
|
domains = data.map { |row| row['Internet'] }
|
22
21
|
|
23
|
-
Gman.
|
22
|
+
Gman::Importer.new('German Municipalities' => domains).import
|
data/script/vendor-nl
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
# See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
|
3
3
|
|
4
4
|
require 'fileutils'
|
5
|
+
require './lib/gman'
|
5
6
|
|
6
7
|
FileUtils.rm_rf('almanak.overheid.nl')
|
7
8
|
commands = [
|
@@ -15,4 +16,4 @@ commands = [
|
|
15
16
|
]
|
16
17
|
domains = system commands.join('|')
|
17
18
|
|
18
|
-
Gman.
|
19
|
+
Gman::Importer.new('Netherlands' => domains.split("\n")).import
|
data/script/vendor-public-suffix
CHANGED
@@ -4,7 +4,6 @@
|
|
4
4
|
require 'public_suffix'
|
5
5
|
require 'yaml'
|
6
6
|
require_relative '../lib/gman'
|
7
|
-
require_relative '../lib/gman/importer'
|
8
7
|
|
9
8
|
# https://gist.github.com/benbalter/6147066
|
10
9
|
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
|
@@ -24,4 +23,4 @@ end
|
|
24
23
|
|
25
24
|
# Note: We want to skip resolution here, because a domain like `gov.sv` may be
|
26
25
|
# a valid TLD, not have any top-level sites, and we'd still want it listed
|
27
|
-
Gman.
|
26
|
+
Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)
|
data/script/vendor-se
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
require 'mechanize'
|
4
4
|
require 'csv'
|
5
5
|
require './lib/gman'
|
6
|
-
require './lib/gman/importer'
|
7
6
|
|
8
7
|
url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
|
9
8
|
agent = Mechanize.new
|
@@ -18,4 +17,4 @@ domains = rows.map do |row|
|
|
18
17
|
row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
|
19
18
|
end
|
20
19
|
|
21
|
-
Gman.
|
20
|
+
Gman::Importer.new('Swedish Administrative Authorities' => domains).import
|
data/script/vendor-swot
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Vendors the Swot-maintained list of academic domains into config/vendor/academic.txt
|
4
|
+
# Source: https://github.com/leereilly/swot/
|
5
|
+
#
|
6
|
+
# Usage: script/vendor-swot
|
7
|
+
#
|
8
|
+
# Will automatically fetch latest version of the list and merge
|
9
|
+
# You can check for changes and commit via `git status`
|
10
|
+
#
|
11
|
+
# It's also probably a good idea to run `script/ci-build` for good measure
|
12
|
+
#
|
13
|
+
# Note: We do this, because as a bajillion individual files, Swot takes up 30MB
|
14
|
+
|
15
|
+
require './lib/gman'
|
16
|
+
require 'swot'
|
17
|
+
|
18
|
+
# Generate array of all Swot domains
|
19
|
+
domains = Swot.all_domains
|
20
|
+
domains << Swot::ACADEMIC_TLDS
|
21
|
+
|
22
|
+
# Init the importer, building a DomainList
|
23
|
+
group = "Academic domains vendored from Swot v#{Swot::VERSION}"
|
24
|
+
hash = { group => domains }
|
25
|
+
|
26
|
+
importer = Gman::Importer.new(hash)
|
27
|
+
importer.logger.info "Importing from Swot v#{Swot::VERSION}"
|
28
|
+
importer.logger.info "Found #{domains.count} academic domains"
|
29
|
+
|
30
|
+
domain_list = importer.domain_list
|
31
|
+
domain_list.path = Gman.academic_list_path
|
32
|
+
|
33
|
+
# Cleanup and write
|
34
|
+
# Note: we're not using the import method, as that assumes we're writing the
|
35
|
+
# government domain list and would use Swot to ensure domains aren't academic
|
36
|
+
importer.send :normalize_domains!
|
37
|
+
domain_list.data[group] << Swot::BLACKLIST.map { |domain| "!#{domain}" }
|
38
|
+
domain_list.data[group] = domain_list.data[group].flatten
|
39
|
+
domain_list.write
|
40
|
+
|
41
|
+
importer.logger.info "Vendored #{importer.domain_list.count} academic domains."
|
data/script/vendor-us
CHANGED
@@ -10,7 +10,8 @@
|
|
10
10
|
#
|
11
11
|
# It's also probably a good idea to run `script/ci-build` for good measure
|
12
12
|
|
13
|
-
require './lib/gman
|
13
|
+
require './lib/gman'
|
14
|
+
require 'open-uri'
|
14
15
|
|
15
16
|
blacklist = %w(usagovQUASI usagovFEDgov)
|
16
17
|
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
@@ -32,4 +33,4 @@ data.each do |row|
|
|
32
33
|
end
|
33
34
|
|
34
35
|
domains.reject! { |g, _| blacklist.include?(g) }
|
35
|
-
Gman.
|
36
|
+
Gman::Importer.new(domains).import
|
data/test/helper.rb
CHANGED
data/test/test_gman.rb
CHANGED
@@ -53,10 +53,4 @@ class TestGman < Minitest::Test
|
|
53
53
|
should 'returns the path to domains.txt' do
|
54
54
|
assert_equal true, File.exist?(Gman.list_path)
|
55
55
|
end
|
56
|
-
|
57
|
-
should 'stub domains when asked' do
|
58
|
-
with_env 'GMAN_STUB_DOMAINS', 'true' do
|
59
|
-
assert_equal fixture_path('domains.txt'), Gman.list_path
|
60
|
-
end
|
61
|
-
end
|
62
56
|
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'helper')
|
2
|
+
|
3
|
+
class TestGmanDomainList < Minitest::Test
|
4
|
+
INIT_TYPES = [:path, :contents, :data].freeze
|
5
|
+
|
6
|
+
def setup
|
7
|
+
@original_domain_list = File.read(stubbed_list_path)
|
8
|
+
end
|
9
|
+
|
10
|
+
def teardown
|
11
|
+
File.write stubbed_list_path, @original_domain_list
|
12
|
+
end
|
13
|
+
|
14
|
+
def domain_list(type)
|
15
|
+
case type
|
16
|
+
when :path
|
17
|
+
Gman::DomainList.new(path: Gman.list_path)
|
18
|
+
when :contents
|
19
|
+
contents = File.read(Gman.list_path)
|
20
|
+
Gman::DomainList.new(contents: contents)
|
21
|
+
when :data
|
22
|
+
data = Gman::DomainList.new(path: Gman.list_path).to_h
|
23
|
+
Gman::DomainList.new(data: data)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
INIT_TYPES.each do |type|
|
28
|
+
context "when initalized with #{type}" do
|
29
|
+
should 'store the init vars' do
|
30
|
+
refute domain_list(type).public_send(type).nil?
|
31
|
+
end
|
32
|
+
|
33
|
+
should 'return the domain data' do
|
34
|
+
list = domain_list(type)
|
35
|
+
assert list.data.key? 'Canada federal'
|
36
|
+
assert list.data.any? { |_key, values| values.include? 'gov' }
|
37
|
+
end
|
38
|
+
|
39
|
+
should 'return the list contents' do
|
40
|
+
list = domain_list(type)
|
41
|
+
assert_match(/^gov$/, list.contents)
|
42
|
+
end
|
43
|
+
|
44
|
+
should 'return the list path' do
|
45
|
+
list = domain_list(type)
|
46
|
+
assert_equal list.path, Gman.list_path
|
47
|
+
end
|
48
|
+
|
49
|
+
should 'return the public suffix parsed list' do
|
50
|
+
list = domain_list(type)
|
51
|
+
assert list.public_suffix_list.class == PublicSuffix::List
|
52
|
+
end
|
53
|
+
|
54
|
+
should 'know if a domain is valid' do
|
55
|
+
list = domain_list(type)
|
56
|
+
assert list.valid? 'whitehouse.gov'
|
57
|
+
end
|
58
|
+
|
59
|
+
should 'know if a domain is invalid' do
|
60
|
+
list = domain_list(type)
|
61
|
+
refute list.valid? 'example.com'
|
62
|
+
end
|
63
|
+
|
64
|
+
should 'return the domain groups' do
|
65
|
+
list = domain_list(type)
|
66
|
+
assert list.groups.include?('Canada federal')
|
67
|
+
end
|
68
|
+
|
69
|
+
should 'return the domains' do
|
70
|
+
list = domain_list(type)
|
71
|
+
assert list.domains.include?('gov')
|
72
|
+
end
|
73
|
+
|
74
|
+
should 'return the domain count' do
|
75
|
+
list = domain_list(type)
|
76
|
+
assert list.count.is_a?(Integer)
|
77
|
+
assert list.count > 100
|
78
|
+
end
|
79
|
+
|
80
|
+
should 'alphabetize the list' do
|
81
|
+
list = domain_list(type)
|
82
|
+
list.data['Canada municipal'].shuffle!
|
83
|
+
assert list.data['Canada municipal'].first != '100milehouse.com'
|
84
|
+
list.alphabetize
|
85
|
+
assert list.data['Canada municipal'].first == '100milehouse.com'
|
86
|
+
end
|
87
|
+
|
88
|
+
should 'write the list' do
|
89
|
+
list = domain_list(type)
|
90
|
+
list.instance_variable_set('@path', stubbed_list_path)
|
91
|
+
list.data = { 'foo' => ['bar.gov', 'baz.net'] }
|
92
|
+
list.write
|
93
|
+
contents = File.read(stubbed_list_path)
|
94
|
+
assert_match %r{^// foo$}, contents
|
95
|
+
expected = "// foo\nbar.gov\nbaz.net"
|
96
|
+
assert contents.include?(expected)
|
97
|
+
end
|
98
|
+
|
99
|
+
should 'output the list in public_suffix format' do
|
100
|
+
list = domain_list(type)
|
101
|
+
string = list.to_s
|
102
|
+
assert_match %r{^// Canada federal$}, string
|
103
|
+
assert string.include? "// Canada federal\ncanada\.ca\n"
|
104
|
+
end
|
105
|
+
|
106
|
+
should "find a domain's parent" do
|
107
|
+
list = domain_list(type)
|
108
|
+
assert_equal 'gov.uk', list.parent_domain('foo.gov.uk')
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
data/test/test_gman_domains.rb
CHANGED
@@ -20,13 +20,12 @@ class TestGmanDomains < Minitest::Test
|
|
20
20
|
end
|
21
21
|
|
22
22
|
invalid = []
|
23
|
-
|
24
|
-
|
23
|
+
options = { skip_dupe: true, skip_resolve: !resolve_domains? }
|
24
|
+
Gman.list.to_h.each do |group, domains|
|
25
25
|
next if WHITELIST.include?(group)
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
}
|
26
|
+
Parallel.each(domains, in_threads: 4) do |domain|
|
27
|
+
invalid.push(domain) unless importer.valid_domain?(domain, options)
|
28
|
+
end
|
30
29
|
end
|
31
30
|
assert_equal [], invalid.flatten.reject(&:empty?)
|
32
31
|
end
|
data/test/test_gman_importer.rb
CHANGED
@@ -6,21 +6,17 @@ class TestGManImporter < Minitest::Test
|
|
6
6
|
@stdout = StringIO.new
|
7
7
|
@importer.instance_variable_set '@logger', Logger.new(@stdout)
|
8
8
|
|
9
|
-
|
10
|
-
@original_domain_list = File.open(Gman.list_path).read
|
11
|
-
end
|
9
|
+
@original_domain_list = File.read(stubbed_list_path)
|
12
10
|
end
|
13
11
|
|
14
12
|
def teardown
|
15
|
-
|
16
|
-
File.write Gman.list_path, @original_domain_list
|
17
|
-
end
|
13
|
+
File.write stubbed_list_path, @original_domain_list
|
18
14
|
end
|
19
15
|
|
20
16
|
should 'init the domain list' do
|
21
|
-
assert_equal Gman::DomainList, @importer.
|
22
|
-
assert_equal 1, @importer.
|
23
|
-
assert_equal 'example.com', @importer.
|
17
|
+
assert_equal Gman::DomainList, @importer.domain_list.class
|
18
|
+
assert_equal 1, @importer.domain_list.count
|
19
|
+
assert_equal 'example.com', @importer.domain_list.domains.first
|
24
20
|
end
|
25
21
|
|
26
22
|
should 'init the logger' do
|
@@ -51,43 +47,41 @@ class TestGManImporter < Minitest::Test
|
|
51
47
|
should 'normalize domains within the domain list' do
|
52
48
|
importer = Gman::Importer.new 'test' => ['www.EXAMPLE.com/']
|
53
49
|
importer.send :normalize_domains!
|
54
|
-
assert_equal 'example.com', importer.
|
50
|
+
assert_equal 'example.com', importer.domain_list.domains.first
|
55
51
|
end
|
56
52
|
|
57
53
|
should 'remove invalid domains from the domain list' do
|
58
54
|
importer = Gman::Importer.new 'test' => ['foo.github.io', 'example.com']
|
59
55
|
importer.instance_variable_set '@logger', Logger.new(@stdout)
|
60
56
|
|
61
|
-
assert_equal 2, importer.
|
57
|
+
assert_equal 2, importer.domain_list.count
|
62
58
|
importer.send :ensure_validity!
|
63
|
-
assert_equal 1, importer.
|
59
|
+
assert_equal 1, importer.domain_list.count
|
64
60
|
end
|
65
61
|
|
66
62
|
context 'writing the domain list' do
|
67
63
|
should 'add domains to the current domain list' do
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
end
|
64
|
+
domains = { 'test' => ['example.com'], 'test2' => ['github.com'] }
|
65
|
+
importer = Gman::Importer.new domains
|
66
|
+
importer.instance_variable_set '@current', stubbed_list
|
67
|
+
importer.send :add_to_current
|
68
|
+
expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
|
69
|
+
assert_equal expected, File.open(stubbed_list_path).read
|
75
70
|
end
|
76
71
|
|
77
72
|
should 'import' do
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
end
|
73
|
+
domains = {
|
74
|
+
'test' => ['www.example.com', 'goo.github.io'],
|
75
|
+
'test2' => ['github.com', 'www.github.com', 'whitehouse.gov']
|
76
|
+
}
|
77
|
+
|
78
|
+
importer = Gman::Importer.new domains
|
79
|
+
importer.instance_variable_set '@current', stubbed_list
|
80
|
+
importer.instance_variable_set '@logger', Logger.new(@stdout)
|
81
|
+
importer.import(skip_resolve: true)
|
82
|
+
|
83
|
+
expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
|
84
|
+
assert_equal expected, File.open(stubbed_list_path).read
|
91
85
|
end
|
92
86
|
end
|
93
87
|
end
|
metadata
CHANGED
@@ -1,71 +1,71 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gman
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 7.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: iso_country_codes
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0.6'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0.6'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: naughty_or_nice
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '0
|
33
|
+
version: '2.0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '0
|
40
|
+
version: '2.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: colored
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '2
|
47
|
+
version: '1.2'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '2
|
54
|
+
version: '1.2'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: swot
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
62
|
-
type: :
|
61
|
+
version: '1.0'
|
62
|
+
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '1.
|
68
|
+
version: '1.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -217,6 +217,7 @@ extensions: []
|
|
217
217
|
extra_rdoc_files: []
|
218
218
|
files:
|
219
219
|
- ".gitignore"
|
220
|
+
- ".rake_tasks"
|
220
221
|
- ".rubocop.yml"
|
221
222
|
- ".ruby-version"
|
222
223
|
- ".travis.yml"
|
@@ -228,6 +229,7 @@ files:
|
|
228
229
|
- bin/gman
|
229
230
|
- bin/gman_filter
|
230
231
|
- config/domains.txt
|
232
|
+
- config/vendor/academic.txt
|
231
233
|
- config/vendor/dotgovs.csv
|
232
234
|
- gman.gemspec
|
233
235
|
- lib/gman.rb
|
@@ -254,6 +256,7 @@ files:
|
|
254
256
|
- script/vendor-nl
|
255
257
|
- script/vendor-public-suffix
|
256
258
|
- script/vendor-se
|
259
|
+
- script/vendor-swot
|
257
260
|
- script/vendor-us
|
258
261
|
- test/fixtures/domains.txt
|
259
262
|
- test/fixtures/obama.txt
|
@@ -261,6 +264,7 @@ files:
|
|
261
264
|
- test/test_gman.rb
|
262
265
|
- test/test_gman_bin.rb
|
263
266
|
- test/test_gman_country_codes.rb
|
267
|
+
- test/test_gman_domain_list.rb
|
264
268
|
- test/test_gman_domains.rb
|
265
269
|
- test/test_gman_filter.rb
|
266
270
|
- test/test_gman_identifier.rb
|
@@ -297,6 +301,7 @@ test_files:
|
|
297
301
|
- test/test_gman.rb
|
298
302
|
- test/test_gman_bin.rb
|
299
303
|
- test/test_gman_country_codes.rb
|
304
|
+
- test/test_gman_domain_list.rb
|
300
305
|
- test/test_gman_domains.rb
|
301
306
|
- test/test_gman_filter.rb
|
302
307
|
- test/test_gman_identifier.rb
|