gman 6.0.1 → 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rake_tasks +0 -0
- data/.rubocop.yml +3 -0
- data/config/domains.txt +19 -32
- data/config/vendor/academic.txt +8039 -0
- data/config/vendor/dotgovs.csv +5560 -5560
- data/gman.gemspec +1 -1
- data/lib/gman.rb +22 -20
- data/lib/gman/domain_list.rb +107 -41
- data/lib/gman/identifier.rb +2 -2
- data/lib/gman/importer.rb +10 -21
- data/lib/gman/locality.rb +15 -15
- data/lib/gman/version.rb +1 -1
- data/script/vendor +1 -1
- data/script/vendor-federal-de +1 -2
- data/script/vendor-municipal-de +1 -2
- data/script/vendor-nl +2 -1
- data/script/vendor-public-suffix +1 -2
- data/script/vendor-se +1 -2
- data/script/vendor-swot +41 -0
- data/script/vendor-us +3 -2
- data/test/helper.rb +8 -0
- data/test/test_gman.rb +0 -6
- data/test/test_gman_domain_list.rb +112 -0
- data/test/test_gman_domains.rb +5 -6
- data/test/test_gman_importer.rb +26 -32
- metadata +20 -15
data/script/vendor-municipal-de
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
require 'csv'
|
4
4
|
require 'open-uri'
|
5
5
|
require './lib/gman'
|
6
|
-
require './lib/gman/importer'
|
7
6
|
|
8
7
|
url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
|
9
8
|
|
@@ -20,4 +19,4 @@ csv = lines.join("\n")
|
|
20
19
|
data = CSV.parse(csv, headers: true, col_sep: ';')
|
21
20
|
domains = data.map { |row| row['Internet'] }
|
22
21
|
|
23
|
-
Gman.
|
22
|
+
Gman::Importer.new('German Municipalities' => domains).import
|
data/script/vendor-nl
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
# See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
|
3
3
|
|
4
4
|
require 'fileutils'
|
5
|
+
require './lib/gman'
|
5
6
|
|
6
7
|
FileUtils.rm_rf('almanak.overheid.nl')
|
7
8
|
commands = [
|
@@ -15,4 +16,4 @@ commands = [
|
|
15
16
|
]
|
16
17
|
domains = system commands.join('|')
|
17
18
|
|
18
|
-
Gman.
|
19
|
+
Gman::Importer.new('Netherlands' => domains.split("\n")).import
|
data/script/vendor-public-suffix
CHANGED
@@ -4,7 +4,6 @@
|
|
4
4
|
require 'public_suffix'
|
5
5
|
require 'yaml'
|
6
6
|
require_relative '../lib/gman'
|
7
|
-
require_relative '../lib/gman/importer'
|
8
7
|
|
9
8
|
# https://gist.github.com/benbalter/6147066
|
10
9
|
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
|
@@ -24,4 +23,4 @@ end
|
|
24
23
|
|
25
24
|
# Note: We want to skip resolution here, because a domain like `gov.sv` may be
|
26
25
|
# a valid TLD, not have any top-level sites, and we'd still want it listed
|
27
|
-
Gman.
|
26
|
+
Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)
|
data/script/vendor-se
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
require 'mechanize'
|
4
4
|
require 'csv'
|
5
5
|
require './lib/gman'
|
6
|
-
require './lib/gman/importer'
|
7
6
|
|
8
7
|
url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
|
9
8
|
agent = Mechanize.new
|
@@ -18,4 +17,4 @@ domains = rows.map do |row|
|
|
18
17
|
row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
|
19
18
|
end
|
20
19
|
|
21
|
-
Gman.
|
20
|
+
Gman::Importer.new('Swedish Administrative Authorities' => domains).import
|
data/script/vendor-swot
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Vendors the Swot-maintained list of academic domains into config/academic.txt
|
4
|
+
# Source: https://github.com/leereilly/swot/
|
5
|
+
#
|
6
|
+
# Usage: script/vendor-swot
|
7
|
+
#
|
8
|
+
# Will automatically fetch latest version of the list and merge
|
9
|
+
# You can check for changes and commit via `git status`
|
10
|
+
#
|
11
|
+
# It's also probably a good idea to run `script/ci-build` for good measure
|
12
|
+
#
|
13
|
+
# Note: We do this, because as a bajillion individual files, Swot takes up 30MB
|
14
|
+
|
15
|
+
require './lib/gman'
|
16
|
+
require 'swot'
|
17
|
+
|
18
|
+
# Generate array of all Swot domains
|
19
|
+
domains = Swot.all_domains
|
20
|
+
domains << Swot::ACADEMIC_TLDS
|
21
|
+
|
22
|
+
# Init the importer, building a DomainList
|
23
|
+
group = "Academic domains vendored from Swot v#{Swot::VERSION}"
|
24
|
+
hash = { group => domains }
|
25
|
+
|
26
|
+
importer = Gman::Importer.new(hash)
|
27
|
+
importer.logger.info "Importing from Swot v#{Swot::VERSION}"
|
28
|
+
importer.logger.info "Found #{domains.count} academic domains"
|
29
|
+
|
30
|
+
domain_list = importer.domain_list
|
31
|
+
domain_list.path = Gman.academic_list_path
|
32
|
+
|
33
|
+
# Cleanup and write
|
34
|
+
# Note: we're not using the import method, as that assumes we're writing the
|
35
|
+
# government domain list and would use Swot to ensure domains aren't academic
|
36
|
+
importer.send :normalize_domains!
|
37
|
+
domain_list.data[group] << Swot::BLACKLIST.map { |domain| "!#{domain}" }
|
38
|
+
domain_list.data[group] = domain_list.data[group].flatten
|
39
|
+
domain_list.write
|
40
|
+
|
41
|
+
importer.logger.info "Vendored #{importer.domain_list.count} academic domains."
|
data/script/vendor-us
CHANGED
@@ -10,7 +10,8 @@
|
|
10
10
|
#
|
11
11
|
# It's also probably a good idea to run `script/ci-build` for good measure
|
12
12
|
|
13
|
-
require './lib/gman
|
13
|
+
require './lib/gman'
|
14
|
+
require 'open-uri'
|
14
15
|
|
15
16
|
blacklist = %w(usagovQUASI usagovFEDgov)
|
16
17
|
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
|
@@ -32,4 +33,4 @@ data.each do |row|
|
|
32
33
|
end
|
33
34
|
|
34
35
|
domains.reject! { |g, _| blacklist.include?(g) }
|
35
|
-
Gman.
|
36
|
+
Gman::Importer.new(domains).import
|
data/test/helper.rb
CHANGED
data/test/test_gman.rb
CHANGED
@@ -53,10 +53,4 @@ class TestGman < Minitest::Test
|
|
53
53
|
should 'returns the path to domains.txt' do
|
54
54
|
assert_equal true, File.exist?(Gman.list_path)
|
55
55
|
end
|
56
|
-
|
57
|
-
should 'stub domains when asked' do
|
58
|
-
with_env 'GMAN_STUB_DOMAINS', 'true' do
|
59
|
-
assert_equal fixture_path('domains.txt'), Gman.list_path
|
60
|
-
end
|
61
|
-
end
|
62
56
|
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'helper')
|
2
|
+
|
3
|
+
class TestGmanDomainList < Minitest::Test
|
4
|
+
INIT_TYPES = [:path, :contents, :data].freeze
|
5
|
+
|
6
|
+
def setup
|
7
|
+
@original_domain_list = File.read(stubbed_list_path)
|
8
|
+
end
|
9
|
+
|
10
|
+
def teardown
|
11
|
+
File.write stubbed_list_path, @original_domain_list
|
12
|
+
end
|
13
|
+
|
14
|
+
def domain_list(type)
|
15
|
+
case type
|
16
|
+
when :path
|
17
|
+
Gman::DomainList.new(path: Gman.list_path)
|
18
|
+
when :contents
|
19
|
+
contents = File.read(Gman.list_path)
|
20
|
+
Gman::DomainList.new(contents: contents)
|
21
|
+
when :data
|
22
|
+
data = Gman::DomainList.new(path: Gman.list_path).to_h
|
23
|
+
Gman::DomainList.new(data: data)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
INIT_TYPES.each do |type|
|
28
|
+
context "when initalized with #{type}" do
|
29
|
+
should 'store the init vars' do
|
30
|
+
refute domain_list(type).public_send(type).nil?
|
31
|
+
end
|
32
|
+
|
33
|
+
should 'return the domain data' do
|
34
|
+
list = domain_list(type)
|
35
|
+
assert list.data.key? 'Canada federal'
|
36
|
+
assert list.data.any? { |_key, values| values.include? 'gov' }
|
37
|
+
end
|
38
|
+
|
39
|
+
should 'return the list contents' do
|
40
|
+
list = domain_list(type)
|
41
|
+
assert_match(/^gov$/, list.contents)
|
42
|
+
end
|
43
|
+
|
44
|
+
should 'return the list path' do
|
45
|
+
list = domain_list(type)
|
46
|
+
assert_equal list.path, Gman.list_path
|
47
|
+
end
|
48
|
+
|
49
|
+
should 'return the public suffix parsed list' do
|
50
|
+
list = domain_list(type)
|
51
|
+
assert list.public_suffix_list.class == PublicSuffix::List
|
52
|
+
end
|
53
|
+
|
54
|
+
should 'know if a domain is valid' do
|
55
|
+
list = domain_list(type)
|
56
|
+
assert list.valid? 'whitehouse.gov'
|
57
|
+
end
|
58
|
+
|
59
|
+
should 'know if a domain is invalid' do
|
60
|
+
list = domain_list(type)
|
61
|
+
refute list.valid? 'example.com'
|
62
|
+
end
|
63
|
+
|
64
|
+
should 'return the domain groups' do
|
65
|
+
list = domain_list(type)
|
66
|
+
assert list.groups.include?('Canada federal')
|
67
|
+
end
|
68
|
+
|
69
|
+
should 'return the domains' do
|
70
|
+
list = domain_list(type)
|
71
|
+
assert list.domains.include?('gov')
|
72
|
+
end
|
73
|
+
|
74
|
+
should 'return the domain count' do
|
75
|
+
list = domain_list(type)
|
76
|
+
assert list.count.is_a?(Integer)
|
77
|
+
assert list.count > 100
|
78
|
+
end
|
79
|
+
|
80
|
+
should 'alphabetize the list' do
|
81
|
+
list = domain_list(type)
|
82
|
+
list.data['Canada municipal'].shuffle!
|
83
|
+
assert list.data['Canada municipal'].first != '100milehouse.com'
|
84
|
+
list.alphabetize
|
85
|
+
assert list.data['Canada municipal'].first == '100milehouse.com'
|
86
|
+
end
|
87
|
+
|
88
|
+
should 'write the list' do
|
89
|
+
list = domain_list(type)
|
90
|
+
list.instance_variable_set('@path', stubbed_list_path)
|
91
|
+
list.data = { 'foo' => ['bar.gov', 'baz.net'] }
|
92
|
+
list.write
|
93
|
+
contents = File.read(stubbed_list_path)
|
94
|
+
assert_match %r{^// foo$}, contents
|
95
|
+
expected = "// foo\nbar.gov\nbaz.net"
|
96
|
+
assert contents.include?(expected)
|
97
|
+
end
|
98
|
+
|
99
|
+
should 'output the list in public_suffix format' do
|
100
|
+
list = domain_list(type)
|
101
|
+
string = list.to_s
|
102
|
+
assert_match %r{^// Canada federal$}, string
|
103
|
+
assert string.include? "// Canada federal\ncanada\.ca\n"
|
104
|
+
end
|
105
|
+
|
106
|
+
should "find a domain's parent" do
|
107
|
+
list = domain_list(type)
|
108
|
+
assert_equal 'gov.uk', list.parent_domain('foo.gov.uk')
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
data/test/test_gman_domains.rb
CHANGED
@@ -20,13 +20,12 @@ class TestGmanDomains < Minitest::Test
|
|
20
20
|
end
|
21
21
|
|
22
22
|
invalid = []
|
23
|
-
|
24
|
-
|
23
|
+
options = { skip_dupe: true, skip_resolve: !resolve_domains? }
|
24
|
+
Gman.list.to_h.each do |group, domains|
|
25
25
|
next if WHITELIST.include?(group)
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
}
|
26
|
+
Parallel.each(domains, in_threads: 4) do |domain|
|
27
|
+
invalid.push(domain) unless importer.valid_domain?(domain, options)
|
28
|
+
end
|
30
29
|
end
|
31
30
|
assert_equal [], invalid.flatten.reject(&:empty?)
|
32
31
|
end
|
data/test/test_gman_importer.rb
CHANGED
@@ -6,21 +6,17 @@ class TestGManImporter < Minitest::Test
|
|
6
6
|
@stdout = StringIO.new
|
7
7
|
@importer.instance_variable_set '@logger', Logger.new(@stdout)
|
8
8
|
|
9
|
-
|
10
|
-
@original_domain_list = File.open(Gman.list_path).read
|
11
|
-
end
|
9
|
+
@original_domain_list = File.read(stubbed_list_path)
|
12
10
|
end
|
13
11
|
|
14
12
|
def teardown
|
15
|
-
|
16
|
-
File.write Gman.list_path, @original_domain_list
|
17
|
-
end
|
13
|
+
File.write stubbed_list_path, @original_domain_list
|
18
14
|
end
|
19
15
|
|
20
16
|
should 'init the domain list' do
|
21
|
-
assert_equal Gman::DomainList, @importer.
|
22
|
-
assert_equal 1, @importer.
|
23
|
-
assert_equal 'example.com', @importer.
|
17
|
+
assert_equal Gman::DomainList, @importer.domain_list.class
|
18
|
+
assert_equal 1, @importer.domain_list.count
|
19
|
+
assert_equal 'example.com', @importer.domain_list.domains.first
|
24
20
|
end
|
25
21
|
|
26
22
|
should 'init the logger' do
|
@@ -51,43 +47,41 @@ class TestGManImporter < Minitest::Test
|
|
51
47
|
should 'normalize domains within the domain list' do
|
52
48
|
importer = Gman::Importer.new 'test' => ['www.EXAMPLE.com/']
|
53
49
|
importer.send :normalize_domains!
|
54
|
-
assert_equal 'example.com', importer.
|
50
|
+
assert_equal 'example.com', importer.domain_list.domains.first
|
55
51
|
end
|
56
52
|
|
57
53
|
should 'remove invalid domains from the domain list' do
|
58
54
|
importer = Gman::Importer.new 'test' => ['foo.github.io', 'example.com']
|
59
55
|
importer.instance_variable_set '@logger', Logger.new(@stdout)
|
60
56
|
|
61
|
-
assert_equal 2, importer.
|
57
|
+
assert_equal 2, importer.domain_list.count
|
62
58
|
importer.send :ensure_validity!
|
63
|
-
assert_equal 1, importer.
|
59
|
+
assert_equal 1, importer.domain_list.count
|
64
60
|
end
|
65
61
|
|
66
62
|
context 'writing the domain list' do
|
67
63
|
should 'add domains to the current domain list' do
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
end
|
64
|
+
domains = { 'test' => ['example.com'], 'test2' => ['github.com'] }
|
65
|
+
importer = Gman::Importer.new domains
|
66
|
+
importer.instance_variable_set '@current', stubbed_list
|
67
|
+
importer.send :add_to_current
|
68
|
+
expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
|
69
|
+
assert_equal expected, File.open(stubbed_list_path).read
|
75
70
|
end
|
76
71
|
|
77
72
|
should 'import' do
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
end
|
73
|
+
domains = {
|
74
|
+
'test' => ['www.example.com', 'goo.github.io'],
|
75
|
+
'test2' => ['github.com', 'www.github.com', 'whitehouse.gov']
|
76
|
+
}
|
77
|
+
|
78
|
+
importer = Gman::Importer.new domains
|
79
|
+
importer.instance_variable_set '@current', stubbed_list
|
80
|
+
importer.instance_variable_set '@logger', Logger.new(@stdout)
|
81
|
+
importer.import(skip_resolve: true)
|
82
|
+
|
83
|
+
expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
|
84
|
+
assert_equal expected, File.open(stubbed_list_path).read
|
91
85
|
end
|
92
86
|
end
|
93
87
|
end
|
metadata
CHANGED
@@ -1,71 +1,71 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gman
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 7.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: iso_country_codes
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0.6'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0.6'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: naughty_or_nice
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '0
|
33
|
+
version: '2.0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '0
|
40
|
+
version: '2.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: colored
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '2
|
47
|
+
version: '1.2'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '2
|
54
|
+
version: '1.2'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: swot
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
62
|
-
type: :
|
61
|
+
version: '1.0'
|
62
|
+
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '1.
|
68
|
+
version: '1.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -217,6 +217,7 @@ extensions: []
|
|
217
217
|
extra_rdoc_files: []
|
218
218
|
files:
|
219
219
|
- ".gitignore"
|
220
|
+
- ".rake_tasks"
|
220
221
|
- ".rubocop.yml"
|
221
222
|
- ".ruby-version"
|
222
223
|
- ".travis.yml"
|
@@ -228,6 +229,7 @@ files:
|
|
228
229
|
- bin/gman
|
229
230
|
- bin/gman_filter
|
230
231
|
- config/domains.txt
|
232
|
+
- config/vendor/academic.txt
|
231
233
|
- config/vendor/dotgovs.csv
|
232
234
|
- gman.gemspec
|
233
235
|
- lib/gman.rb
|
@@ -254,6 +256,7 @@ files:
|
|
254
256
|
- script/vendor-nl
|
255
257
|
- script/vendor-public-suffix
|
256
258
|
- script/vendor-se
|
259
|
+
- script/vendor-swot
|
257
260
|
- script/vendor-us
|
258
261
|
- test/fixtures/domains.txt
|
259
262
|
- test/fixtures/obama.txt
|
@@ -261,6 +264,7 @@ files:
|
|
261
264
|
- test/test_gman.rb
|
262
265
|
- test/test_gman_bin.rb
|
263
266
|
- test/test_gman_country_codes.rb
|
267
|
+
- test/test_gman_domain_list.rb
|
264
268
|
- test/test_gman_domains.rb
|
265
269
|
- test/test_gman_filter.rb
|
266
270
|
- test/test_gman_identifier.rb
|
@@ -297,6 +301,7 @@ test_files:
|
|
297
301
|
- test/test_gman.rb
|
298
302
|
- test/test_gman_bin.rb
|
299
303
|
- test/test_gman_country_codes.rb
|
304
|
+
- test/test_gman_domain_list.rb
|
300
305
|
- test/test_gman_domains.rb
|
301
306
|
- test/test_gman_filter.rb
|
302
307
|
- test/test_gman_identifier.rb
|