gman 6.0.1 → 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,6 @@
3
3
  require 'csv'
4
4
  require 'open-uri'
5
5
  require './lib/gman'
6
- require './lib/gman/importer'
7
6
 
8
7
  url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
9
8
 
@@ -20,4 +19,4 @@ csv = lines.join("\n")
20
19
  data = CSV.parse(csv, headers: true, col_sep: ';')
21
20
  domains = data.map { |row| row['Internet'] }
22
21
 
23
- Gman.import('German Municipalities' => domains)
22
+ Gman::Importer.new('German Municipalities' => domains).import
@@ -2,6 +2,7 @@
2
2
  # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
3
3
 
4
4
  require 'fileutils'
5
+ require './lib/gman'
5
6
 
6
7
  FileUtils.rm_rf('almanak.overheid.nl')
7
8
  commands = [
@@ -15,4 +16,4 @@ commands = [
15
16
  ]
16
17
  domains = system commands.join('|')
17
18
 
18
- Gman.import('Netherlands' => domains.split("\n"))
19
+ Gman::Importer.new('Netherlands' => domains.split("\n")).import
@@ -4,7 +4,6 @@
4
4
  require 'public_suffix'
5
5
  require 'yaml'
6
6
  require_relative '../lib/gman'
7
- require_relative '../lib/gman/importer'
8
7
 
9
8
  # https://gist.github.com/benbalter/6147066
10
9
  REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
@@ -24,4 +23,4 @@ end
24
23
 
25
24
  # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
25
  # a valid TLD, not have any top-level sites, and we'd still want it listed
27
- Gman.import({ 'non-us gov' => domains }, skip_resolve: true)
26
+ Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)
@@ -3,7 +3,6 @@
3
3
  require 'mechanize'
4
4
  require 'csv'
5
5
  require './lib/gman'
6
- require './lib/gman/importer'
7
6
 
8
7
  url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
9
8
  agent = Mechanize.new
@@ -18,4 +17,4 @@ domains = rows.map do |row|
18
17
  row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
19
18
  end
20
19
 
21
- Gman.import('Swedish Administrative Authorities' => domains)
20
+ Gman::Importer.new('Swedish Administrative Authorities' => domains).import
@@ -0,0 +1,41 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # Vendors the Swot-maintained list of academic domains into config/academic.txt
4
+ # Source: https://github.com/leereilly/swot/
5
+ #
6
+ # Usage: script/vendor-swot
7
+ #
8
+ # Will automatically fetch latest version of the list and merge
9
+ # You can check for changes and commit via `git status`
10
+ #
11
+ # It's also probably a good idea to run `script/ci-build` for good measure
12
+ #
13
+ # Note: We do this, because as a bajillion individual files, Swot takes up 30MB
14
+
15
+ require './lib/gman'
16
+ require 'swot'
17
+
18
+ # Generate array of all Swot domains
19
+ domains = Swot.all_domains
20
+ domains << Swot::ACADEMIC_TLDS
21
+
22
+ # Init the importer, building a DomainList
23
+ group = "Academic domains vendored from Swot v#{Swot::VERSION}"
24
+ hash = { group => domains }
25
+
26
+ importer = Gman::Importer.new(hash)
27
+ importer.logger.info "Importing from Swot v#{Swot::VERSION}"
28
+ importer.logger.info "Found #{domains.count} academic domains"
29
+
30
+ domain_list = importer.domain_list
31
+ domain_list.path = Gman.academic_list_path
32
+
33
+ # Cleanup and write
34
+ # Note: we're not using the import method, as that assumes we're writing the
35
+ # government domain list and would use Swot to ensure domains aren't academic
36
+ importer.send :normalize_domains!
37
+ domain_list.data[group] << Swot::BLACKLIST.map { |domain| "!#{domain}" }
38
+ domain_list.data[group] = domain_list.data[group].flatten
39
+ domain_list.write
40
+
41
+ importer.logger.info "Vendored #{importer.domain_list.count} academic domains."
@@ -10,7 +10,8 @@
10
10
  #
11
11
  # It's also probably a good idea to run `script/ci-build` for good measure
12
12
 
13
- require './lib/gman/importer'
13
+ require './lib/gman'
14
+ require 'open-uri'
14
15
 
15
16
  blacklist = %w(usagovQUASI usagovFEDgov)
16
17
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
@@ -32,4 +33,4 @@ data.each do |row|
32
33
  end
33
34
 
34
35
  domains.reject! { |g, _| blacklist.include?(g) }
35
- Gman.import(domains)
36
+ Gman::Importer.new(domains).import
@@ -38,3 +38,11 @@ def with_env(key, value)
38
38
  yield
39
39
  ENV[key] = old_env
40
40
  end
41
+
42
+ def stubbed_list_path
43
+ File.expand_path './fixtures/domains.txt', File.dirname(__FILE__)
44
+ end
45
+
46
+ def stubbed_list
47
+ Gman::DomainList.new(path: stubbed_list_path)
48
+ end
@@ -53,10 +53,4 @@ class TestGman < Minitest::Test
53
53
  should 'returns the path to domains.txt' do
54
54
  assert_equal true, File.exist?(Gman.list_path)
55
55
  end
56
-
57
- should 'stub domains when asked' do
58
- with_env 'GMAN_STUB_DOMAINS', 'true' do
59
- assert_equal fixture_path('domains.txt'), Gman.list_path
60
- end
61
- end
62
56
  end
@@ -0,0 +1,112 @@
1
+ require File.join(File.dirname(__FILE__), 'helper')
2
+
3
+ class TestGmanDomainList < Minitest::Test
4
+ INIT_TYPES = [:path, :contents, :data].freeze
5
+
6
+ def setup
7
+ @original_domain_list = File.read(stubbed_list_path)
8
+ end
9
+
10
+ def teardown
11
+ File.write stubbed_list_path, @original_domain_list
12
+ end
13
+
14
+ def domain_list(type)
15
+ case type
16
+ when :path
17
+ Gman::DomainList.new(path: Gman.list_path)
18
+ when :contents
19
+ contents = File.read(Gman.list_path)
20
+ Gman::DomainList.new(contents: contents)
21
+ when :data
22
+ data = Gman::DomainList.new(path: Gman.list_path).to_h
23
+ Gman::DomainList.new(data: data)
24
+ end
25
+ end
26
+
27
+ INIT_TYPES.each do |type|
28
+ context "when initalized with #{type}" do
29
+ should 'store the init vars' do
30
+ refute domain_list(type).public_send(type).nil?
31
+ end
32
+
33
+ should 'return the domain data' do
34
+ list = domain_list(type)
35
+ assert list.data.key? 'Canada federal'
36
+ assert list.data.any? { |_key, values| values.include? 'gov' }
37
+ end
38
+
39
+ should 'return the list contents' do
40
+ list = domain_list(type)
41
+ assert_match(/^gov$/, list.contents)
42
+ end
43
+
44
+ should 'return the list path' do
45
+ list = domain_list(type)
46
+ assert_equal list.path, Gman.list_path
47
+ end
48
+
49
+ should 'return the public suffix parsed list' do
50
+ list = domain_list(type)
51
+ assert list.public_suffix_list.class == PublicSuffix::List
52
+ end
53
+
54
+ should 'know if a domain is valid' do
55
+ list = domain_list(type)
56
+ assert list.valid? 'whitehouse.gov'
57
+ end
58
+
59
+ should 'know if a domain is invalid' do
60
+ list = domain_list(type)
61
+ refute list.valid? 'example.com'
62
+ end
63
+
64
+ should 'return the domain groups' do
65
+ list = domain_list(type)
66
+ assert list.groups.include?('Canada federal')
67
+ end
68
+
69
+ should 'return the domains' do
70
+ list = domain_list(type)
71
+ assert list.domains.include?('gov')
72
+ end
73
+
74
+ should 'return the domain count' do
75
+ list = domain_list(type)
76
+ assert list.count.is_a?(Integer)
77
+ assert list.count > 100
78
+ end
79
+
80
+ should 'alphabetize the list' do
81
+ list = domain_list(type)
82
+ list.data['Canada municipal'].shuffle!
83
+ assert list.data['Canada municipal'].first != '100milehouse.com'
84
+ list.alphabetize
85
+ assert list.data['Canada municipal'].first == '100milehouse.com'
86
+ end
87
+
88
+ should 'write the list' do
89
+ list = domain_list(type)
90
+ list.instance_variable_set('@path', stubbed_list_path)
91
+ list.data = { 'foo' => ['bar.gov', 'baz.net'] }
92
+ list.write
93
+ contents = File.read(stubbed_list_path)
94
+ assert_match %r{^// foo$}, contents
95
+ expected = "// foo\nbar.gov\nbaz.net"
96
+ assert contents.include?(expected)
97
+ end
98
+
99
+ should 'output the list in public_suffix format' do
100
+ list = domain_list(type)
101
+ string = list.to_s
102
+ assert_match %r{^// Canada federal$}, string
103
+ assert string.include? "// Canada federal\ncanada\.ca\n"
104
+ end
105
+
106
+ should "find a domain's parent" do
107
+ list = domain_list(type)
108
+ assert_equal 'gov.uk', list.parent_domain('foo.gov.uk')
109
+ end
110
+ end
111
+ end
112
+ end
@@ -20,13 +20,12 @@ class TestGmanDomains < Minitest::Test
20
20
  end
21
21
 
22
22
  invalid = []
23
- list = Gman::DomainList.current.list
24
- Parallel.each(list, in_threads: 2) do |group, domains|
23
+ options = { skip_dupe: true, skip_resolve: !resolve_domains? }
24
+ Gman.list.to_h.each do |group, domains|
25
25
  next if WHITELIST.include?(group)
26
- invalid.push domains.reject { |domain|
27
- options = { skip_dupe: true, skip_resolve: !resolve_domains? }
28
- importer.valid_domain?(domain, options)
29
- }
26
+ Parallel.each(domains, in_threads: 4) do |domain|
27
+ invalid.push(domain) unless importer.valid_domain?(domain, options)
28
+ end
30
29
  end
31
30
  assert_equal [], invalid.flatten.reject(&:empty?)
32
31
  end
@@ -6,21 +6,17 @@ class TestGManImporter < Minitest::Test
6
6
  @stdout = StringIO.new
7
7
  @importer.instance_variable_set '@logger', Logger.new(@stdout)
8
8
 
9
- with_env 'GMAN_STUB_DOMAINS', 'true' do
10
- @original_domain_list = File.open(Gman.list_path).read
11
- end
9
+ @original_domain_list = File.read(stubbed_list_path)
12
10
  end
13
11
 
14
12
  def teardown
15
- with_env 'GMAN_STUB_DOMAINS', 'true' do
16
- File.write Gman.list_path, @original_domain_list
17
- end
13
+ File.write stubbed_list_path, @original_domain_list
18
14
  end
19
15
 
20
16
  should 'init the domain list' do
21
- assert_equal Gman::DomainList, @importer.domains.class
22
- assert_equal 1, @importer.domains.domains.count
23
- assert_equal 'example.com', @importer.domains.domains.first
17
+ assert_equal Gman::DomainList, @importer.domain_list.class
18
+ assert_equal 1, @importer.domain_list.count
19
+ assert_equal 'example.com', @importer.domain_list.domains.first
24
20
  end
25
21
 
26
22
  should 'init the logger' do
@@ -51,43 +47,41 @@ class TestGManImporter < Minitest::Test
51
47
  should 'normalize domains within the domain list' do
52
48
  importer = Gman::Importer.new 'test' => ['www.EXAMPLE.com/']
53
49
  importer.send :normalize_domains!
54
- assert_equal 'example.com', importer.domains.domains.first
50
+ assert_equal 'example.com', importer.domain_list.domains.first
55
51
  end
56
52
 
57
53
  should 'remove invalid domains from the domain list' do
58
54
  importer = Gman::Importer.new 'test' => ['foo.github.io', 'example.com']
59
55
  importer.instance_variable_set '@logger', Logger.new(@stdout)
60
56
 
61
- assert_equal 2, importer.domains.domains.count
57
+ assert_equal 2, importer.domain_list.count
62
58
  importer.send :ensure_validity!
63
- assert_equal 1, importer.domains.domains.count
59
+ assert_equal 1, importer.domain_list.count
64
60
  end
65
61
 
66
62
  context 'writing the domain list' do
67
63
  should 'add domains to the current domain list' do
68
- with_env 'GMAN_STUB_DOMAINS', 'true' do
69
- domains = { 'test' => ['example.com'], 'test2' => ['github.com'] }
70
- importer = Gman::Importer.new domains
71
- importer.send :add_to_current
72
- expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
73
- assert_equal expected, File.open(Gman.list_path).read
74
- end
64
+ domains = { 'test' => ['example.com'], 'test2' => ['github.com'] }
65
+ importer = Gman::Importer.new domains
66
+ importer.instance_variable_set '@current', stubbed_list
67
+ importer.send :add_to_current
68
+ expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
69
+ assert_equal expected, File.open(stubbed_list_path).read
75
70
  end
76
71
 
77
72
  should 'import' do
78
- with_env 'GMAN_STUB_DOMAINS', 'true' do
79
- domains = {
80
- 'test' => ['www.example.com', 'goo.github.io'],
81
- 'test2' => ['github.com', 'www.github.com', 'whitehouse.gov']
82
- }
83
-
84
- importer = Gman::Importer.new domains
85
- importer.instance_variable_set '@logger', Logger.new(@stdout)
86
- importer.import(skip_resolve: true)
87
-
88
- expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
89
- assert_equal expected, File.open(Gman.list_path).read
90
- end
73
+ domains = {
74
+ 'test' => ['www.example.com', 'goo.github.io'],
75
+ 'test2' => ['github.com', 'www.github.com', 'whitehouse.gov']
76
+ }
77
+
78
+ importer = Gman::Importer.new domains
79
+ importer.instance_variable_set '@current', stubbed_list
80
+ importer.instance_variable_set '@logger', Logger.new(@stdout)
81
+ importer.import(skip_resolve: true)
82
+
83
+ expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
84
+ assert_equal expected, File.open(stubbed_list_path).read
91
85
  end
92
86
  end
93
87
  end
metadata CHANGED
@@ -1,71 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gman
3
3
  version: !ruby/object:Gem::Version
4
- version: 6.0.1
4
+ version: 7.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-22 00:00:00.000000000 Z
11
+ date: 2016-03-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: swot
14
+ name: iso_country_codes
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
19
+ version: '0.6'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.0'
26
+ version: '0.6'
27
27
  - !ruby/object:Gem::Dependency
28
- name: iso_country_codes
28
+ name: naughty_or_nice
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0.6'
33
+ version: '2.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0.6'
40
+ version: '2.0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: naughty_or_nice
42
+ name: colored
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '2.0'
47
+ version: '1.2'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '2.0'
54
+ version: '1.2'
55
55
  - !ruby/object:Gem::Dependency
56
- name: colored
56
+ name: swot
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '1.2'
62
- type: :runtime
61
+ version: '1.0'
62
+ type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '1.2'
68
+ version: '1.0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rake
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -217,6 +217,7 @@ extensions: []
217
217
  extra_rdoc_files: []
218
218
  files:
219
219
  - ".gitignore"
220
+ - ".rake_tasks"
220
221
  - ".rubocop.yml"
221
222
  - ".ruby-version"
222
223
  - ".travis.yml"
@@ -228,6 +229,7 @@ files:
228
229
  - bin/gman
229
230
  - bin/gman_filter
230
231
  - config/domains.txt
232
+ - config/vendor/academic.txt
231
233
  - config/vendor/dotgovs.csv
232
234
  - gman.gemspec
233
235
  - lib/gman.rb
@@ -254,6 +256,7 @@ files:
254
256
  - script/vendor-nl
255
257
  - script/vendor-public-suffix
256
258
  - script/vendor-se
259
+ - script/vendor-swot
257
260
  - script/vendor-us
258
261
  - test/fixtures/domains.txt
259
262
  - test/fixtures/obama.txt
@@ -261,6 +264,7 @@ files:
261
264
  - test/test_gman.rb
262
265
  - test/test_gman_bin.rb
263
266
  - test/test_gman_country_codes.rb
267
+ - test/test_gman_domain_list.rb
264
268
  - test/test_gman_domains.rb
265
269
  - test/test_gman_filter.rb
266
270
  - test/test_gman_identifier.rb
@@ -297,6 +301,7 @@ test_files:
297
301
  - test/test_gman.rb
298
302
  - test/test_gman_bin.rb
299
303
  - test/test_gman_country_codes.rb
304
+ - test/test_gman_domain_list.rb
300
305
  - test/test_gman_domains.rb
301
306
  - test/test_gman_filter.rb
302
307
  - test/test_gman_identifier.rb