gman 6.0.1 → 7.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,7 +3,6 @@
3
3
  require 'csv'
4
4
  require 'open-uri'
5
5
  require './lib/gman'
6
- require './lib/gman/importer'
7
6
 
8
7
  url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
9
8
 
@@ -20,4 +19,4 @@ csv = lines.join("\n")
20
19
  data = CSV.parse(csv, headers: true, col_sep: ';')
21
20
  domains = data.map { |row| row['Internet'] }
22
21
 
23
- Gman.import('German Municipalities' => domains)
22
+ Gman::Importer.new('German Municipalities' => domains).import
@@ -2,6 +2,7 @@
2
2
  # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
3
3
 
4
4
  require 'fileutils'
5
+ require './lib/gman'
5
6
 
6
7
  FileUtils.rm_rf('almanak.overheid.nl')
7
8
  commands = [
@@ -15,4 +16,4 @@ commands = [
15
16
  ]
16
17
  domains = system commands.join('|')
17
18
 
18
- Gman.import('Netherlands' => domains.split("\n"))
19
+ Gman::Importer.new('Netherlands' => domains.split("\n")).import
@@ -4,7 +4,6 @@
4
4
  require 'public_suffix'
5
5
  require 'yaml'
6
6
  require_relative '../lib/gman'
7
- require_relative '../lib/gman/importer'
8
7
 
9
8
  # https://gist.github.com/benbalter/6147066
10
9
  REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
@@ -24,4 +23,4 @@ end
24
23
 
25
24
  # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
25
  # a valid TLD, not have any top-level sites, and we'd still want it listed
27
- Gman.import({ 'non-us gov' => domains }, skip_resolve: true)
26
+ Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)
@@ -3,7 +3,6 @@
3
3
  require 'mechanize'
4
4
  require 'csv'
5
5
  require './lib/gman'
6
- require './lib/gman/importer'
7
6
 
8
7
  url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
9
8
  agent = Mechanize.new
@@ -18,4 +17,4 @@ domains = rows.map do |row|
18
17
  row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
19
18
  end
20
19
 
21
- Gman.import('Swedish Administrative Authorities' => domains)
20
+ Gman::Importer.new('Swedish Administrative Authorities' => domains).import
@@ -0,0 +1,41 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # Vendors the Swot-maintained list of academic domains into config/academic.txt
4
+ # Source: https://github.com/leereilly/swot/
5
+ #
6
+ # Usage: script/vendor-swot
7
+ #
8
+ # Will automatically fetch latest version of the list and merge
9
+ # You can check for changes and commit via `git status`
10
+ #
11
+ # It's also probably a good idea to run `script/ci-build` for good measure
12
+ #
13
+ # Note: We do this, because as a bajillion individual files, Swot takes up 30MB
14
+
15
+ require './lib/gman'
16
+ require 'swot'
17
+
18
+ # Generate array of all Swot domains
19
+ domains = Swot.all_domains
20
+ domains << Swot::ACADEMIC_TLDS
21
+
22
+ # Init the importer, building a DomainList
23
+ group = "Academic domains vendored from Swot v#{Swot::VERSION}"
24
+ hash = { group => domains }
25
+
26
+ importer = Gman::Importer.new(hash)
27
+ importer.logger.info "Importing from Swot v#{Swot::VERSION}"
28
+ importer.logger.info "Found #{domains.count} academic domains"
29
+
30
+ domain_list = importer.domain_list
31
+ domain_list.path = Gman.academic_list_path
32
+
33
+ # Cleanup and write
34
+ # Note: we're not using the import method, as that assumes we're writing the
35
+ # government domain list and would use Swot to ensure domains aren't academic
36
+ importer.send :normalize_domains!
37
+ domain_list.data[group] << Swot::BLACKLIST.map { |domain| "!#{domain}" }
38
+ domain_list.data[group] = domain_list.data[group].flatten
39
+ domain_list.write
40
+
41
+ importer.logger.info "Vendored #{importer.domain_list.count} academic domains."
@@ -10,7 +10,8 @@
10
10
  #
11
11
  # It's also probably a good idea to run `script/ci-build` for good measure
12
12
 
13
- require './lib/gman/importer'
13
+ require './lib/gman'
14
+ require 'open-uri'
14
15
 
15
16
  blacklist = %w(usagovQUASI usagovFEDgov)
16
17
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
@@ -32,4 +33,4 @@ data.each do |row|
32
33
  end
33
34
 
34
35
  domains.reject! { |g, _| blacklist.include?(g) }
35
- Gman.import(domains)
36
+ Gman::Importer.new(domains).import
@@ -38,3 +38,11 @@ def with_env(key, value)
38
38
  yield
39
39
  ENV[key] = old_env
40
40
  end
41
+
42
+ def stubbed_list_path
43
+ File.expand_path './fixtures/domains.txt', File.dirname(__FILE__)
44
+ end
45
+
46
+ def stubbed_list
47
+ Gman::DomainList.new(path: stubbed_list_path)
48
+ end
@@ -53,10 +53,4 @@ class TestGman < Minitest::Test
53
53
  should 'returns the path to domains.txt' do
54
54
  assert_equal true, File.exist?(Gman.list_path)
55
55
  end
56
-
57
- should 'stub domains when asked' do
58
- with_env 'GMAN_STUB_DOMAINS', 'true' do
59
- assert_equal fixture_path('domains.txt'), Gman.list_path
60
- end
61
- end
62
56
  end
@@ -0,0 +1,112 @@
1
+ require File.join(File.dirname(__FILE__), 'helper')
2
+
3
+ class TestGmanDomainList < Minitest::Test
4
+ INIT_TYPES = [:path, :contents, :data].freeze
5
+
6
+ def setup
7
+ @original_domain_list = File.read(stubbed_list_path)
8
+ end
9
+
10
+ def teardown
11
+ File.write stubbed_list_path, @original_domain_list
12
+ end
13
+
14
+ def domain_list(type)
15
+ case type
16
+ when :path
17
+ Gman::DomainList.new(path: Gman.list_path)
18
+ when :contents
19
+ contents = File.read(Gman.list_path)
20
+ Gman::DomainList.new(contents: contents)
21
+ when :data
22
+ data = Gman::DomainList.new(path: Gman.list_path).to_h
23
+ Gman::DomainList.new(data: data)
24
+ end
25
+ end
26
+
27
+ INIT_TYPES.each do |type|
28
+ context "when initalized with #{type}" do
29
+ should 'store the init vars' do
30
+ refute domain_list(type).public_send(type).nil?
31
+ end
32
+
33
+ should 'return the domain data' do
34
+ list = domain_list(type)
35
+ assert list.data.key? 'Canada federal'
36
+ assert list.data.any? { |_key, values| values.include? 'gov' }
37
+ end
38
+
39
+ should 'return the list contents' do
40
+ list = domain_list(type)
41
+ assert_match(/^gov$/, list.contents)
42
+ end
43
+
44
+ should 'return the list path' do
45
+ list = domain_list(type)
46
+ assert_equal list.path, Gman.list_path
47
+ end
48
+
49
+ should 'return the public suffix parsed list' do
50
+ list = domain_list(type)
51
+ assert list.public_suffix_list.class == PublicSuffix::List
52
+ end
53
+
54
+ should 'know if a domain is valid' do
55
+ list = domain_list(type)
56
+ assert list.valid? 'whitehouse.gov'
57
+ end
58
+
59
+ should 'know if a domain is invalid' do
60
+ list = domain_list(type)
61
+ refute list.valid? 'example.com'
62
+ end
63
+
64
+ should 'return the domain groups' do
65
+ list = domain_list(type)
66
+ assert list.groups.include?('Canada federal')
67
+ end
68
+
69
+ should 'return the domains' do
70
+ list = domain_list(type)
71
+ assert list.domains.include?('gov')
72
+ end
73
+
74
+ should 'return the domain count' do
75
+ list = domain_list(type)
76
+ assert list.count.is_a?(Integer)
77
+ assert list.count > 100
78
+ end
79
+
80
+ should 'alphabetize the list' do
81
+ list = domain_list(type)
82
+ list.data['Canada municipal'].shuffle!
83
+ assert list.data['Canada municipal'].first != '100milehouse.com'
84
+ list.alphabetize
85
+ assert list.data['Canada municipal'].first == '100milehouse.com'
86
+ end
87
+
88
+ should 'write the list' do
89
+ list = domain_list(type)
90
+ list.instance_variable_set('@path', stubbed_list_path)
91
+ list.data = { 'foo' => ['bar.gov', 'baz.net'] }
92
+ list.write
93
+ contents = File.read(stubbed_list_path)
94
+ assert_match %r{^// foo$}, contents
95
+ expected = "// foo\nbar.gov\nbaz.net"
96
+ assert contents.include?(expected)
97
+ end
98
+
99
+ should 'output the list in public_suffix format' do
100
+ list = domain_list(type)
101
+ string = list.to_s
102
+ assert_match %r{^// Canada federal$}, string
103
+ assert string.include? "// Canada federal\ncanada\.ca\n"
104
+ end
105
+
106
+ should "find a domain's parent" do
107
+ list = domain_list(type)
108
+ assert_equal 'gov.uk', list.parent_domain('foo.gov.uk')
109
+ end
110
+ end
111
+ end
112
+ end
@@ -20,13 +20,12 @@ class TestGmanDomains < Minitest::Test
20
20
  end
21
21
 
22
22
  invalid = []
23
- list = Gman::DomainList.current.list
24
- Parallel.each(list, in_threads: 2) do |group, domains|
23
+ options = { skip_dupe: true, skip_resolve: !resolve_domains? }
24
+ Gman.list.to_h.each do |group, domains|
25
25
  next if WHITELIST.include?(group)
26
- invalid.push domains.reject { |domain|
27
- options = { skip_dupe: true, skip_resolve: !resolve_domains? }
28
- importer.valid_domain?(domain, options)
29
- }
26
+ Parallel.each(domains, in_threads: 4) do |domain|
27
+ invalid.push(domain) unless importer.valid_domain?(domain, options)
28
+ end
30
29
  end
31
30
  assert_equal [], invalid.flatten.reject(&:empty?)
32
31
  end
@@ -6,21 +6,17 @@ class TestGManImporter < Minitest::Test
6
6
  @stdout = StringIO.new
7
7
  @importer.instance_variable_set '@logger', Logger.new(@stdout)
8
8
 
9
- with_env 'GMAN_STUB_DOMAINS', 'true' do
10
- @original_domain_list = File.open(Gman.list_path).read
11
- end
9
+ @original_domain_list = File.read(stubbed_list_path)
12
10
  end
13
11
 
14
12
  def teardown
15
- with_env 'GMAN_STUB_DOMAINS', 'true' do
16
- File.write Gman.list_path, @original_domain_list
17
- end
13
+ File.write stubbed_list_path, @original_domain_list
18
14
  end
19
15
 
20
16
  should 'init the domain list' do
21
- assert_equal Gman::DomainList, @importer.domains.class
22
- assert_equal 1, @importer.domains.domains.count
23
- assert_equal 'example.com', @importer.domains.domains.first
17
+ assert_equal Gman::DomainList, @importer.domain_list.class
18
+ assert_equal 1, @importer.domain_list.count
19
+ assert_equal 'example.com', @importer.domain_list.domains.first
24
20
  end
25
21
 
26
22
  should 'init the logger' do
@@ -51,43 +47,41 @@ class TestGManImporter < Minitest::Test
51
47
  should 'normalize domains within the domain list' do
52
48
  importer = Gman::Importer.new 'test' => ['www.EXAMPLE.com/']
53
49
  importer.send :normalize_domains!
54
- assert_equal 'example.com', importer.domains.domains.first
50
+ assert_equal 'example.com', importer.domain_list.domains.first
55
51
  end
56
52
 
57
53
  should 'remove invalid domains from the domain list' do
58
54
  importer = Gman::Importer.new 'test' => ['foo.github.io', 'example.com']
59
55
  importer.instance_variable_set '@logger', Logger.new(@stdout)
60
56
 
61
- assert_equal 2, importer.domains.domains.count
57
+ assert_equal 2, importer.domain_list.count
62
58
  importer.send :ensure_validity!
63
- assert_equal 1, importer.domains.domains.count
59
+ assert_equal 1, importer.domain_list.count
64
60
  end
65
61
 
66
62
  context 'writing the domain list' do
67
63
  should 'add domains to the current domain list' do
68
- with_env 'GMAN_STUB_DOMAINS', 'true' do
69
- domains = { 'test' => ['example.com'], 'test2' => ['github.com'] }
70
- importer = Gman::Importer.new domains
71
- importer.send :add_to_current
72
- expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
73
- assert_equal expected, File.open(Gman.list_path).read
74
- end
64
+ domains = { 'test' => ['example.com'], 'test2' => ['github.com'] }
65
+ importer = Gman::Importer.new domains
66
+ importer.instance_variable_set '@current', stubbed_list
67
+ importer.send :add_to_current
68
+ expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
69
+ assert_equal expected, File.open(stubbed_list_path).read
75
70
  end
76
71
 
77
72
  should 'import' do
78
- with_env 'GMAN_STUB_DOMAINS', 'true' do
79
- domains = {
80
- 'test' => ['www.example.com', 'goo.github.io'],
81
- 'test2' => ['github.com', 'www.github.com', 'whitehouse.gov']
82
- }
83
-
84
- importer = Gman::Importer.new domains
85
- importer.instance_variable_set '@logger', Logger.new(@stdout)
86
- importer.import(skip_resolve: true)
87
-
88
- expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
89
- assert_equal expected, File.open(Gman.list_path).read
90
- end
73
+ domains = {
74
+ 'test' => ['www.example.com', 'goo.github.io'],
75
+ 'test2' => ['github.com', 'www.github.com', 'whitehouse.gov']
76
+ }
77
+
78
+ importer = Gman::Importer.new domains
79
+ importer.instance_variable_set '@current', stubbed_list
80
+ importer.instance_variable_set '@logger', Logger.new(@stdout)
81
+ importer.import(skip_resolve: true)
82
+
83
+ expected = "// test\nexample.com\ngov\n\n// test2\ngithub.com"
84
+ assert_equal expected, File.open(stubbed_list_path).read
91
85
  end
92
86
  end
93
87
  end
metadata CHANGED
@@ -1,71 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gman
3
3
  version: !ruby/object:Gem::Version
4
- version: 6.0.1
4
+ version: 7.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-22 00:00:00.000000000 Z
11
+ date: 2016-03-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: swot
14
+ name: iso_country_codes
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
19
+ version: '0.6'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.0'
26
+ version: '0.6'
27
27
  - !ruby/object:Gem::Dependency
28
- name: iso_country_codes
28
+ name: naughty_or_nice
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0.6'
33
+ version: '2.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0.6'
40
+ version: '2.0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: naughty_or_nice
42
+ name: colored
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '2.0'
47
+ version: '1.2'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '2.0'
54
+ version: '1.2'
55
55
  - !ruby/object:Gem::Dependency
56
- name: colored
56
+ name: swot
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '1.2'
62
- type: :runtime
61
+ version: '1.0'
62
+ type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '1.2'
68
+ version: '1.0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rake
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -217,6 +217,7 @@ extensions: []
217
217
  extra_rdoc_files: []
218
218
  files:
219
219
  - ".gitignore"
220
+ - ".rake_tasks"
220
221
  - ".rubocop.yml"
221
222
  - ".ruby-version"
222
223
  - ".travis.yml"
@@ -228,6 +229,7 @@ files:
228
229
  - bin/gman
229
230
  - bin/gman_filter
230
231
  - config/domains.txt
232
+ - config/vendor/academic.txt
231
233
  - config/vendor/dotgovs.csv
232
234
  - gman.gemspec
233
235
  - lib/gman.rb
@@ -254,6 +256,7 @@ files:
254
256
  - script/vendor-nl
255
257
  - script/vendor-public-suffix
256
258
  - script/vendor-se
259
+ - script/vendor-swot
257
260
  - script/vendor-us
258
261
  - test/fixtures/domains.txt
259
262
  - test/fixtures/obama.txt
@@ -261,6 +264,7 @@ files:
261
264
  - test/test_gman.rb
262
265
  - test/test_gman_bin.rb
263
266
  - test/test_gman_country_codes.rb
267
+ - test/test_gman_domain_list.rb
264
268
  - test/test_gman_domains.rb
265
269
  - test/test_gman_filter.rb
266
270
  - test/test_gman_identifier.rb
@@ -297,6 +301,7 @@ test_files:
297
301
  - test/test_gman.rb
298
302
  - test/test_gman_bin.rb
299
303
  - test/test_gman_country_codes.rb
304
+ - test/test_gman_domain_list.rb
300
305
  - test/test_gman_domains.rb
301
306
  - test/test_gman_filter.rb
302
307
  - test/test_gman_identifier.rb