gman 7.0.0 → 7.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. checksums.yaml +5 -5
  2. data/.github/CODEOWNERS +3 -0
  3. data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
  4. data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
  5. data/.github/config.yml +23 -0
  6. data/.github/funding.yml +1 -0
  7. data/.github/no-response.yml +15 -0
  8. data/.github/release-drafter.yml +4 -0
  9. data/.github/settings.yml +33 -0
  10. data/.github/stale.yml +29 -0
  11. data/.gitignore +1 -0
  12. data/.rspec +2 -0
  13. data/.rubocop.yml +14 -5
  14. data/.rubocop_todo.yml +84 -0
  15. data/.ruby-version +1 -1
  16. data/Gemfile +2 -0
  17. data/bin/gman +6 -4
  18. data/bin/gman_filter +5 -7
  19. data/config/domains.txt +8454 -168
  20. data/config/vendor/academic.txt +6 -7
  21. data/config/vendor/dotgovs.csv +5786 -5560
  22. data/docs/CODE_OF_CONDUCT.md +46 -0
  23. data/docs/CONTRIBUTING.md +92 -0
  24. data/{README.md → docs/README.md} +3 -3
  25. data/docs/SECURITY.md +3 -0
  26. data/docs/_config.yml +2 -0
  27. data/gman.gemspec +18 -17
  28. data/lib/gman.rb +4 -2
  29. data/lib/gman/country_codes.rb +17 -17
  30. data/lib/gman/domain_list.rb +25 -9
  31. data/lib/gman/identifier.rb +57 -19
  32. data/lib/gman/importer.rb +31 -21
  33. data/lib/gman/locality.rb +8 -6
  34. data/lib/gman/version.rb +3 -1
  35. data/script/add +2 -0
  36. data/script/alphabetize +2 -0
  37. data/script/cibuild +1 -1
  38. data/script/dedupe +2 -1
  39. data/script/profile +2 -1
  40. data/script/prune +5 -3
  41. data/script/reconcile-us +6 -3
  42. data/script/vendor-federal-de +2 -1
  43. data/script/vendor-municipal-de +2 -1
  44. data/script/vendor-nl +2 -0
  45. data/script/vendor-public-suffix +6 -4
  46. data/script/vendor-se +2 -1
  47. data/script/vendor-swot +3 -1
  48. data/script/vendor-us +5 -3
  49. data/spec/fixtures/domains.txt +4 -0
  50. data/{test → spec}/fixtures/obama.txt +0 -0
  51. data/spec/gman/bin_spec.rb +101 -0
  52. data/spec/gman/country_code_spec.rb +39 -0
  53. data/spec/gman/domain_list_spec.rb +110 -0
  54. data/spec/gman/domains_spec.rb +25 -0
  55. data/spec/gman/identifier_spec.rb +218 -0
  56. data/spec/gman/importer_spec.rb +236 -0
  57. data/spec/gman/locality_spec.rb +24 -0
  58. data/spec/gman_spec.rb +74 -0
  59. data/spec/spec_helper.rb +31 -0
  60. metadata +89 -81
  61. data/.rake_tasks +0 -0
  62. data/CONTRIBUTING.md +0 -22
  63. data/Rakefile +0 -22
  64. data/test/fixtures/domains.txt +0 -2
  65. data/test/helper.rb +0 -48
  66. data/test/test_gman.rb +0 -56
  67. data/test/test_gman_bin.rb +0 -75
  68. data/test/test_gman_country_codes.rb +0 -18
  69. data/test/test_gman_domain_list.rb +0 -112
  70. data/test/test_gman_domains.rb +0 -32
  71. data/test/test_gman_filter.rb +0 -17
  72. data/test/test_gman_identifier.rb +0 -106
  73. data/test/test_gman_importer.rb +0 -244
  74. data/test/test_gman_locality.rb +0 -10
@@ -1,9 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Utility functions for parsing and manipulating public-suffix domain lists
2
4
  # Only used in development and not loaded by default
3
5
  require 'yaml'
4
6
  require 'open-uri'
5
7
  require 'resolv'
6
8
  require 'logger'
9
+ require 'swot'
7
10
  require_relative '../gman'
8
11
  require_relative './domain_list'
9
12
 
@@ -12,7 +15,7 @@ class Gman
12
15
  attr_accessor :domain_list
13
16
 
14
17
  # Known false positives from vendored lists
15
- BLACKLIST = %w(
18
+ BLACKLIST = %w[
16
19
  business.centurytel.net
17
20
  chesnee.net
18
21
  citlink.net
@@ -38,23 +41,24 @@ class Gman
38
41
  wctc.net
39
42
  webconnections.net
40
43
  webpages.charter.net
41
- ).freeze
44
+ ].freeze
42
45
 
43
46
  REGEX_CHECKS = {
44
- 'home. regex' => /^home\./,
45
- 'user. regex' => /^users?\./,
46
- 'sites. regex' => /^sites?\./,
47
- 'weebly' => /weebly\.com$/,
48
- 'wordpress' => /wordpress\.com$/,
49
- 'govoffice' => /govoffice\d?\.com$/,
50
- 'homestead' => /homestead\.com$/,
51
- 'wix.com' => /wix\.com$/,
52
- 'blogspot.com' => /blogspot\.com$/,
53
- 'tripod.com' => /tripod\.com$/,
47
+ 'home. regex' => /^home\./,
48
+ 'user. regex' => /^users?\./,
49
+ 'sites. regex' => /^sites?\./,
50
+ 'weebly' => /weebly\.com$/,
51
+ 'wordpress' => /wordpress\.com$/,
52
+ 'govoffice' => /govoffice\d?\.com$/,
53
+ 'homestead' => /homestead\.com$/,
54
+ 'wix.com' => /wix\.com$/,
55
+ 'blogspot.com' => /blogspot\.com$/,
56
+ 'tripod.com' => /tripod\.com$/,
54
57
  'squarespace.com' => /squarespace\.com$/,
55
- 'github.io' => /github\.io$/,
56
- 'tumblr' => /tumblr\.com$/,
57
- 'locality' => Gman::Locality::REGEX
58
+ 'github.io' => /github\.io$/,
59
+ 'tumblr' => /tumblr\.com$/,
60
+ 'locality' => Gman::Locality::REGEX,
61
+ 'french edu' => /^ac-.*?\.fr/
58
62
  }.freeze
59
63
 
60
64
  def initialize(domains)
@@ -62,7 +66,7 @@ class Gman
62
66
  end
63
67
 
64
68
  def logger
65
- @logger ||= Logger.new(STDOUT)
69
+ @logger ||= Logger.new($stdout)
66
70
  end
67
71
 
68
72
  def normalize_domain(domain)
@@ -74,6 +78,7 @@ class Gman
74
78
  return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
75
79
  return false unless ensure_valid(domain)
76
80
  return false if !options[:skip_resolve] && !ensure_resolves(domain)
81
+
77
82
  true
78
83
  end
79
84
 
@@ -81,6 +86,7 @@ class Gman
81
86
  # rather than a bool and silence log output
82
87
  def reject(domain, reason)
83
88
  return reason if ENV['RECONCILING']
89
+
84
90
  logger.info "👎 `#{domain}`: #{reason}"
85
91
  false
86
92
  end
@@ -101,13 +107,14 @@ class Gman
101
107
  end
102
108
 
103
109
  def resolver
104
- @resolver ||= Resolv::DNS.new(nameserver: ['8.8.8.8', '8.8.4.4'])
110
+ @resolver ||= Resolv::DNS.new(nameserver: ['1.1.1.1', '8.8.8.8'])
105
111
  end
106
112
 
107
113
  # Verifies that the given domain is an IP address or has an NS or MX record, and thus is valid
108
114
  def domain_resolves?(domain)
109
115
  domain = Addressable::URI.new(host: domain).normalize.host
110
116
  return true if ip?(domain)
117
+
111
118
  returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
112
119
  end
113
120
 
@@ -115,16 +122,17 @@ class Gman
115
122
 
116
123
  def ensure_regex(domain)
117
124
  REGEX_CHECKS.each do |msg, regex|
118
- return reject(domain, msg) if domain =~ regex
125
+ return reject(domain, msg) if domain&.match?(regex)
119
126
  end
120
127
  true
121
128
  end
122
129
 
123
130
  def ensure_valid(domain)
124
131
  return false if domain.empty?
132
+
125
133
  if BLACKLIST.include?(domain)
126
134
  reject(domain, 'blacklist')
127
- elsif !PublicSuffix.valid?(".#{domain}")
135
+ elsif !PublicSuffix.valid?("foo.#{domain}")
128
136
  reject(domain, 'invalid')
129
137
  elsif Swot.is_academic?(domain)
130
138
  reject(domain, 'academic')
@@ -135,11 +143,13 @@ class Gman
135
143
 
136
144
  def ensure_resolves(domain)
137
145
  return reject(domain, 'unresolvable') unless domain_resolves?(domain)
146
+
138
147
  true
139
148
  end
140
149
 
141
150
  def ensure_not_dupe(domain)
142
151
  return true unless dupe?(domain)
152
+
143
153
  if current.domains.include?(domain)
144
154
  reject(domain, 'duplicate')
145
155
  else
@@ -153,14 +163,14 @@ class Gman
153
163
  end
154
164
 
155
165
  def normalize_domains!
156
- domain_list.to_h.each do |_group, domains|
166
+ domain_list.to_h.each_value do |domains|
157
167
  domains.map! { |domain| normalize_domain(domain) }
158
168
  domains.uniq!
159
169
  end
160
170
  end
161
171
 
162
172
  def ensure_validity!(options = {})
163
- domain_list.data.each do |_group, domains|
173
+ domain_list.data.each_value do |domains|
164
174
  domains.select! { |domain| valid_domain?(domain, options) }
165
175
  end
166
176
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Gman
2
4
  # Second level .us domains for states and locality
3
5
  # See http://en.wikipedia.org/wiki/.us
@@ -12,18 +14,18 @@ class Gman
12
14
  # * k12.il.us
13
15
  # * ci.foo.zx.us
14
16
  class Locality
15
- AFFINITY_NAMESPACES = %w(state dst cog).freeze
17
+ AFFINITY_NAMESPACES = %w[state dst cog].freeze
16
18
 
17
- STATES = %w(
19
+ STATES = %w[
18
20
  ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
19
21
  la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
20
22
  ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
21
- ).freeze
23
+ ].freeze
22
24
 
23
- LOCALITY_DOMAINS = %w(
25
+ LOCALITY_DOMAINS = %w[
24
26
  ci co borough boro city county
25
27
  parish town twp vi vil village
26
- ).freeze
28
+ ].freeze
27
29
 
28
30
  REGEX = /
29
31
  (
@@ -31,7 +33,7 @@ class Gman
31
33
  |
32
34
  (#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
33
35
  )\.(#{Regexp.union(STATES)})\.us
34
- /x
36
+ /x.freeze
35
37
 
36
38
  def self.valid?(domain)
37
39
  !domain.to_s.match(Locality::REGEX).nil?
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Gman
2
- VERSION = '7.0.0'.freeze
4
+ VERSION = '7.0.5'
3
5
  end
data/script/add CHANGED
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Add one or more domains to a given group, running the standard import checks
4
6
  #
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Alphabetizes entries in the domains.txt file
4
6
  #
@@ -2,7 +2,7 @@
2
2
 
3
3
  set -ex
4
4
 
5
- bundle exec rake test
5
+ bundle exec rspec
6
6
  bundle exec rubocop -D -S -a
7
7
  bundle exec script/dedupe
8
8
  bundle exec gem build gman.gemspec
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'yaml'
4
5
  require 'open-uri'
@@ -12,7 +13,7 @@ puts "Current list contains #{current.count} domains..."
12
13
 
13
14
  dupe = current.count - current.domains.uniq.count
14
15
  puts "Found #{dupe} duplicate domains"
15
- exit 0 if dupe == 0
16
+ exit 0 if dupe.zero?
16
17
 
17
18
  dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
18
19
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ruby-prof'
4
5
  require './lib/gman'
@@ -17,4 +18,4 @@ end
17
18
 
18
19
  result = RubyProf.stop
19
20
  printer = RubyProf::FlatPrinter.new(result)
20
- printer.print(STDOUT)
21
+ printer.print($stdout)
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Given an array of domains, removes them from the list
3
5
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
6
 
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
12
14
  puts "Starting list: #{Gman::DomainList.current.count} domains"
13
15
 
14
16
  domains.each do |domain|
15
- list.gsub!(/^#{domain}$\n/, '')
17
+ list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
16
18
  end
17
19
 
18
- puts "Ending list: #{Gman::DomainList.current.count} domains"
19
-
20
20
  File.write './config/domains.txt', list
21
+
22
+ puts "Ending list: #{Gman::DomainList.current.count} domains"
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Reconciles the USA.gov-maintained list of US domains with domains.txt
4
6
  # to show domains listed in the USA.gov-maintained list that we reject and why
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
12
14
  blacklist = ['usagovQUASI']
13
15
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
14
16
 
15
- data = open(source).read
17
+ data = URI.open(source).read
16
18
  data = data.split('_' * 74)
17
19
  data = data.last.strip
18
20
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -20,7 +22,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
20
22
  domains = {}
21
23
  group = ''
22
24
  data.each do |row|
23
- if row =~ /^\w/
25
+ if /^\w/.match?(row)
24
26
  group = row
25
27
  domains[group] = []
26
28
  else
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
33
35
 
34
36
  importer.logger.info "Starting with #{importer.domains.count} domains"
35
37
 
36
- importer.domains.list.each do |_group, d|
38
+ importer.domains.list.each_value do |d|
37
39
  d.map! { |domain| Gman.new(domain).to_s }
38
40
  d.map! { |domain| importer.normalize_domain(domain) }
39
41
  end
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
44
46
  missing = {}
45
47
  importer.domains.list.each do |g, usagovdomains|
46
48
  next unless importer.current.list[g]
49
+
47
50
  missing[g] = importer.current.list[g] - usagovdomains
48
51
  end
49
52
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
@@ -6,7 +7,7 @@ require './lib/gman'
6
7
 
7
8
  url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
8
9
 
9
- domains = open(url).read.encode('UTF-8')
10
+ domains = URI.open(url).read.encode('UTF-8')
10
11
  domains = CSV.parse(domains, headers: true)
11
12
  domains = domains.map { |row| row['Domain Name'] }
12
13
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
@@ -6,7 +7,7 @@ require './lib/gman'
6
7
 
7
8
  url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
8
9
 
9
- csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
+ csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
11
 
11
12
  # For some reason, the header row is actually the last row
12
13
  # Pop the last line off the file and prepend it at the beginning
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
3
5
 
4
6
  require 'fileutils'
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Propagates an initial list of best-guess government domains
3
5
 
4
6
  require 'public_suffix'
@@ -6,21 +8,21 @@ require 'yaml'
6
8
  require_relative '../lib/gman'
7
9
 
8
10
  # https://gist.github.com/benbalter/6147066
9
- REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
11
+ REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
10
12
 
11
13
  domains = []
12
14
  PublicSuffix::List.default.each do |rule|
13
15
  domain = nil
14
16
 
15
17
  if rule.parts.length == 1
16
- domain = rule.parts.first if ".#{rule.value}" =~ REGEX
17
- elsif ".#{rule.value}" =~ REGEX
18
+ domain = rule.parts.first if REGEX.match?(".#{rule.value}")
19
+ elsif REGEX.match?(".#{rule.value}")
18
20
  domain = rule.parts.pop(2).join('.')
19
21
  end
20
22
 
21
23
  domains.push domain unless domain.nil? || domains.include?(domain)
22
24
  end
23
25
 
24
- # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
+ # NOTE: We want to skip resolution here, because a domain like `gov.sv` may be
25
27
  # a valid TLD, not have any top-level sites, and we'd still want it listed
26
28
  Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'mechanize'
4
5
  require 'csv'
@@ -14,7 +15,7 @@ response = agent.submit(form, submit_button)
14
15
 
15
16
  rows = CSV.parse(response.content, headers: true, col_sep: "\t")
16
17
  domains = rows.map do |row|
17
- row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
18
+ row['Webbadress'] unless /UNIVERSITET/.match?(row['Namn'])
18
19
  end
19
20
 
20
21
  Gman::Importer.new('Swedish Administrative Authorities' => domains).import
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the Swot-maintained list of academic domains into config/academic.txt
4
6
  # Source: https://github.com/leereilly/swot/
@@ -12,7 +14,7 @@
12
14
  #
13
15
  # Note: We do this, because as a bajillion individual files, Swot takes up 30MB
14
16
 
15
- require './lib/gman'
17
+ require 'gman'
16
18
  require 'swot'
17
19
 
18
20
  # Generate array of all Swot domains
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the USA.gov-maintained list of US domains into domains.txt
4
6
  # Source: https://github.com/GSA-OCSIT/govt-urls
@@ -13,10 +15,10 @@
13
15
  require './lib/gman'
14
16
  require 'open-uri'
15
17
 
16
- blacklist = %w(usagovQUASI usagovFEDgov)
18
+ blacklist = %w[usagovQUASI usagovFEDgov]
17
19
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
18
20
 
19
- data = open(source).read
21
+ data = URI.open(source).read
20
22
  data = data.split('_' * 74)
21
23
  data = data.last.strip
22
24
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -24,7 +26,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
24
26
  domains = {}
25
27
  group = ''
26
28
  data.each do |row|
27
- if row =~ /^\w/
29
+ if /^\w/.match?(row)
28
30
  group = row
29
31
  domains[group] = []
30
32
  else
@@ -0,0 +1,4 @@
1
+ // foo
2
+ bar.gov
3
+ baz.net
4
+ !mail.bar.gov
File without changes