gman 7.0.0 → 7.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +5 -5
  2. data/.github/CODEOWNERS +3 -0
  3. data/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
  4. data/.github/ISSUE_TEMPLATE/feature_request.md +21 -0
  5. data/.github/config.yml +23 -0
  6. data/.github/funding.yml +1 -0
  7. data/.github/no-response.yml +15 -0
  8. data/.github/release-drafter.yml +4 -0
  9. data/.github/settings.yml +33 -0
  10. data/.github/stale.yml +29 -0
  11. data/.gitignore +1 -0
  12. data/.rspec +2 -0
  13. data/.rubocop.yml +14 -5
  14. data/.rubocop_todo.yml +84 -0
  15. data/.ruby-version +1 -1
  16. data/Gemfile +2 -0
  17. data/bin/gman +6 -4
  18. data/bin/gman_filter +5 -7
  19. data/config/domains.txt +8454 -168
  20. data/config/vendor/academic.txt +6 -7
  21. data/config/vendor/dotgovs.csv +5786 -5560
  22. data/docs/CODE_OF_CONDUCT.md +46 -0
  23. data/docs/CONTRIBUTING.md +92 -0
  24. data/{README.md → docs/README.md} +3 -3
  25. data/docs/SECURITY.md +3 -0
  26. data/docs/_config.yml +2 -0
  27. data/gman.gemspec +18 -17
  28. data/lib/gman.rb +4 -2
  29. data/lib/gman/country_codes.rb +17 -17
  30. data/lib/gman/domain_list.rb +25 -9
  31. data/lib/gman/identifier.rb +57 -19
  32. data/lib/gman/importer.rb +31 -21
  33. data/lib/gman/locality.rb +8 -6
  34. data/lib/gman/version.rb +3 -1
  35. data/script/add +2 -0
  36. data/script/alphabetize +2 -0
  37. data/script/cibuild +1 -1
  38. data/script/dedupe +2 -1
  39. data/script/profile +2 -1
  40. data/script/prune +5 -3
  41. data/script/reconcile-us +6 -3
  42. data/script/vendor-federal-de +2 -1
  43. data/script/vendor-municipal-de +2 -1
  44. data/script/vendor-nl +2 -0
  45. data/script/vendor-public-suffix +6 -4
  46. data/script/vendor-se +2 -1
  47. data/script/vendor-swot +3 -1
  48. data/script/vendor-us +5 -3
  49. data/spec/fixtures/domains.txt +4 -0
  50. data/{test → spec}/fixtures/obama.txt +0 -0
  51. data/spec/gman/bin_spec.rb +101 -0
  52. data/spec/gman/country_code_spec.rb +39 -0
  53. data/spec/gman/domain_list_spec.rb +110 -0
  54. data/spec/gman/domains_spec.rb +25 -0
  55. data/spec/gman/identifier_spec.rb +218 -0
  56. data/spec/gman/importer_spec.rb +236 -0
  57. data/spec/gman/locality_spec.rb +24 -0
  58. data/spec/gman_spec.rb +74 -0
  59. data/spec/spec_helper.rb +31 -0
  60. metadata +89 -81
  61. data/.rake_tasks +0 -0
  62. data/CONTRIBUTING.md +0 -22
  63. data/Rakefile +0 -22
  64. data/test/fixtures/domains.txt +0 -2
  65. data/test/helper.rb +0 -48
  66. data/test/test_gman.rb +0 -56
  67. data/test/test_gman_bin.rb +0 -75
  68. data/test/test_gman_country_codes.rb +0 -18
  69. data/test/test_gman_domain_list.rb +0 -112
  70. data/test/test_gman_domains.rb +0 -32
  71. data/test/test_gman_filter.rb +0 -17
  72. data/test/test_gman_identifier.rb +0 -106
  73. data/test/test_gman_importer.rb +0 -244
  74. data/test/test_gman_locality.rb +0 -10
@@ -1,9 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Utility functions for parsing and manipulating public-suffix domain lists
2
4
  # Only used in development and not loaded by default
3
5
  require 'yaml'
4
6
  require 'open-uri'
5
7
  require 'resolv'
6
8
  require 'logger'
9
+ require 'swot'
7
10
  require_relative '../gman'
8
11
  require_relative './domain_list'
9
12
 
@@ -12,7 +15,7 @@ class Gman
12
15
  attr_accessor :domain_list
13
16
 
14
17
  # Known false positives from vendored lists
15
- BLACKLIST = %w(
18
+ BLACKLIST = %w[
16
19
  business.centurytel.net
17
20
  chesnee.net
18
21
  citlink.net
@@ -38,23 +41,24 @@ class Gman
38
41
  wctc.net
39
42
  webconnections.net
40
43
  webpages.charter.net
41
- ).freeze
44
+ ].freeze
42
45
 
43
46
  REGEX_CHECKS = {
44
- 'home. regex' => /^home\./,
45
- 'user. regex' => /^users?\./,
46
- 'sites. regex' => /^sites?\./,
47
- 'weebly' => /weebly\.com$/,
48
- 'wordpress' => /wordpress\.com$/,
49
- 'govoffice' => /govoffice\d?\.com$/,
50
- 'homestead' => /homestead\.com$/,
51
- 'wix.com' => /wix\.com$/,
52
- 'blogspot.com' => /blogspot\.com$/,
53
- 'tripod.com' => /tripod\.com$/,
47
+ 'home. regex' => /^home\./,
48
+ 'user. regex' => /^users?\./,
49
+ 'sites. regex' => /^sites?\./,
50
+ 'weebly' => /weebly\.com$/,
51
+ 'wordpress' => /wordpress\.com$/,
52
+ 'govoffice' => /govoffice\d?\.com$/,
53
+ 'homestead' => /homestead\.com$/,
54
+ 'wix.com' => /wix\.com$/,
55
+ 'blogspot.com' => /blogspot\.com$/,
56
+ 'tripod.com' => /tripod\.com$/,
54
57
  'squarespace.com' => /squarespace\.com$/,
55
- 'github.io' => /github\.io$/,
56
- 'tumblr' => /tumblr\.com$/,
57
- 'locality' => Gman::Locality::REGEX
58
+ 'github.io' => /github\.io$/,
59
+ 'tumblr' => /tumblr\.com$/,
60
+ 'locality' => Gman::Locality::REGEX,
61
+ 'french edu' => /^ac-.*?\.fr/
58
62
  }.freeze
59
63
 
60
64
  def initialize(domains)
@@ -62,7 +66,7 @@ class Gman
62
66
  end
63
67
 
64
68
  def logger
65
- @logger ||= Logger.new(STDOUT)
69
+ @logger ||= Logger.new($stdout)
66
70
  end
67
71
 
68
72
  def normalize_domain(domain)
@@ -74,6 +78,7 @@ class Gman
74
78
  return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
75
79
  return false unless ensure_valid(domain)
76
80
  return false if !options[:skip_resolve] && !ensure_resolves(domain)
81
+
77
82
  true
78
83
  end
79
84
 
@@ -81,6 +86,7 @@ class Gman
81
86
  # rather than a bool and silence log output
82
87
  def reject(domain, reason)
83
88
  return reason if ENV['RECONCILING']
89
+
84
90
  logger.info "👎 `#{domain}`: #{reason}"
85
91
  false
86
92
  end
@@ -101,13 +107,14 @@ class Gman
101
107
  end
102
108
 
103
109
  def resolver
104
- @resolver ||= Resolv::DNS.new(nameserver: ['8.8.8.8', '8.8.4.4'])
110
+ @resolver ||= Resolv::DNS.new(nameserver: ['1.1.1.1', '8.8.8.8'])
105
111
  end
106
112
 
107
113
  # Verifies that the given domain is an IP address or has an NS or MX record, and thus is valid
108
114
  def domain_resolves?(domain)
109
115
  domain = Addressable::URI.new(host: domain).normalize.host
110
116
  return true if ip?(domain)
117
+
111
118
  returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
112
119
  end
113
120
 
@@ -115,16 +122,17 @@ class Gman
115
122
 
116
123
  def ensure_regex(domain)
117
124
  REGEX_CHECKS.each do |msg, regex|
118
- return reject(domain, msg) if domain =~ regex
125
+ return reject(domain, msg) if domain&.match?(regex)
119
126
  end
120
127
  true
121
128
  end
122
129
 
123
130
  def ensure_valid(domain)
124
131
  return false if domain.empty?
132
+
125
133
  if BLACKLIST.include?(domain)
126
134
  reject(domain, 'blacklist')
127
- elsif !PublicSuffix.valid?(".#{domain}")
135
+ elsif !PublicSuffix.valid?("foo.#{domain}")
128
136
  reject(domain, 'invalid')
129
137
  elsif Swot.is_academic?(domain)
130
138
  reject(domain, 'academic')
@@ -135,11 +143,13 @@ class Gman
135
143
 
136
144
  def ensure_resolves(domain)
137
145
  return reject(domain, 'unresolvable') unless domain_resolves?(domain)
146
+
138
147
  true
139
148
  end
140
149
 
141
150
  def ensure_not_dupe(domain)
142
151
  return true unless dupe?(domain)
152
+
143
153
  if current.domains.include?(domain)
144
154
  reject(domain, 'duplicate')
145
155
  else
@@ -153,14 +163,14 @@ class Gman
153
163
  end
154
164
 
155
165
  def normalize_domains!
156
- domain_list.to_h.each do |_group, domains|
166
+ domain_list.to_h.each_value do |domains|
157
167
  domains.map! { |domain| normalize_domain(domain) }
158
168
  domains.uniq!
159
169
  end
160
170
  end
161
171
 
162
172
  def ensure_validity!(options = {})
163
- domain_list.data.each do |_group, domains|
173
+ domain_list.data.each_value do |domains|
164
174
  domains.select! { |domain| valid_domain?(domain, options) }
165
175
  end
166
176
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Gman
2
4
  # Second level .us domains for states and locality
3
5
  # See http://en.wikipedia.org/wiki/.us
@@ -12,18 +14,18 @@ class Gman
12
14
  # * k12.il.us
13
15
  # * ci.foo.zx.us
14
16
  class Locality
15
- AFFINITY_NAMESPACES = %w(state dst cog).freeze
17
+ AFFINITY_NAMESPACES = %w[state dst cog].freeze
16
18
 
17
- STATES = %w(
19
+ STATES = %w[
18
20
  ak al ar az ca co ct dc de fl ga hi ia id il in ks ky
19
21
  la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh
20
22
  ok or pa ri sc sd tn tx um ut va vt wa wi wv wy
21
- ).freeze
23
+ ].freeze
22
24
 
23
- LOCALITY_DOMAINS = %w(
25
+ LOCALITY_DOMAINS = %w[
24
26
  ci co borough boro city county
25
27
  parish town twp vi vil village
26
- ).freeze
28
+ ].freeze
27
29
 
28
30
  REGEX = /
29
31
  (
@@ -31,7 +33,7 @@ class Gman
31
33
  |
32
34
  (#{Regexp.union(LOCALITY_DOMAINS)})\.[a-z-]+
33
35
  )\.(#{Regexp.union(STATES)})\.us
34
- /x
36
+ /x.freeze
35
37
 
36
38
  def self.valid?(domain)
37
39
  !domain.to_s.match(Locality::REGEX).nil?
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Gman
2
- VERSION = '7.0.0'.freeze
4
+ VERSION = '7.0.5'
3
5
  end
data/script/add CHANGED
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Add one or more domains to a given group, running the standard import checks
4
6
  #
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Alphabetizes entries in the domains.txt file
4
6
  #
@@ -2,7 +2,7 @@
2
2
 
3
3
  set -ex
4
4
 
5
- bundle exec rake test
5
+ bundle exec rspec
6
6
  bundle exec rubocop -D -S -a
7
7
  bundle exec script/dedupe
8
8
  bundle exec gem build gman.gemspec
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'yaml'
4
5
  require 'open-uri'
@@ -12,7 +13,7 @@ puts "Current list contains #{current.count} domains..."
12
13
 
13
14
  dupe = current.count - current.domains.uniq.count
14
15
  puts "Found #{dupe} duplicate domains"
15
- exit 0 if dupe == 0
16
+ exit 0 if dupe.zero?
16
17
 
17
18
  dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
18
19
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ruby-prof'
4
5
  require './lib/gman'
@@ -17,4 +18,4 @@ end
17
18
 
18
19
  result = RubyProf.stop
19
20
  printer = RubyProf::FlatPrinter.new(result)
20
- printer.print(STDOUT)
21
+ printer.print($stdout)
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Given an array of domains, removes them from the list
3
5
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
6
 
@@ -12,9 +14,9 @@ list = File.open('./config/domains.txt').read
12
14
  puts "Starting list: #{Gman::DomainList.current.count} domains"
13
15
 
14
16
  domains.each do |domain|
15
- list.gsub!(/^#{domain}$\n/, '')
17
+ list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
16
18
  end
17
19
 
18
- puts "Ending list: #{Gman::DomainList.current.count} domains"
19
-
20
20
  File.write './config/domains.txt', list
21
+
22
+ puts "Ending list: #{Gman::DomainList.current.count} domains"
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Reconciles the USA.gov-maintained list of US domains with domains.txt
4
6
  # to show domains listed in the USA.gov-maintained list that we reject and why
@@ -12,7 +14,7 @@ ENV['RECONCILING'] = 'true'
12
14
  blacklist = ['usagovQUASI']
13
15
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
14
16
 
15
- data = open(source).read
17
+ data = URI.open(source).read
16
18
  data = data.split('_' * 74)
17
19
  data = data.last.strip
18
20
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -20,7 +22,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
20
22
  domains = {}
21
23
  group = ''
22
24
  data.each do |row|
23
- if row =~ /^\w/
25
+ if /^\w/.match?(row)
24
26
  group = row
25
27
  domains[group] = []
26
28
  else
@@ -33,7 +35,7 @@ importer = Gman::Importer.new(domains)
33
35
 
34
36
  importer.logger.info "Starting with #{importer.domains.count} domains"
35
37
 
36
- importer.domains.list.each do |_group, d|
38
+ importer.domains.list.each_value do |d|
37
39
  d.map! { |domain| Gman.new(domain).to_s }
38
40
  d.map! { |domain| importer.normalize_domain(domain) }
39
41
  end
@@ -44,6 +46,7 @@ importer.logger.info "Filtered down to #{count} normalized domains"
44
46
  missing = {}
45
47
  importer.domains.list.each do |g, usagovdomains|
46
48
  next unless importer.current.list[g]
49
+
47
50
  missing[g] = importer.current.list[g] - usagovdomains
48
51
  end
49
52
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
@@ -6,7 +7,7 @@ require './lib/gman'
6
7
 
7
8
  url = 'https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv'
8
9
 
9
- domains = open(url).read.encode('UTF-8')
10
+ domains = URI.open(url).read.encode('UTF-8')
10
11
  domains = CSV.parse(domains, headers: true)
11
12
  domains = domains.map { |row| row['Domain Name'] }
12
13
 
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'csv'
4
5
  require 'open-uri'
@@ -6,7 +7,7 @@ require './lib/gman'
6
7
 
7
8
  url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
8
9
 
9
- csv = open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
+ csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
10
11
 
11
12
  # For some reason, the header row is actually the last row
12
13
  # Pop the last line off the file and prepend it at the beginning
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
3
5
 
4
6
  require 'fileutils'
@@ -1,4 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Propagates an initial list of best-guess government domains
3
5
 
4
6
  require 'public_suffix'
@@ -6,21 +8,21 @@ require 'yaml'
6
8
  require_relative '../lib/gman'
7
9
 
8
10
  # https://gist.github.com/benbalter/6147066
9
- REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
11
+ REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i.freeze
10
12
 
11
13
  domains = []
12
14
  PublicSuffix::List.default.each do |rule|
13
15
  domain = nil
14
16
 
15
17
  if rule.parts.length == 1
16
- domain = rule.parts.first if ".#{rule.value}" =~ REGEX
17
- elsif ".#{rule.value}" =~ REGEX
18
+ domain = rule.parts.first if REGEX.match?(".#{rule.value}")
19
+ elsif REGEX.match?(".#{rule.value}")
18
20
  domain = rule.parts.pop(2).join('.')
19
21
  end
20
22
 
21
23
  domains.push domain unless domain.nil? || domains.include?(domain)
22
24
  end
23
25
 
24
- # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
+ # NOTE: We want to skip resolution here, because a domain like `gov.sv` may be
25
27
  # a valid TLD, not have any top-level sites, and we'd still want it listed
26
28
  Gman::Importer.new('non-us gov' => domains).import(skip_resolve: true)
@@ -1,4 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'mechanize'
4
5
  require 'csv'
@@ -14,7 +15,7 @@ response = agent.submit(form, submit_button)
14
15
 
15
16
  rows = CSV.parse(response.content, headers: true, col_sep: "\t")
16
17
  domains = rows.map do |row|
17
- row['Webbadress'] unless row['Namn'] =~ /UNIVERSITET/
18
+ row['Webbadress'] unless /UNIVERSITET/.match?(row['Namn'])
18
19
  end
19
20
 
20
21
  Gman::Importer.new('Swedish Administrative Authorities' => domains).import
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the Swot-maintained list of academic domains into config/academic.txt
4
6
  # Source: https://github.com/leereilly/swot/
@@ -12,7 +14,7 @@
12
14
  #
13
15
  # Note: We do this, because as a bajillion individual files, Swot takes up 30MB
14
16
 
15
- require './lib/gman'
17
+ require 'gman'
16
18
  require 'swot'
17
19
 
18
20
  # Generate array of all Swot domains
@@ -1,4 +1,6 @@
1
1
  #! /usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  # Vendors the USA.gov-maintained list of US domains into domains.txt
4
6
  # Source: https://github.com/GSA-OCSIT/govt-urls
@@ -13,10 +15,10 @@
13
15
  require './lib/gman'
14
16
  require 'open-uri'
15
17
 
16
- blacklist = %w(usagovQUASI usagovFEDgov)
18
+ blacklist = %w[usagovQUASI usagovFEDgov]
17
19
  source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
18
20
 
19
- data = open(source).read
21
+ data = URI.open(source).read
20
22
  data = data.split('_' * 74)
21
23
  data = data.last.strip
22
24
  data = data.split(/\r?\n/).reject(&:empty?)
@@ -24,7 +26,7 @@ data = data.split(/\r?\n/).reject(&:empty?)
24
26
  domains = {}
25
27
  group = ''
26
28
  data.each do |row|
27
- if row =~ /^\w/
29
+ if /^\w/.match?(row)
28
30
  group = row
29
31
  domains[group] = []
30
32
  else
@@ -0,0 +1,4 @@
1
+ // foo
2
+ bar.gov
3
+ baz.net
4
+ !mail.bar.gov
File without changes