gman 7.0.5 → 7.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/gman.gemspec CHANGED
@@ -18,13 +18,12 @@ Gem::Specification.new do |s|
18
18
  s.licenses = ['MIT']
19
19
 
20
20
  s.files = `git ls-files`.split("\n")
21
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
22
21
  s.executables = `git ls-files -- bin/*`.split("\n").map do |f|
23
22
  File.basename(f)
24
23
  end
25
24
 
26
25
  s.require_paths = ['lib']
27
- s.required_ruby_version = '~> 2.5'
26
+ s.required_ruby_version = '>= 2.5', '< 4.0'
28
27
 
29
28
  s.add_dependency('colored', '~> 1.2')
30
29
  s.add_dependency('iso_country_codes', '~> 0.6')
@@ -39,6 +38,8 @@ Gem::Specification.new do |s|
39
38
  s.add_development_dependency('rubocop', '~> 1.0')
40
39
  s.add_development_dependency('rubocop-performance', '~> 1.5')
41
40
  s.add_development_dependency('rubocop-rspec', '~> 2.0')
42
- s.add_development_dependency('ruby-prof', '~> 0.15')
41
+ s.add_development_dependency('ruby-prof', '~> 1.4')
42
+ s.add_development_dependency('ruby-progressbar', '~> 1.10')
43
43
  s.add_development_dependency('swot', '~> 1.0')
44
+ s.metadata['rubygems_mfa_required'] = 'true'
44
45
  end
@@ -26,7 +26,7 @@ class Gman
26
26
  def_hash_delegator :dotgov_listing, :Agency
27
27
  def_hash_delegator :dotgov_listing, :Organization
28
28
  def_hash_delegator :dotgov_listing, :City
29
- def_hash_delegator :dotgov_listing, :"Domain Type"
29
+ def_hash_delegator :dotgov_listing, :'Domain Type'
30
30
  private :domain_type
31
31
 
32
32
  def type
@@ -60,7 +60,7 @@ class Gman
60
60
  def federal?
61
61
  return false unless dotgov_listing
62
62
 
63
- domain_type =~ /^Federal Agency/i
63
+ domain_type =~ /^Federal/i
64
64
  end
65
65
 
66
66
  def city?
@@ -87,7 +87,7 @@ class Gman
87
87
  if matches
88
88
  matches[1] == 'state'
89
89
  elsif dotgov_listing
90
- domain_type == 'State/Local Govt'
90
+ domain_type == 'State/Local Govt' || domain_type == 'State'
91
91
  else
92
92
  false
93
93
  end
@@ -108,14 +108,14 @@ class Gman
108
108
  private
109
109
 
110
110
  def list_category
111
- @list_category ||= begin
112
- match = Gman.list.public_suffix_list.find(domain.to_s)
113
- return unless match
111
+ return @list_category if defined?(@list_category)
114
112
 
115
- regex = %r{// ([^\n]+)\n?[^/]*\n#{Regexp.escape(match.value)}\n}im
116
- matches = Gman.list.contents.match(regex)
117
- matches[1] if matches
118
- end
113
+ match = Gman.list.public_suffix_list.find(domain.to_s)
114
+ return @list_category = nil unless match
115
+
116
+ regex = %r{// ([^\n]+)\n?[^/]*\n#{Regexp.escape(match.value)}\n}im
117
+ matches = Gman.list.contents.match(regex)
118
+ @list_category = matches ? matches[1] : nil
119
119
  end
120
120
 
121
121
  def matches
data/lib/gman/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Gman
4
- VERSION = '7.0.5'
4
+ VERSION = '7.0.6'
5
5
  end
data/lib/gman.rb CHANGED
@@ -43,12 +43,11 @@ class Gman
43
43
  #
44
44
  # Returns boolean true if a government domain
45
45
  def valid?
46
- @valid ||= begin
47
- return false unless valid_domain?
48
- return false if academic?
46
+ return @valid if defined?(@valid)
49
47
 
50
- locality? || public_suffix_valid?
51
- end
48
+ @valid = false unless valid_domain?
49
+ @valid = false if academic?
50
+ @valid ||= locality? || public_suffix_valid?
52
51
  end
53
52
 
54
53
  def locality?
data/script/profile CHANGED
@@ -8,7 +8,7 @@ require './lib/gman'
8
8
  # without pre-loading the Gman list for an accurate benchmark
9
9
  count = (ARGV[0] || 100).to_i
10
10
  domains = File.readlines('./config/domains.txt')
11
- domains = domains.select { |l| l =~ /^[a-z0-9]/i }
11
+ domains = domains.grep(/^[a-z0-9]/i)
12
12
  domains = domains.sample(count)
13
13
 
14
14
  RubyProf.start
data/script/prune CHANGED
@@ -10,7 +10,7 @@ require_relative '../lib/gman/domain_list'
10
10
  domains = ARGV
11
11
  domains = domains.clone.map { |d| d.delete ',' }
12
12
 
13
- list = File.open('./config/domains.txt').read
13
+ list = File.read('./config/domains.txt')
14
14
  puts "Starting list: #{Gman::DomainList.current.count} domains"
15
15
 
16
16
  domains.each do |domain|
data/script/reconcile-us CHANGED
@@ -26,7 +26,7 @@ data.each do |row|
26
26
  group = row
27
27
  domains[group] = []
28
28
  else
29
- domains[group].push row.sub("\.\t", '').strip
29
+ domains[group].push row.sub(".\t", '').strip
30
30
  end
31
31
  end
32
32
 
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # ! /usr/bin/env ruby
4
+ # frozen_string_literal: true
5
+
6
+ #
7
+ # Validate every domain in the list, removing any that fail the importer's validity checks
8
+ #
9
+ # Usage: script/validate-domains (takes no arguments)
10
+
11
+ require './lib/gman/importer'
12
+ require 'parallel'
13
+
14
+ importer = Gman::Importer.new({})
15
+ options = { skip_dupe: true, skip_resolve: false }
16
+ list_path = File.expand_path '../config/domains.txt', __dir__
17
+
18
+ importer.logger.info "Starting list: #{Gman::DomainList.current.count} domains"
19
+
20
+ Gman.list.to_h.values.shuffle.each do |domains|
21
+ # next if ['non-us gov', 'non-us mil', 'US Federal'].include?(group)
22
+
23
+ Parallel.each(domains, progress: "Validating") do |domain|
24
+ next if domain.start_with?("!")
25
+ next if importer.valid_domain?(domain, options)
26
+
27
+ importer.logger.warn "#{domain} is not valid, removing from list"
28
+ list = File.read(list_path)
29
+ list.gsub!(/^#{Regexp.escape(domain)}$\n/, '')
30
+ File.write list_path, list
31
+ end
32
+ end
33
+
34
+ importer.logger.info "Ending list: #{Gman::DomainList.current.count} domains"
data/script/vendor CHANGED
@@ -10,4 +10,4 @@ for file in script/vendor-*; do
10
10
  fi
11
11
  done
12
12
 
13
- script/alphabetize
13
+ bundle exec script/alphabetize
@@ -3,15 +3,6 @@
3
3
  # Vendors the full list of US .gov domains from https://github.com/GSA/data
4
4
  # Usage: script/vendor-gov-list
5
5
 
6
- # Set up
7
- mkdir tmp
8
- rm -Rf tmp/gsa-data
9
-
10
6
  # Vendor the last file in the dotgov-domains folder that ends in `-full.csv`
11
- git clone https://github.com/GSA/data tmp/gsa-data
12
- pattern="tmp/gsa-data/dotgov-domains/*-full.csv"
13
- files=( $pattern )
14
- cp -f "${files[@]:(-1)}" config/vendor/dotgovs.csv
7
+ wget https://raw.githubusercontent.com/cisagov/dotgov-data/main/current-full.csv -O ./config/vendor/dotgovs.csv
15
8
 
16
- # Clean up
17
- rm -Rf tmp/gsa-data
data/script/vendor-us CHANGED
@@ -14,25 +14,29 @@
14
14
 
15
15
  require './lib/gman'
16
16
  require 'open-uri'
17
+ require 'csv'
17
18
 
19
+ path = File.expand_path('./vendor-us-tmp.csv')
18
20
  blacklist = %w[usagovQUASI usagovFEDgov]
19
- source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'
21
+ source = 'https://raw.githubusercontent.com/GSA/govt-urls/main/1_govt_urls_full.csv'
22
+ domains = {}
20
23
 
21
- data = URI.open(source).read
22
- data = data.split('_' * 74)
23
- data = data.last.strip
24
- data = data.split(/\r?\n/).reject(&:empty?)
24
+ begin
25
+ raw = URI.open(source).read
26
+ File.write(path, raw)
27
+ data = CSV.table(path)
25
28
 
26
- domains = {}
27
- group = ''
28
- data.each do |row|
29
- if /^\w/.match?(row)
30
- group = row
31
- domains[group] = []
32
- else
33
- domains[group].push row.sub("\.\t", '').strip
29
+ data.each do |domain|
30
+ next if domain[:type_of_government] == 'Quasigovernmental'
31
+
32
+ group = "US #{domain[:type_of_government]}"
33
+ group += " (#{domain[:state]})" if domain[:type_of_government] != 'Federal' && domain[:state]
34
+ domains[group] ||= []
35
+ domains[group] << domain[:domain_name]
34
36
  end
35
- end
36
37
 
37
- domains.reject! { |g, _| blacklist.include?(g) }
38
- Gman::Importer.new(domains).import
38
+ domains.reject! { |g, _| blacklist.include?(g) }
39
+ Gman::Importer.new(domains).import
40
+ ensure
41
+ File.delete(path)
42
+ end
@@ -69,7 +69,7 @@ RSpec.describe Gman::DomainList do
69
69
  end
70
70
 
71
71
  it 'outputs public suffix format' do
72
- expect(subject.to_s).to match("// Canada federal\ncanada\.ca\n")
72
+ expect(subject.to_s).to match("// Canada federal\ncanada.ca\n")
73
73
  end
74
74
 
75
75
  it "finds a domain's parent" do
@@ -80,7 +80,7 @@ RSpec.describe Gman::DomainList do
80
80
  let(:stubbed_file_contents) { File.read(stubbed_list_path) }
81
81
 
82
82
  before do
83
- subject.instance_variable_set('@path', stubbed_list_path)
83
+ subject.instance_variable_set(:@path, stubbed_list_path)
84
84
  end
85
85
 
86
86
  context 'with list data stubbed' do
@@ -137,7 +137,7 @@ RSpec.describe 'Gman identifier' do
137
137
  end
138
138
 
139
139
  context 'a county .gov' do
140
- let(:domain) { 'ALLEGHENYCOUNTYPA.GOV' }
140
+ let(:domain) { '211DUPAGE.GOV' }
141
141
 
142
142
  it "knows it's a county" do
143
143
  expect(subject).to be_a_county
@@ -161,11 +161,11 @@ RSpec.describe 'Gman identifier' do
161
161
  end
162
162
 
163
163
  it 'knows the state' do
164
- expect(subject.state).to eql('PA')
164
+ expect(subject.state).to eql('IL')
165
165
  end
166
166
 
167
167
  it 'knows the city' do
168
- expect(subject.city).to eql('Pittsburgh')
168
+ expect(subject.city).to eql('Wheaton')
169
169
  end
170
170
  end
171
171
 
@@ -203,8 +203,8 @@ RSpec.describe 'Gman identifier' do
203
203
  context "determining a domain's type" do
204
204
  {
205
205
  unknown: 'cityofperu.org',
206
- "Canada municipal": 'acme.ca',
207
- "Canada federal": 'canada.ca'
206
+ 'Canada municipal': 'acme.ca',
207
+ 'Canada federal': 'canada.ca'
208
208
  }.each do |expected, domain|
209
209
  context "Given the #{domain} domain" do
210
210
  let(:domain) { domain }
@@ -9,7 +9,7 @@ RSpec.describe Gman::Importer do
9
9
  let(:domain_list) { subject.domain_list }
10
10
 
11
11
  before do
12
- subject.instance_variable_set '@logger', logger
12
+ subject.instance_variable_set :@logger, logger
13
13
  end
14
14
 
15
15
  it 'inits the domain list' do
@@ -68,7 +68,7 @@ RSpec.describe Gman::Importer do
68
68
  let(:stubbed_list) { Gman::DomainList.new(path: stubbed_list_path) }
69
69
  let(:stubbed_file_contents) { File.read(stubbed_list_path) }
70
70
 
71
- before { subject.instance_variable_set '@current', stubbed_list }
71
+ before { subject.instance_variable_set :@current, stubbed_list }
72
72
 
73
73
  context 'writing' do
74
74
  before { @current = subject.current.to_s }
data/spec/gman_spec.rb CHANGED
@@ -18,7 +18,7 @@ RSpec.describe Gman do
18
18
 
19
19
  context 'invalid domains' do
20
20
  ['foo.bar.com', 'bar@foo.biz', 'http://www.foo.biz',
21
- 'foo.uk', 'gov', 'foo@k12.champaign.il.us', 'foo@kii.gov.by',
21
+ 'foo.uk', 'gov', 'foo@k12.champaign.il.us', # 'foo@kii.gov.by',
22
22
  'foo', '', nil, ' ', 'foo.city.il.us', 'foo.ci.il.us',
23
23
  'foo.zx.us', 'foo@mail.gov.ua', 'foo@gwu.edu'].each do |domain|
24
24
  subject { described_class.new(domain) }
data/spec/spec_helper.rb CHANGED
@@ -24,7 +24,7 @@ def stubbed_list_path
24
24
  end
25
25
 
26
26
  def with_env(key, value)
27
- old_env = ENV[key]
27
+ old_env = ENV.fetch(key, nil)
28
28
  ENV[key] = value
29
29
  yield
30
30
  ENV[key] = old_env
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gman
3
3
  version: !ruby/object:Gem::Version
4
- version: 7.0.5
4
+ version: 7.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-13 00:00:00.000000000 Z
11
+ date: 2022-12-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: colored
@@ -184,14 +184,28 @@ dependencies:
184
184
  requirements:
185
185
  - - "~>"
186
186
  - !ruby/object:Gem::Version
187
- version: '0.15'
187
+ version: '1.4'
188
188
  type: :development
189
189
  prerelease: false
190
190
  version_requirements: !ruby/object:Gem::Requirement
191
191
  requirements:
192
192
  - - "~>"
193
193
  - !ruby/object:Gem::Version
194
- version: '0.15'
194
+ version: '1.4'
195
+ - !ruby/object:Gem::Dependency
196
+ name: ruby-progressbar
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - "~>"
200
+ - !ruby/object:Gem::Version
201
+ version: '1.10'
202
+ type: :development
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - "~>"
207
+ - !ruby/object:Gem::Version
208
+ version: '1.10'
195
209
  - !ruby/object:Gem::Dependency
196
210
  name: swot
197
211
  requirement: !ruby/object:Gem::Requirement
@@ -220,17 +234,21 @@ files:
220
234
  - ".github/ISSUE_TEMPLATE/bug_report.md"
221
235
  - ".github/ISSUE_TEMPLATE/feature_request.md"
222
236
  - ".github/config.yml"
237
+ - ".github/dependabot.yml"
223
238
  - ".github/funding.yml"
224
239
  - ".github/no-response.yml"
225
240
  - ".github/release-drafter.yml"
226
241
  - ".github/settings.yml"
227
242
  - ".github/stale.yml"
243
+ - ".github/workflows/ci.yml"
244
+ - ".github/workflows/clean.yml"
245
+ - ".github/workflows/codeql-analysis.yml"
246
+ - ".github/workflows/validate.yml"
247
+ - ".github/workflows/vendor.yml"
228
248
  - ".gitignore"
229
249
  - ".rspec"
230
250
  - ".rubocop.yml"
231
251
  - ".rubocop_todo.yml"
232
- - ".ruby-version"
233
- - ".travis.yml"
234
252
  - Gemfile
235
253
  - LICENSE
236
254
  - bin/gman
@@ -261,13 +279,11 @@ files:
261
279
  - script/prune
262
280
  - script/reconcile-us
263
281
  - script/release
282
+ - script/validate-domains
264
283
  - script/vendor
265
284
  - script/vendor-federal-de
266
285
  - script/vendor-gov-list
267
- - script/vendor-municipal-de
268
- - script/vendor-nl
269
286
  - script/vendor-public-suffix
270
- - script/vendor-se
271
287
  - script/vendor-swot
272
288
  - script/vendor-us
273
289
  - spec/fixtures/domains.txt
@@ -284,35 +300,28 @@ files:
284
300
  homepage: https://github.com/benbalter/gman
285
301
  licenses:
286
302
  - MIT
287
- metadata: {}
303
+ metadata:
304
+ rubygems_mfa_required: 'true'
288
305
  post_install_message:
289
306
  rdoc_options: []
290
307
  require_paths:
291
308
  - lib
292
309
  required_ruby_version: !ruby/object:Gem::Requirement
293
310
  requirements:
294
- - - "~>"
311
+ - - ">="
295
312
  - !ruby/object:Gem::Version
296
313
  version: '2.5'
314
+ - - "<"
315
+ - !ruby/object:Gem::Version
316
+ version: '4.0'
297
317
  required_rubygems_version: !ruby/object:Gem::Requirement
298
318
  requirements:
299
319
  - - ">="
300
320
  - !ruby/object:Gem::Version
301
321
  version: '0'
302
322
  requirements: []
303
- rubygems_version: 3.0.3
323
+ rubygems_version: 3.2.33
304
324
  signing_key:
305
325
  specification_version: 4
306
326
  summary: Check if a given domain or email address belongs to a government entity
307
- test_files:
308
- - spec/fixtures/domains.txt
309
- - spec/fixtures/obama.txt
310
- - spec/gman/bin_spec.rb
311
- - spec/gman/country_code_spec.rb
312
- - spec/gman/domain_list_spec.rb
313
- - spec/gman/domains_spec.rb
314
- - spec/gman/identifier_spec.rb
315
- - spec/gman/importer_spec.rb
316
- - spec/gman/locality_spec.rb
317
- - spec/gman_spec.rb
318
- - spec/spec_helper.rb
327
+ test_files: []
data/.ruby-version DELETED
@@ -1 +0,0 @@
1
- 2.6.6
data/.travis.yml DELETED
@@ -1,4 +0,0 @@
1
- langauage: ruby
2
- script: "script/cibuild"
3
- sudo: false
4
- cache: bundler
@@ -1,23 +0,0 @@
1
- #! /usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require 'csv'
5
- require 'open-uri'
6
- require './lib/gman'
7
-
8
- url = 'http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv'
9
-
10
- csv = URI.open(url).read.force_encoding('iso-8859-1').encode('UTF-8')
11
-
12
- # For some reason, the header row is actually the last row
13
- # Pop the last line off the file and prepend it at the beginning
14
- # So that when we pass it to CSV it detects the headers properly
15
- lines = csv.split("\n")
16
- lines.unshift lines.pop
17
- csv = lines.join("\n")
18
-
19
- # Load municipal domains
20
- data = CSV.parse(csv, headers: true, col_sep: ';')
21
- domains = data.map { |row| row['Internet'] }
22
-
23
- Gman::Importer.new('German Municipalities' => domains).import
data/script/vendor-nl DELETED
@@ -1,21 +0,0 @@
1
- #! /usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
5
-
6
- require 'fileutils'
7
- require './lib/gman'
8
-
9
- FileUtils.rm_rf('almanak.overheid.nl')
10
- commands = [
11
- "wget -q -r -nc -np https://almanak.overheid.nl/
12
- grep @ -rI almanak.overheid.nl/",
13
- 'cut -f 2 -d @',
14
- "cut -f 1 -d '\"'",
15
- 'grep \\.nl$',
16
- 'sort',
17
- 'uniq'
18
- ]
19
- domains = system commands.join('|')
20
-
21
- Gman::Importer.new('Netherlands' => domains.split("\n")).import
data/script/vendor-se DELETED
@@ -1,21 +0,0 @@
1
- #! /usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require 'mechanize'
5
- require 'csv'
6
- require './lib/gman'
7
-
8
- url = 'http://www.myndighetsregistret.scb.se/Myndighet.aspx'
9
- agent = Mechanize.new
10
- page = agent.get(url)
11
- form = page.forms.first
12
- form.radiobuttons.find { |r| r.value = 'Textfil' }.check
13
- submit_button = form.buttons.find { |b| b.type == 'submit' }
14
- response = agent.submit(form, submit_button)
15
-
16
- rows = CSV.parse(response.content, headers: true, col_sep: "\t")
17
- domains = rows.map do |row|
18
- row['Webbadress'] unless /UNIVERSITET/.match?(row['Namn'])
19
- end
20
-
21
- Gman::Importer.new('Swedish Administrative Authorities' => domains).import