gman 5.0.6 → 5.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8d98b3b52245844954dec2db49e0016b4c00f620
4
- data.tar.gz: d374b390f178b4f2e02e3eea1ccb770891d46070
3
+ metadata.gz: 5836f10b4bf15f1e7c91b197095ae1dff999bc04
4
+ data.tar.gz: 49032b8c90816c1fd6a52268f3f212520d07cd99
5
5
  SHA512:
6
- metadata.gz: c23d968dd4a9bdce198794f4477c645c99642f95cb9134858fed569dea4322a0eb8ec64b08752fd0246389f693d777ede3c6881e5b41c9aa64d7a865d91b3237
7
- data.tar.gz: 67c4672fd6bcf4bfbc6705120d31e603903ce44dc2307a0407de644e5712b766ec9fe4ae90f32a3790a51ef4f1ec4efc3c4950ddcd6c0273eb7b343ba31c090e
6
+ metadata.gz: ba1591c651effddcd8b654316b1ca8a325a705461cb2963cf1a908b1db92e3b2eb3cba0c5ef547103ccc9d7941d2256da5ec720160625b28b91a74a8965c8ff8
7
+ data.tar.gz: f9dea4ab2d0f97788c479c9f0e8aa5dc9fdda278038c5e201c867d60f6aaf6c2ffefb8c066acae63f1a9bdef535b29424afffbd31a6d2a78047bd50fa862c8ee
data/config/domains.txt CHANGED
@@ -1201,7 +1201,6 @@ village.hazelton.bc.ca
1201
1201
  village.longview.ab.ca
1202
1202
  village.memramcook.com
1203
1203
  village.merrickville-wolford.on.ca
1204
- village.nobleford.ab.ca
1205
1204
  village.stantoine.nb.ca
1206
1205
  village.westport.on.ca
1207
1206
  villagecharlo.com
@@ -1451,7 +1450,6 @@ riigikogu.ee
1451
1450
  siseministeerium.ee
1452
1451
  sm.ee
1453
1452
  tja.ee
1454
- valitus.ee
1455
1453
  vm.ee
1456
1454
 
1457
1455
  // Finland
@@ -2929,6 +2927,8 @@ gob.mx
2929
2927
  gob.pa
2930
2928
  gob.pe
2931
2929
  gob.pk
2930
+ gob.sv
2931
+ gob.ve
2932
2932
  gouv.bj
2933
2933
  gouv.ci
2934
2934
  gouv.fr
@@ -3062,6 +3062,7 @@ govt.nz
3062
3062
  gub.uy
3063
3063
  leg.br
3064
3064
  lg.jp
3065
+ mil.tr
3065
3066
  nic.in
3066
3067
  onroerenderfgoed.be
3067
3068
 
@@ -4183,6 +4184,7 @@ jeffco.us
4183
4184
  kitcarsoncounty.org
4184
4185
  lakewood.org
4185
4186
  littletongov.org
4187
+ metrodenvercfc.org
4186
4188
  metromayors.org
4187
4189
  minturn.org
4188
4190
  mountain-village.co.us
@@ -4192,7 +4194,6 @@ orchardcityco.org
4192
4194
  parachutecolorado.com
4193
4195
  parkco.us
4194
4196
  parkeronline.org
4195
- peakcfc.com
4196
4197
  plattevillegov.org
4197
4198
  prowerscounty.net
4198
4199
  pueblo.org
@@ -4322,6 +4323,7 @@ waterfordct.org
4322
4323
  watertownct.org
4323
4324
  west-hartford.com
4324
4325
  westbrookct.us
4326
+ weston-ct.com
4325
4327
  wethersfieldct.com
4326
4328
  willingtonct.org
4327
4329
  wiltonct.org
@@ -5041,7 +5043,6 @@ adeliowa.org
5041
5043
  aftoniowa.com
5042
5044
  akronia.org
5043
5045
  altoona-iowa.com
5044
- anitaiowa.com
5045
5046
  aplingtonia.com
5046
5047
  arnoldsparkcity.com
5047
5048
  baxter-iowa.com
@@ -7759,7 +7760,6 @@ wrightschool.org
7759
7760
  yadkinville.org
7760
7761
 
7761
7762
  // usagovND
7762
- ashley-nd.com
7763
7763
  beulahnd.org
7764
7764
  bismarck.org
7765
7765
  bismarckairport.com
@@ -7839,7 +7839,6 @@ mcville.com
7839
7839
  medorand.com
7840
7840
  mercercountynd.com
7841
7841
  michigannd.com
7842
- milnornd.com
7843
7842
  minnewaukan.com
7844
7843
  minotnd.org
7845
7844
  mohallndak.com
@@ -7869,7 +7868,6 @@ rutlandnd.com
7869
7868
  sargentnd.com
7870
7869
  sheridan.nd.us
7871
7870
  sherwoodnd.com
7872
- sourisnd.com
7873
7871
  stanleynd.com
7874
7872
  stantonnd.com
7875
7873
  steelend.com
@@ -8154,7 +8152,6 @@ brigantinebeachnj.com
8154
8152
  brooklawn-nj.com
8155
8153
  brooklawn.us
8156
8154
  buenaboro.org
8157
- buenavistatownship.org
8158
8155
  burlingtonnj.us
8159
8156
  butlerborough.com
8160
8157
  bwhnj.com
@@ -8736,7 +8733,6 @@ poncatribe-ne.org
8736
8733
  portlions.net
8737
8734
  potawatomi.org
8738
8735
  powhatan.org
8739
- prairieisland.org
8740
8736
  pueblodecochiti.org
8741
8737
  puyallup-tribe.com
8742
8738
  redding-rancheria.com
@@ -8788,7 +8784,6 @@ accessesmeralda.com
8788
8784
  bcnv.org
8789
8785
  carson.org
8790
8786
  churchillcounty.org
8791
- cityoffallon.org
8792
8787
  cityoffernley.org
8793
8788
  cityofhenderson.com
8794
8789
  cityofnorthlasvegas.com
@@ -9175,7 +9170,6 @@ suffernvillage.com
9175
9170
  sylvanbeachny.com
9176
9171
  syrgov.net
9177
9172
  taghkanic.org
9178
- tannersvilleny.org
9179
9173
  tarrytowngov.com
9180
9174
  thurman-ny.com
9181
9175
  tiogacountyny.com
@@ -9396,7 +9390,6 @@ townofwheatland.org
9396
9390
  townofwillsboro.com
9397
9391
  townofwilmington.org
9398
9392
  townofwilton.com
9399
- townofwindham.com
9400
9393
  townofwoodbury.com
9401
9394
  townverona.org
9402
9395
  townwalworthny.com
@@ -9425,7 +9418,6 @@ villageofbarneveld.org
9425
9418
  villageofbath.org
9426
9419
  villageofbergen.com
9427
9420
  villageofbridgewater.org
9428
- villageofbrocton.com
9429
9421
  villageofbronxville.com
9430
9422
  villageofbuchanan.com
9431
9423
  villageofcapevincent.org
@@ -10277,7 +10269,6 @@ paehealth.com
10277
10269
  palmertonborough.com
10278
10270
  palmertwp.com
10279
10271
  paradisetownship.com
10280
- parkercity.org
10281
10272
  parkesburg.org
10282
10273
  parksideboro.com
10283
10274
  patientsafetyauthority.org
@@ -10590,7 +10581,6 @@ greatfallssc.net
10590
10581
  greenvillecounty.org
10591
10582
  hamptoncountysc.org
10592
10583
  hamptonsc.net
10593
- hartsvillesc.com
10594
10584
  horrycounty.org
10595
10585
  iop.net
10596
10586
  jamesislandsc.us
@@ -10622,7 +10612,6 @@ patriotspoint.org
10622
10612
  port-of-charleston.com
10623
10613
  portroyal.org
10624
10614
  prosperitysc.com
10625
- richburgsc.net
10626
10615
  richlandonline.com
10627
10616
  ridgespringsc.com
10628
10617
  santeecooper.com
@@ -10666,12 +10655,10 @@ townofbriarcliffe.us
10666
10655
  townofcampobello.com
10667
10656
  townofedistobeach.com
10668
10657
  townofgraycourt.net
10669
- townofheathsprings.org
10670
10658
  townofhollywood.org
10671
10659
  townofhoneapath.com
10672
10660
  townofirmosc.com
10673
10661
  townofiva.com
10674
- townofjamesislandsc.org
10675
10662
  townofkershaw.net
10676
10663
  townofmcclellanville-sc.net
10677
10664
  townofmccormicksc.org
@@ -11433,7 +11420,6 @@ yesvirginia.org
11433
11420
 
11434
11421
  // usagovVI
11435
11422
  gov.vi
11436
- governordejongh.com
11437
11423
  legvi.org
11438
11424
  nationalarchives.gov.vg
11439
11425
  usviber.org
@@ -15,7 +15,7 @@ class Gman
15
15
  end
16
16
 
17
17
  def domains
18
- list.values.flatten
18
+ list.values.flatten.sort.uniq
19
19
  end
20
20
 
21
21
  def count
data/lib/gman/importer.rb CHANGED
@@ -81,7 +81,9 @@ class Gman
81
81
  true
82
82
  end
83
83
 
84
+ # if RECONCILING=true, return the reason, rather than a bool and silence log output
84
85
  def reject(domain, reason)
86
+ return reason if ENV["RECONCILING"]
85
87
  logger.info "👎 `#{domain}`: #{reason}"
86
88
  false
87
89
  end
@@ -90,14 +92,14 @@ class Gman
90
92
  @current ||= DomainList.current
91
93
  end
92
94
 
93
- def import
95
+ def import(options={})
94
96
  logger.info "Current: #{Gman::DomainList.current.count} domains"
95
97
  logger.info "Adding: #{domains.count} domains"
96
98
 
97
99
  domains.list.each do |group, domains|
98
100
  domains.map! { |domain| Gman.new(domain).to_s }
99
101
  domains.map! { |domain| normalize_domain(domain) }
100
- domains.select! { |domain| valid_domain?(domain) }
102
+ domains.select! { |domain| valid_domain?(domain, options) }
101
103
  end
102
104
 
103
105
  logger.info "Filtered to: #{domains.count} domains"
@@ -142,7 +144,7 @@ class Gman
142
144
  end
143
145
 
144
146
  class Gman
145
- def self.import(hash)
146
- Gman::Importer.new(hash).import
147
+ def self.import(hash, options={})
148
+ Gman::Importer.new(hash).import(options)
147
149
  end
148
150
  end
data/lib/gman/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  class Gman
2
- VERSION = '5.0.6'
2
+ VERSION = '5.0.7'
3
3
  end
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Reconciles the USA.gov-maintained list of US domains with domains.txt
4
+ # to show domains listed in the USA.gov-maintained list that we reject and why
5
+ #
6
+ # Usage: script/reconcile-us
7
+
8
+ require './lib/gman/importer'
9
+ require 'yaml'
10
+
11
+ ENV["RECONCILING"] = "true"
12
+ blacklist = ["usagovQUASI"]
13
+ source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
14
+
15
+ data = open(source).read
16
+ data = data.split("__________________________________________________________________________")
17
+ data = data.last.strip
18
+ data = data.split(/\r?\n/).reject { |r| r.empty? }
19
+
20
+ domains = {}
21
+ group = ""
22
+ data.each do |row|
23
+ if row =~ /^\w/
24
+ group = row
25
+ domains[group] = []
26
+ else
27
+ domains[group].push row.sub("\.\t", "").strip
28
+ end
29
+ end
30
+
31
+ domains.reject! { |group,domain| blacklist.include?(group) }
32
+ importer = Gman::Importer.new(domains)
33
+
34
+ importer.logger.info "Starting with #{importer.domains.count} domains"
35
+
36
+ importer.domains.list.each do |group, domains|
37
+ domains.map! { |domain| Gman.new(domain).to_s }
38
+ domains.map! { |domain| importer.normalize_domain(domain) }
39
+ end
40
+
41
+ importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"
42
+
43
+ missing = {}
44
+ importer.domains.list.each do |group, usagovdomains|
45
+ next unless importer.current.list[group]
46
+ missing[group] = importer.current.list[group] - usagovdomains
47
+ end
48
+
49
+ missing.reject! { |key, value| value.empty? }
50
+
51
+ importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list"
52
+ puts "Here's the list of missing domains:"
53
+ puts YAML.dump(missing)
54
+
55
+ domains = importer.domains.domains
56
+ domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
57
+ domains.delete(true)
58
+ domains.delete(false)
59
+ domains.delete("locality")
60
+
61
+ importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains"
62
+
63
+ puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
64
+ puts YAML.dump(domains)
@@ -1,5 +1,17 @@
1
1
  #!/bin/sh
2
+ #
3
+ # Vendors the full list of US .gov domains from https://github.com/GSA/data
4
+ # Usage: script/vendor-gov-list
2
5
 
3
- DATE=2015-03-15
6
+ # Set up
7
+ mkdir tmp
8
+ rm -Rf tmp/gsa-data
4
9
 
5
- wget "https://raw.githubusercontent.com/GSA/data/gh-pages/dotgov-domains/$DATE-full.csv" -O config/vendor/dotgovs.csv
10
+ # Vendor the last file in the dotgov-domains folder that ends in `-full.csv`
11
+ git clone https://github.com/GSA/data tmp/gsa-data
12
+ pattern="tmp/gsa-data/dotgov-domains/*-full.csv"
13
+ files=( $pattern )
14
+ cp -f "${files[@]:(-1)}" config/vendor/dotgovs.csv
15
+
16
+ # Clean up
17
+ rm -Rf tmp/gsa-data
@@ -22,4 +22,6 @@ PublicSuffix::List.default.each do |rule|
22
22
  domains.push domain unless domain.nil? or domains.include? domain
23
23
  end
24
24
 
25
- Gman.import("non-us gov" => domains)
25
+ # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
+ # a valid TLD, not have any top-level sites, and we'd still want it listed
27
+ Gman.import({"non-us gov" => domains}, :skip_resolve => true)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gman
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.0.6
4
+ version: 5.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-06 00:00:00.000000000 Z
11
+ date: 2015-10-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: swot
@@ -230,6 +230,7 @@ files:
230
230
  - script/dedupe
231
231
  - script/profile
232
232
  - script/prune
233
+ - script/reconcile-us
233
234
  - script/release
234
235
  - script/vendor
235
236
  - script/vendor-federal-de