gman 5.0.6 → 5.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8d98b3b52245844954dec2db49e0016b4c00f620
4
- data.tar.gz: d374b390f178b4f2e02e3eea1ccb770891d46070
3
+ metadata.gz: 5836f10b4bf15f1e7c91b197095ae1dff999bc04
4
+ data.tar.gz: 49032b8c90816c1fd6a52268f3f212520d07cd99
5
5
  SHA512:
6
- metadata.gz: c23d968dd4a9bdce198794f4477c645c99642f95cb9134858fed569dea4322a0eb8ec64b08752fd0246389f693d777ede3c6881e5b41c9aa64d7a865d91b3237
7
- data.tar.gz: 67c4672fd6bcf4bfbc6705120d31e603903ce44dc2307a0407de644e5712b766ec9fe4ae90f32a3790a51ef4f1ec4efc3c4950ddcd6c0273eb7b343ba31c090e
6
+ metadata.gz: ba1591c651effddcd8b654316b1ca8a325a705461cb2963cf1a908b1db92e3b2eb3cba0c5ef547103ccc9d7941d2256da5ec720160625b28b91a74a8965c8ff8
7
+ data.tar.gz: f9dea4ab2d0f97788c479c9f0e8aa5dc9fdda278038c5e201c867d60f6aaf6c2ffefb8c066acae63f1a9bdef535b29424afffbd31a6d2a78047bd50fa862c8ee
data/config/domains.txt CHANGED
@@ -1201,7 +1201,6 @@ village.hazelton.bc.ca
1201
1201
  village.longview.ab.ca
1202
1202
  village.memramcook.com
1203
1203
  village.merrickville-wolford.on.ca
1204
- village.nobleford.ab.ca
1205
1204
  village.stantoine.nb.ca
1206
1205
  village.westport.on.ca
1207
1206
  villagecharlo.com
@@ -1451,7 +1450,6 @@ riigikogu.ee
1451
1450
  siseministeerium.ee
1452
1451
  sm.ee
1453
1452
  tja.ee
1454
- valitus.ee
1455
1453
  vm.ee
1456
1454
 
1457
1455
  // Finland
@@ -2929,6 +2927,8 @@ gob.mx
2929
2927
  gob.pa
2930
2928
  gob.pe
2931
2929
  gob.pk
2930
+ gob.sv
2931
+ gob.ve
2932
2932
  gouv.bj
2933
2933
  gouv.ci
2934
2934
  gouv.fr
@@ -3062,6 +3062,7 @@ govt.nz
3062
3062
  gub.uy
3063
3063
  leg.br
3064
3064
  lg.jp
3065
+ mil.tr
3065
3066
  nic.in
3066
3067
  onroerenderfgoed.be
3067
3068
 
@@ -4183,6 +4184,7 @@ jeffco.us
4183
4184
  kitcarsoncounty.org
4184
4185
  lakewood.org
4185
4186
  littletongov.org
4187
+ metrodenvercfc.org
4186
4188
  metromayors.org
4187
4189
  minturn.org
4188
4190
  mountain-village.co.us
@@ -4192,7 +4194,6 @@ orchardcityco.org
4192
4194
  parachutecolorado.com
4193
4195
  parkco.us
4194
4196
  parkeronline.org
4195
- peakcfc.com
4196
4197
  plattevillegov.org
4197
4198
  prowerscounty.net
4198
4199
  pueblo.org
@@ -4322,6 +4323,7 @@ waterfordct.org
4322
4323
  watertownct.org
4323
4324
  west-hartford.com
4324
4325
  westbrookct.us
4326
+ weston-ct.com
4325
4327
  wethersfieldct.com
4326
4328
  willingtonct.org
4327
4329
  wiltonct.org
@@ -5041,7 +5043,6 @@ adeliowa.org
5041
5043
  aftoniowa.com
5042
5044
  akronia.org
5043
5045
  altoona-iowa.com
5044
- anitaiowa.com
5045
5046
  aplingtonia.com
5046
5047
  arnoldsparkcity.com
5047
5048
  baxter-iowa.com
@@ -7759,7 +7760,6 @@ wrightschool.org
7759
7760
  yadkinville.org
7760
7761
 
7761
7762
  // usagovND
7762
- ashley-nd.com
7763
7763
  beulahnd.org
7764
7764
  bismarck.org
7765
7765
  bismarckairport.com
@@ -7839,7 +7839,6 @@ mcville.com
7839
7839
  medorand.com
7840
7840
  mercercountynd.com
7841
7841
  michigannd.com
7842
- milnornd.com
7843
7842
  minnewaukan.com
7844
7843
  minotnd.org
7845
7844
  mohallndak.com
@@ -7869,7 +7868,6 @@ rutlandnd.com
7869
7868
  sargentnd.com
7870
7869
  sheridan.nd.us
7871
7870
  sherwoodnd.com
7872
- sourisnd.com
7873
7871
  stanleynd.com
7874
7872
  stantonnd.com
7875
7873
  steelend.com
@@ -8154,7 +8152,6 @@ brigantinebeachnj.com
8154
8152
  brooklawn-nj.com
8155
8153
  brooklawn.us
8156
8154
  buenaboro.org
8157
- buenavistatownship.org
8158
8155
  burlingtonnj.us
8159
8156
  butlerborough.com
8160
8157
  bwhnj.com
@@ -8736,7 +8733,6 @@ poncatribe-ne.org
8736
8733
  portlions.net
8737
8734
  potawatomi.org
8738
8735
  powhatan.org
8739
- prairieisland.org
8740
8736
  pueblodecochiti.org
8741
8737
  puyallup-tribe.com
8742
8738
  redding-rancheria.com
@@ -8788,7 +8784,6 @@ accessesmeralda.com
8788
8784
  bcnv.org
8789
8785
  carson.org
8790
8786
  churchillcounty.org
8791
- cityoffallon.org
8792
8787
  cityoffernley.org
8793
8788
  cityofhenderson.com
8794
8789
  cityofnorthlasvegas.com
@@ -9175,7 +9170,6 @@ suffernvillage.com
9175
9170
  sylvanbeachny.com
9176
9171
  syrgov.net
9177
9172
  taghkanic.org
9178
- tannersvilleny.org
9179
9173
  tarrytowngov.com
9180
9174
  thurman-ny.com
9181
9175
  tiogacountyny.com
@@ -9396,7 +9390,6 @@ townofwheatland.org
9396
9390
  townofwillsboro.com
9397
9391
  townofwilmington.org
9398
9392
  townofwilton.com
9399
- townofwindham.com
9400
9393
  townofwoodbury.com
9401
9394
  townverona.org
9402
9395
  townwalworthny.com
@@ -9425,7 +9418,6 @@ villageofbarneveld.org
9425
9418
  villageofbath.org
9426
9419
  villageofbergen.com
9427
9420
  villageofbridgewater.org
9428
- villageofbrocton.com
9429
9421
  villageofbronxville.com
9430
9422
  villageofbuchanan.com
9431
9423
  villageofcapevincent.org
@@ -10277,7 +10269,6 @@ paehealth.com
10277
10269
  palmertonborough.com
10278
10270
  palmertwp.com
10279
10271
  paradisetownship.com
10280
- parkercity.org
10281
10272
  parkesburg.org
10282
10273
  parksideboro.com
10283
10274
  patientsafetyauthority.org
@@ -10590,7 +10581,6 @@ greatfallssc.net
10590
10581
  greenvillecounty.org
10591
10582
  hamptoncountysc.org
10592
10583
  hamptonsc.net
10593
- hartsvillesc.com
10594
10584
  horrycounty.org
10595
10585
  iop.net
10596
10586
  jamesislandsc.us
@@ -10622,7 +10612,6 @@ patriotspoint.org
10622
10612
  port-of-charleston.com
10623
10613
  portroyal.org
10624
10614
  prosperitysc.com
10625
- richburgsc.net
10626
10615
  richlandonline.com
10627
10616
  ridgespringsc.com
10628
10617
  santeecooper.com
@@ -10666,12 +10655,10 @@ townofbriarcliffe.us
10666
10655
  townofcampobello.com
10667
10656
  townofedistobeach.com
10668
10657
  townofgraycourt.net
10669
- townofheathsprings.org
10670
10658
  townofhollywood.org
10671
10659
  townofhoneapath.com
10672
10660
  townofirmosc.com
10673
10661
  townofiva.com
10674
- townofjamesislandsc.org
10675
10662
  townofkershaw.net
10676
10663
  townofmcclellanville-sc.net
10677
10664
  townofmccormicksc.org
@@ -11433,7 +11420,6 @@ yesvirginia.org
11433
11420
 
11434
11421
  // usagovVI
11435
11422
  gov.vi
11436
- governordejongh.com
11437
11423
  legvi.org
11438
11424
  nationalarchives.gov.vg
11439
11425
  usviber.org
@@ -15,7 +15,7 @@ class Gman
15
15
  end
16
16
 
17
17
  def domains
18
- list.values.flatten
18
+ list.values.flatten.sort.uniq
19
19
  end
20
20
 
21
21
  def count
data/lib/gman/importer.rb CHANGED
@@ -81,7 +81,9 @@ class Gman
81
81
  true
82
82
  end
83
83
 
84
+ # if RECONCILING=true, return the reason, rather than a bool and silence log output
84
85
  def reject(domain, reason)
86
+ return reason if ENV["RECONCILING"]
85
87
  logger.info "👎 `#{domain}`: #{reason}"
86
88
  false
87
89
  end
@@ -90,14 +92,14 @@ class Gman
90
92
  @current ||= DomainList.current
91
93
  end
92
94
 
93
- def import
95
+ def import(options={})
94
96
  logger.info "Current: #{Gman::DomainList.current.count} domains"
95
97
  logger.info "Adding: #{domains.count} domains"
96
98
 
97
99
  domains.list.each do |group, domains|
98
100
  domains.map! { |domain| Gman.new(domain).to_s }
99
101
  domains.map! { |domain| normalize_domain(domain) }
100
- domains.select! { |domain| valid_domain?(domain) }
102
+ domains.select! { |domain| valid_domain?(domain, options) }
101
103
  end
102
104
 
103
105
  logger.info "Filtered to: #{domains.count} domains"
@@ -142,7 +144,7 @@ class Gman
142
144
  end
143
145
 
144
146
  class Gman
145
- def self.import(hash)
146
- Gman::Importer.new(hash).import
147
+ def self.import(hash, options={})
148
+ Gman::Importer.new(hash).import(options)
147
149
  end
148
150
  end
data/lib/gman/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  class Gman
2
- VERSION = '5.0.6'
2
+ VERSION = '5.0.7'
3
3
  end
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Reconciles the USA.gov-maintained list of US domains with domains.txt
4
+ # to show domains listed in the USA.gov-maintained list that we reject and why
5
+ #
6
+ # Usage: script/reconcile-us
7
+
8
+ require './lib/gman/importer'
9
+ require 'yaml'
10
+
11
+ ENV["RECONCILING"] = "true"
12
+ blacklist = ["usagovQUASI"]
13
+ source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
14
+
15
+ data = open(source).read
16
+ data = data.split("__________________________________________________________________________")
17
+ data = data.last.strip
18
+ data = data.split(/\r?\n/).reject { |r| r.empty? }
19
+
20
+ domains = {}
21
+ group = ""
22
+ data.each do |row|
23
+ if row =~ /^\w/
24
+ group = row
25
+ domains[group] = []
26
+ else
27
+ domains[group].push row.sub("\.\t", "").strip
28
+ end
29
+ end
30
+
31
+ domains.reject! { |group,domain| blacklist.include?(group) }
32
+ importer = Gman::Importer.new(domains)
33
+
34
+ importer.logger.info "Starting with #{importer.domains.count} domains"
35
+
36
+ importer.domains.list.each do |group, domains|
37
+ domains.map! { |domain| Gman.new(domain).to_s }
38
+ domains.map! { |domain| importer.normalize_domain(domain) }
39
+ end
40
+
41
+ importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"
42
+
43
+ missing = {}
44
+ importer.domains.list.each do |group, usagovdomains|
45
+ next unless importer.current.list[group]
46
+ missing[group] = importer.current.list[group] - usagovdomains
47
+ end
48
+
49
+ missing.reject! { |key, value| value.empty? }
50
+
51
+ importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list"
52
+ puts "Here's the list of missing domains:"
53
+ puts YAML.dump(missing)
54
+
55
+ domains = importer.domains.domains
56
+ domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
57
+ domains.delete(true)
58
+ domains.delete(false)
59
+ domains.delete("locality")
60
+
61
+ importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains"
62
+
63
+ puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
64
+ puts YAML.dump(domains)
@@ -1,5 +1,17 @@
1
1
  #!/bin/sh
2
+ #
3
+ # Vendors the full list of US .gov domains from https://github.com/GSA/data
4
+ # Usage: script/vendor-gov-list
2
5
 
3
- DATE=2015-03-15
6
+ # Set up
7
+ mkdir tmp
8
+ rm -Rf tmp/gsa-data
4
9
 
5
- wget "https://raw.githubusercontent.com/GSA/data/gh-pages/dotgov-domains/$DATE-full.csv" -O config/vendor/dotgovs.csv
10
+ # Vendor the last file in the dotgov-domains folder that ends in `-full.csv`
11
+ git clone https://github.com/GSA/data tmp/gsa-data
12
+ pattern="tmp/gsa-data/dotgov-domains/*-full.csv"
13
+ files=( $pattern )
14
+ cp -f "${files[@]:(-1)}" config/vendor/dotgovs.csv
15
+
16
+ # Clean up
17
+ rm -Rf tmp/gsa-data
@@ -22,4 +22,6 @@ PublicSuffix::List.default.each do |rule|
22
22
  domains.push domain unless domain.nil? or domains.include? domain
23
23
  end
24
24
 
25
- Gman.import("non-us gov" => domains)
25
+ # Note: We want to skip resolution here, because a domain like `gov.sv` may be
26
+ # a valid TLD, not have any top-level sites, and we'd still want it listed
27
+ Gman.import({"non-us gov" => domains}, :skip_resolve => true)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gman
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.0.6
4
+ version: 5.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-06 00:00:00.000000000 Z
11
+ date: 2015-10-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: swot
@@ -230,6 +230,7 @@ files:
230
230
  - script/dedupe
231
231
  - script/profile
232
232
  - script/prune
233
+ - script/reconcile-us
233
234
  - script/release
234
235
  - script/vendor
235
236
  - script/vendor-federal-de