gman 5.0.6 → 5.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/config/domains.txt +5 -19
- data/lib/gman/domain_list.rb +1 -1
- data/lib/gman/importer.rb +6 -4
- data/lib/gman/version.rb +1 -1
- data/script/reconcile-us +64 -0
- data/script/vendor-gov-list +14 -2
- data/script/vendor-public-suffix +3 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5836f10b4bf15f1e7c91b197095ae1dff999bc04
|
4
|
+
data.tar.gz: 49032b8c90816c1fd6a52268f3f212520d07cd99
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ba1591c651effddcd8b654316b1ca8a325a705461cb2963cf1a908b1db92e3b2eb3cba0c5ef547103ccc9d7941d2256da5ec720160625b28b91a74a8965c8ff8
|
7
|
+
data.tar.gz: f9dea4ab2d0f97788c479c9f0e8aa5dc9fdda278038c5e201c867d60f6aaf6c2ffefb8c066acae63f1a9bdef535b29424afffbd31a6d2a78047bd50fa862c8ee
|
data/config/domains.txt
CHANGED
@@ -1201,7 +1201,6 @@ village.hazelton.bc.ca
|
|
1201
1201
|
village.longview.ab.ca
|
1202
1202
|
village.memramcook.com
|
1203
1203
|
village.merrickville-wolford.on.ca
|
1204
|
-
village.nobleford.ab.ca
|
1205
1204
|
village.stantoine.nb.ca
|
1206
1205
|
village.westport.on.ca
|
1207
1206
|
villagecharlo.com
|
@@ -1451,7 +1450,6 @@ riigikogu.ee
|
|
1451
1450
|
siseministeerium.ee
|
1452
1451
|
sm.ee
|
1453
1452
|
tja.ee
|
1454
|
-
valitus.ee
|
1455
1453
|
vm.ee
|
1456
1454
|
|
1457
1455
|
// Finland
|
@@ -2929,6 +2927,8 @@ gob.mx
|
|
2929
2927
|
gob.pa
|
2930
2928
|
gob.pe
|
2931
2929
|
gob.pk
|
2930
|
+
gob.sv
|
2931
|
+
gob.ve
|
2932
2932
|
gouv.bj
|
2933
2933
|
gouv.ci
|
2934
2934
|
gouv.fr
|
@@ -3062,6 +3062,7 @@ govt.nz
|
|
3062
3062
|
gub.uy
|
3063
3063
|
leg.br
|
3064
3064
|
lg.jp
|
3065
|
+
mil.tr
|
3065
3066
|
nic.in
|
3066
3067
|
onroerenderfgoed.be
|
3067
3068
|
|
@@ -4183,6 +4184,7 @@ jeffco.us
|
|
4183
4184
|
kitcarsoncounty.org
|
4184
4185
|
lakewood.org
|
4185
4186
|
littletongov.org
|
4187
|
+
metrodenvercfc.org
|
4186
4188
|
metromayors.org
|
4187
4189
|
minturn.org
|
4188
4190
|
mountain-village.co.us
|
@@ -4192,7 +4194,6 @@ orchardcityco.org
|
|
4192
4194
|
parachutecolorado.com
|
4193
4195
|
parkco.us
|
4194
4196
|
parkeronline.org
|
4195
|
-
peakcfc.com
|
4196
4197
|
plattevillegov.org
|
4197
4198
|
prowerscounty.net
|
4198
4199
|
pueblo.org
|
@@ -4322,6 +4323,7 @@ waterfordct.org
|
|
4322
4323
|
watertownct.org
|
4323
4324
|
west-hartford.com
|
4324
4325
|
westbrookct.us
|
4326
|
+
weston-ct.com
|
4325
4327
|
wethersfieldct.com
|
4326
4328
|
willingtonct.org
|
4327
4329
|
wiltonct.org
|
@@ -5041,7 +5043,6 @@ adeliowa.org
|
|
5041
5043
|
aftoniowa.com
|
5042
5044
|
akronia.org
|
5043
5045
|
altoona-iowa.com
|
5044
|
-
anitaiowa.com
|
5045
5046
|
aplingtonia.com
|
5046
5047
|
arnoldsparkcity.com
|
5047
5048
|
baxter-iowa.com
|
@@ -7759,7 +7760,6 @@ wrightschool.org
|
|
7759
7760
|
yadkinville.org
|
7760
7761
|
|
7761
7762
|
// usagovND
|
7762
|
-
ashley-nd.com
|
7763
7763
|
beulahnd.org
|
7764
7764
|
bismarck.org
|
7765
7765
|
bismarckairport.com
|
@@ -7839,7 +7839,6 @@ mcville.com
|
|
7839
7839
|
medorand.com
|
7840
7840
|
mercercountynd.com
|
7841
7841
|
michigannd.com
|
7842
|
-
milnornd.com
|
7843
7842
|
minnewaukan.com
|
7844
7843
|
minotnd.org
|
7845
7844
|
mohallndak.com
|
@@ -7869,7 +7868,6 @@ rutlandnd.com
|
|
7869
7868
|
sargentnd.com
|
7870
7869
|
sheridan.nd.us
|
7871
7870
|
sherwoodnd.com
|
7872
|
-
sourisnd.com
|
7873
7871
|
stanleynd.com
|
7874
7872
|
stantonnd.com
|
7875
7873
|
steelend.com
|
@@ -8154,7 +8152,6 @@ brigantinebeachnj.com
|
|
8154
8152
|
brooklawn-nj.com
|
8155
8153
|
brooklawn.us
|
8156
8154
|
buenaboro.org
|
8157
|
-
buenavistatownship.org
|
8158
8155
|
burlingtonnj.us
|
8159
8156
|
butlerborough.com
|
8160
8157
|
bwhnj.com
|
@@ -8736,7 +8733,6 @@ poncatribe-ne.org
|
|
8736
8733
|
portlions.net
|
8737
8734
|
potawatomi.org
|
8738
8735
|
powhatan.org
|
8739
|
-
prairieisland.org
|
8740
8736
|
pueblodecochiti.org
|
8741
8737
|
puyallup-tribe.com
|
8742
8738
|
redding-rancheria.com
|
@@ -8788,7 +8784,6 @@ accessesmeralda.com
|
|
8788
8784
|
bcnv.org
|
8789
8785
|
carson.org
|
8790
8786
|
churchillcounty.org
|
8791
|
-
cityoffallon.org
|
8792
8787
|
cityoffernley.org
|
8793
8788
|
cityofhenderson.com
|
8794
8789
|
cityofnorthlasvegas.com
|
@@ -9175,7 +9170,6 @@ suffernvillage.com
|
|
9175
9170
|
sylvanbeachny.com
|
9176
9171
|
syrgov.net
|
9177
9172
|
taghkanic.org
|
9178
|
-
tannersvilleny.org
|
9179
9173
|
tarrytowngov.com
|
9180
9174
|
thurman-ny.com
|
9181
9175
|
tiogacountyny.com
|
@@ -9396,7 +9390,6 @@ townofwheatland.org
|
|
9396
9390
|
townofwillsboro.com
|
9397
9391
|
townofwilmington.org
|
9398
9392
|
townofwilton.com
|
9399
|
-
townofwindham.com
|
9400
9393
|
townofwoodbury.com
|
9401
9394
|
townverona.org
|
9402
9395
|
townwalworthny.com
|
@@ -9425,7 +9418,6 @@ villageofbarneveld.org
|
|
9425
9418
|
villageofbath.org
|
9426
9419
|
villageofbergen.com
|
9427
9420
|
villageofbridgewater.org
|
9428
|
-
villageofbrocton.com
|
9429
9421
|
villageofbronxville.com
|
9430
9422
|
villageofbuchanan.com
|
9431
9423
|
villageofcapevincent.org
|
@@ -10277,7 +10269,6 @@ paehealth.com
|
|
10277
10269
|
palmertonborough.com
|
10278
10270
|
palmertwp.com
|
10279
10271
|
paradisetownship.com
|
10280
|
-
parkercity.org
|
10281
10272
|
parkesburg.org
|
10282
10273
|
parksideboro.com
|
10283
10274
|
patientsafetyauthority.org
|
@@ -10590,7 +10581,6 @@ greatfallssc.net
|
|
10590
10581
|
greenvillecounty.org
|
10591
10582
|
hamptoncountysc.org
|
10592
10583
|
hamptonsc.net
|
10593
|
-
hartsvillesc.com
|
10594
10584
|
horrycounty.org
|
10595
10585
|
iop.net
|
10596
10586
|
jamesislandsc.us
|
@@ -10622,7 +10612,6 @@ patriotspoint.org
|
|
10622
10612
|
port-of-charleston.com
|
10623
10613
|
portroyal.org
|
10624
10614
|
prosperitysc.com
|
10625
|
-
richburgsc.net
|
10626
10615
|
richlandonline.com
|
10627
10616
|
ridgespringsc.com
|
10628
10617
|
santeecooper.com
|
@@ -10666,12 +10655,10 @@ townofbriarcliffe.us
|
|
10666
10655
|
townofcampobello.com
|
10667
10656
|
townofedistobeach.com
|
10668
10657
|
townofgraycourt.net
|
10669
|
-
townofheathsprings.org
|
10670
10658
|
townofhollywood.org
|
10671
10659
|
townofhoneapath.com
|
10672
10660
|
townofirmosc.com
|
10673
10661
|
townofiva.com
|
10674
|
-
townofjamesislandsc.org
|
10675
10662
|
townofkershaw.net
|
10676
10663
|
townofmcclellanville-sc.net
|
10677
10664
|
townofmccormicksc.org
|
@@ -11433,7 +11420,6 @@ yesvirginia.org
|
|
11433
11420
|
|
11434
11421
|
// usagovVI
|
11435
11422
|
gov.vi
|
11436
|
-
governordejongh.com
|
11437
11423
|
legvi.org
|
11438
11424
|
nationalarchives.gov.vg
|
11439
11425
|
usviber.org
|
data/lib/gman/domain_list.rb
CHANGED
data/lib/gman/importer.rb
CHANGED
@@ -81,7 +81,9 @@ class Gman
|
|
81
81
|
true
|
82
82
|
end
|
83
83
|
|
84
|
+
# if RECONCILING=true, return the reason, rather than a bool and silence log output
|
84
85
|
def reject(domain, reason)
|
86
|
+
return reason if ENV["RECONCILING"]
|
85
87
|
logger.info "👎 `#{domain}`: #{reason}"
|
86
88
|
false
|
87
89
|
end
|
@@ -90,14 +92,14 @@ class Gman
|
|
90
92
|
@current ||= DomainList.current
|
91
93
|
end
|
92
94
|
|
93
|
-
def import
|
95
|
+
def import(options={})
|
94
96
|
logger.info "Current: #{Gman::DomainList.current.count} domains"
|
95
97
|
logger.info "Adding: #{domains.count} domains"
|
96
98
|
|
97
99
|
domains.list.each do |group, domains|
|
98
100
|
domains.map! { |domain| Gman.new(domain).to_s }
|
99
101
|
domains.map! { |domain| normalize_domain(domain) }
|
100
|
-
domains.select! { |domain| valid_domain?(domain) }
|
102
|
+
domains.select! { |domain| valid_domain?(domain, options) }
|
101
103
|
end
|
102
104
|
|
103
105
|
logger.info "Filtered to: #{domains.count} domains"
|
@@ -142,7 +144,7 @@ class Gman
|
|
142
144
|
end
|
143
145
|
|
144
146
|
class Gman
|
145
|
-
def self.import(hash)
|
146
|
-
Gman::Importer.new(hash).import
|
147
|
+
def self.import(hash, options={})
|
148
|
+
Gman::Importer.new(hash).import(options)
|
147
149
|
end
|
148
150
|
end
|
data/lib/gman/version.rb
CHANGED
data/script/reconcile-us
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Reconciles the USA.gov-maintained list of US domains with domains.txt
|
4
|
+
# to show domains listed in the USA.gov-maintained list that we reject and why
|
5
|
+
#
|
6
|
+
# Usage: script/reconcile-us
|
7
|
+
|
8
|
+
require './lib/gman/importer'
|
9
|
+
require 'yaml'
|
10
|
+
|
11
|
+
ENV["RECONCILING"] = "true"
|
12
|
+
blacklist = ["usagovQUASI"]
|
13
|
+
source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
|
14
|
+
|
15
|
+
data = open(source).read
|
16
|
+
data = data.split("__________________________________________________________________________")
|
17
|
+
data = data.last.strip
|
18
|
+
data = data.split(/\r?\n/).reject { |r| r.empty? }
|
19
|
+
|
20
|
+
domains = {}
|
21
|
+
group = ""
|
22
|
+
data.each do |row|
|
23
|
+
if row =~ /^\w/
|
24
|
+
group = row
|
25
|
+
domains[group] = []
|
26
|
+
else
|
27
|
+
domains[group].push row.sub("\.\t", "").strip
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
domains.reject! { |group,domain| blacklist.include?(group) }
|
32
|
+
importer = Gman::Importer.new(domains)
|
33
|
+
|
34
|
+
importer.logger.info "Starting with #{importer.domains.count} domains"
|
35
|
+
|
36
|
+
importer.domains.list.each do |group, domains|
|
37
|
+
domains.map! { |domain| Gman.new(domain).to_s }
|
38
|
+
domains.map! { |domain| importer.normalize_domain(domain) }
|
39
|
+
end
|
40
|
+
|
41
|
+
importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"
|
42
|
+
|
43
|
+
missing = {}
|
44
|
+
importer.domains.list.each do |group, usagovdomains|
|
45
|
+
next unless importer.current.list[group]
|
46
|
+
missing[group] = importer.current.list[group] - usagovdomains
|
47
|
+
end
|
48
|
+
|
49
|
+
missing.reject! { |key, value| value.empty? }
|
50
|
+
|
51
|
+
importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list"
|
52
|
+
puts "Here's the list of missing domains:"
|
53
|
+
puts YAML.dump(missing)
|
54
|
+
|
55
|
+
domains = importer.domains.domains
|
56
|
+
domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
|
57
|
+
domains.delete(true)
|
58
|
+
domains.delete(false)
|
59
|
+
domains.delete("locality")
|
60
|
+
|
61
|
+
importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains"
|
62
|
+
|
63
|
+
puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
|
64
|
+
puts YAML.dump(domains)
|
data/script/vendor-gov-list
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
#!/bin/sh
|
2
|
+
#
|
3
|
+
# Vendors the full list of US .gov domains from https://github.com/GSA/data
|
4
|
+
# Usage: script/vendor-gov-list
|
2
5
|
|
3
|
-
|
6
|
+
# Set up
|
7
|
+
mkdir tmp
|
8
|
+
rm -Rf tmp/gsa-data
|
4
9
|
|
5
|
-
|
10
|
+
# Vendor the last file in the dotgov-domains folder that ends in `-full.csv`
|
11
|
+
git clone https://github.com/GSA/data tmp/gsa-data
|
12
|
+
pattern="tmp/gsa-data/dotgov-domains/*-full.csv"
|
13
|
+
files=( $pattern )
|
14
|
+
cp -f "${files[@]:(-1)}" config/vendor/dotgovs.csv
|
15
|
+
|
16
|
+
# Clean up
|
17
|
+
rm -Rf tmp/gsa-data
|
data/script/vendor-public-suffix
CHANGED
@@ -22,4 +22,6 @@ PublicSuffix::List.default.each do |rule|
|
|
22
22
|
domains.push domain unless domain.nil? or domains.include? domain
|
23
23
|
end
|
24
24
|
|
25
|
-
|
25
|
+
# Note: We want to skip resolution here, because a domain like `gov.sv` may be
|
26
|
+
# a valid TLD, not have any top-level sites, and we'd still want it listed
|
27
|
+
Gman.import({"non-us gov" => domains}, :skip_resolve => true)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gman
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.0.6
|
4
|
+
version: 5.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: swot
|
@@ -230,6 +230,7 @@ files:
|
|
230
230
|
- script/dedupe
|
231
231
|
- script/profile
|
232
232
|
- script/prune
|
233
|
+
- script/reconcile-us
|
233
234
|
- script/release
|
234
235
|
- script/vendor
|
235
236
|
- script/vendor-federal-de
|