gman 5.0.6 → 5.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/config/domains.txt +5 -19
- data/lib/gman/domain_list.rb +1 -1
- data/lib/gman/importer.rb +6 -4
- data/lib/gman/version.rb +1 -1
- data/script/reconcile-us +64 -0
- data/script/vendor-gov-list +14 -2
- data/script/vendor-public-suffix +3 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5836f10b4bf15f1e7c91b197095ae1dff999bc04
|
4
|
+
data.tar.gz: 49032b8c90816c1fd6a52268f3f212520d07cd99
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ba1591c651effddcd8b654316b1ca8a325a705461cb2963cf1a908b1db92e3b2eb3cba0c5ef547103ccc9d7941d2256da5ec720160625b28b91a74a8965c8ff8
|
7
|
+
data.tar.gz: f9dea4ab2d0f97788c479c9f0e8aa5dc9fdda278038c5e201c867d60f6aaf6c2ffefb8c066acae63f1a9bdef535b29424afffbd31a6d2a78047bd50fa862c8ee
|
data/config/domains.txt
CHANGED
@@ -1201,7 +1201,6 @@ village.hazelton.bc.ca
|
|
1201
1201
|
village.longview.ab.ca
|
1202
1202
|
village.memramcook.com
|
1203
1203
|
village.merrickville-wolford.on.ca
|
1204
|
-
village.nobleford.ab.ca
|
1205
1204
|
village.stantoine.nb.ca
|
1206
1205
|
village.westport.on.ca
|
1207
1206
|
villagecharlo.com
|
@@ -1451,7 +1450,6 @@ riigikogu.ee
|
|
1451
1450
|
siseministeerium.ee
|
1452
1451
|
sm.ee
|
1453
1452
|
tja.ee
|
1454
|
-
valitus.ee
|
1455
1453
|
vm.ee
|
1456
1454
|
|
1457
1455
|
// Finland
|
@@ -2929,6 +2927,8 @@ gob.mx
|
|
2929
2927
|
gob.pa
|
2930
2928
|
gob.pe
|
2931
2929
|
gob.pk
|
2930
|
+
gob.sv
|
2931
|
+
gob.ve
|
2932
2932
|
gouv.bj
|
2933
2933
|
gouv.ci
|
2934
2934
|
gouv.fr
|
@@ -3062,6 +3062,7 @@ govt.nz
|
|
3062
3062
|
gub.uy
|
3063
3063
|
leg.br
|
3064
3064
|
lg.jp
|
3065
|
+
mil.tr
|
3065
3066
|
nic.in
|
3066
3067
|
onroerenderfgoed.be
|
3067
3068
|
|
@@ -4183,6 +4184,7 @@ jeffco.us
|
|
4183
4184
|
kitcarsoncounty.org
|
4184
4185
|
lakewood.org
|
4185
4186
|
littletongov.org
|
4187
|
+
metrodenvercfc.org
|
4186
4188
|
metromayors.org
|
4187
4189
|
minturn.org
|
4188
4190
|
mountain-village.co.us
|
@@ -4192,7 +4194,6 @@ orchardcityco.org
|
|
4192
4194
|
parachutecolorado.com
|
4193
4195
|
parkco.us
|
4194
4196
|
parkeronline.org
|
4195
|
-
peakcfc.com
|
4196
4197
|
plattevillegov.org
|
4197
4198
|
prowerscounty.net
|
4198
4199
|
pueblo.org
|
@@ -4322,6 +4323,7 @@ waterfordct.org
|
|
4322
4323
|
watertownct.org
|
4323
4324
|
west-hartford.com
|
4324
4325
|
westbrookct.us
|
4326
|
+
weston-ct.com
|
4325
4327
|
wethersfieldct.com
|
4326
4328
|
willingtonct.org
|
4327
4329
|
wiltonct.org
|
@@ -5041,7 +5043,6 @@ adeliowa.org
|
|
5041
5043
|
aftoniowa.com
|
5042
5044
|
akronia.org
|
5043
5045
|
altoona-iowa.com
|
5044
|
-
anitaiowa.com
|
5045
5046
|
aplingtonia.com
|
5046
5047
|
arnoldsparkcity.com
|
5047
5048
|
baxter-iowa.com
|
@@ -7759,7 +7760,6 @@ wrightschool.org
|
|
7759
7760
|
yadkinville.org
|
7760
7761
|
|
7761
7762
|
// usagovND
|
7762
|
-
ashley-nd.com
|
7763
7763
|
beulahnd.org
|
7764
7764
|
bismarck.org
|
7765
7765
|
bismarckairport.com
|
@@ -7839,7 +7839,6 @@ mcville.com
|
|
7839
7839
|
medorand.com
|
7840
7840
|
mercercountynd.com
|
7841
7841
|
michigannd.com
|
7842
|
-
milnornd.com
|
7843
7842
|
minnewaukan.com
|
7844
7843
|
minotnd.org
|
7845
7844
|
mohallndak.com
|
@@ -7869,7 +7868,6 @@ rutlandnd.com
|
|
7869
7868
|
sargentnd.com
|
7870
7869
|
sheridan.nd.us
|
7871
7870
|
sherwoodnd.com
|
7872
|
-
sourisnd.com
|
7873
7871
|
stanleynd.com
|
7874
7872
|
stantonnd.com
|
7875
7873
|
steelend.com
|
@@ -8154,7 +8152,6 @@ brigantinebeachnj.com
|
|
8154
8152
|
brooklawn-nj.com
|
8155
8153
|
brooklawn.us
|
8156
8154
|
buenaboro.org
|
8157
|
-
buenavistatownship.org
|
8158
8155
|
burlingtonnj.us
|
8159
8156
|
butlerborough.com
|
8160
8157
|
bwhnj.com
|
@@ -8736,7 +8733,6 @@ poncatribe-ne.org
|
|
8736
8733
|
portlions.net
|
8737
8734
|
potawatomi.org
|
8738
8735
|
powhatan.org
|
8739
|
-
prairieisland.org
|
8740
8736
|
pueblodecochiti.org
|
8741
8737
|
puyallup-tribe.com
|
8742
8738
|
redding-rancheria.com
|
@@ -8788,7 +8784,6 @@ accessesmeralda.com
|
|
8788
8784
|
bcnv.org
|
8789
8785
|
carson.org
|
8790
8786
|
churchillcounty.org
|
8791
|
-
cityoffallon.org
|
8792
8787
|
cityoffernley.org
|
8793
8788
|
cityofhenderson.com
|
8794
8789
|
cityofnorthlasvegas.com
|
@@ -9175,7 +9170,6 @@ suffernvillage.com
|
|
9175
9170
|
sylvanbeachny.com
|
9176
9171
|
syrgov.net
|
9177
9172
|
taghkanic.org
|
9178
|
-
tannersvilleny.org
|
9179
9173
|
tarrytowngov.com
|
9180
9174
|
thurman-ny.com
|
9181
9175
|
tiogacountyny.com
|
@@ -9396,7 +9390,6 @@ townofwheatland.org
|
|
9396
9390
|
townofwillsboro.com
|
9397
9391
|
townofwilmington.org
|
9398
9392
|
townofwilton.com
|
9399
|
-
townofwindham.com
|
9400
9393
|
townofwoodbury.com
|
9401
9394
|
townverona.org
|
9402
9395
|
townwalworthny.com
|
@@ -9425,7 +9418,6 @@ villageofbarneveld.org
|
|
9425
9418
|
villageofbath.org
|
9426
9419
|
villageofbergen.com
|
9427
9420
|
villageofbridgewater.org
|
9428
|
-
villageofbrocton.com
|
9429
9421
|
villageofbronxville.com
|
9430
9422
|
villageofbuchanan.com
|
9431
9423
|
villageofcapevincent.org
|
@@ -10277,7 +10269,6 @@ paehealth.com
|
|
10277
10269
|
palmertonborough.com
|
10278
10270
|
palmertwp.com
|
10279
10271
|
paradisetownship.com
|
10280
|
-
parkercity.org
|
10281
10272
|
parkesburg.org
|
10282
10273
|
parksideboro.com
|
10283
10274
|
patientsafetyauthority.org
|
@@ -10590,7 +10581,6 @@ greatfallssc.net
|
|
10590
10581
|
greenvillecounty.org
|
10591
10582
|
hamptoncountysc.org
|
10592
10583
|
hamptonsc.net
|
10593
|
-
hartsvillesc.com
|
10594
10584
|
horrycounty.org
|
10595
10585
|
iop.net
|
10596
10586
|
jamesislandsc.us
|
@@ -10622,7 +10612,6 @@ patriotspoint.org
|
|
10622
10612
|
port-of-charleston.com
|
10623
10613
|
portroyal.org
|
10624
10614
|
prosperitysc.com
|
10625
|
-
richburgsc.net
|
10626
10615
|
richlandonline.com
|
10627
10616
|
ridgespringsc.com
|
10628
10617
|
santeecooper.com
|
@@ -10666,12 +10655,10 @@ townofbriarcliffe.us
|
|
10666
10655
|
townofcampobello.com
|
10667
10656
|
townofedistobeach.com
|
10668
10657
|
townofgraycourt.net
|
10669
|
-
townofheathsprings.org
|
10670
10658
|
townofhollywood.org
|
10671
10659
|
townofhoneapath.com
|
10672
10660
|
townofirmosc.com
|
10673
10661
|
townofiva.com
|
10674
|
-
townofjamesislandsc.org
|
10675
10662
|
townofkershaw.net
|
10676
10663
|
townofmcclellanville-sc.net
|
10677
10664
|
townofmccormicksc.org
|
@@ -11433,7 +11420,6 @@ yesvirginia.org
|
|
11433
11420
|
|
11434
11421
|
// usagovVI
|
11435
11422
|
gov.vi
|
11436
|
-
governordejongh.com
|
11437
11423
|
legvi.org
|
11438
11424
|
nationalarchives.gov.vg
|
11439
11425
|
usviber.org
|
data/lib/gman/domain_list.rb
CHANGED
data/lib/gman/importer.rb
CHANGED
@@ -81,7 +81,9 @@ class Gman
|
|
81
81
|
true
|
82
82
|
end
|
83
83
|
|
84
|
+
# if RECONCILING=true, return the reason, rather than a bool and silence log output
|
84
85
|
def reject(domain, reason)
|
86
|
+
return reason if ENV["RECONCILING"]
|
85
87
|
logger.info "👎 `#{domain}`: #{reason}"
|
86
88
|
false
|
87
89
|
end
|
@@ -90,14 +92,14 @@ class Gman
|
|
90
92
|
@current ||= DomainList.current
|
91
93
|
end
|
92
94
|
|
93
|
-
def import
|
95
|
+
def import(options={})
|
94
96
|
logger.info "Current: #{Gman::DomainList.current.count} domains"
|
95
97
|
logger.info "Adding: #{domains.count} domains"
|
96
98
|
|
97
99
|
domains.list.each do |group, domains|
|
98
100
|
domains.map! { |domain| Gman.new(domain).to_s }
|
99
101
|
domains.map! { |domain| normalize_domain(domain) }
|
100
|
-
domains.select! { |domain| valid_domain?(domain) }
|
102
|
+
domains.select! { |domain| valid_domain?(domain, options) }
|
101
103
|
end
|
102
104
|
|
103
105
|
logger.info "Filtered to: #{domains.count} domains"
|
@@ -142,7 +144,7 @@ class Gman
|
|
142
144
|
end
|
143
145
|
|
144
146
|
class Gman
|
145
|
-
def self.import(hash)
|
146
|
-
Gman::Importer.new(hash).import
|
147
|
+
def self.import(hash, options={})
|
148
|
+
Gman::Importer.new(hash).import(options)
|
147
149
|
end
|
148
150
|
end
|
data/lib/gman/version.rb
CHANGED
data/script/reconcile-us
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Reconciles the USA.gov-maintained list of US domains with domains.txt
|
4
|
+
# to show domains listed in the USA.gov-maintained list that we reject and why
|
5
|
+
#
|
6
|
+
# Usage: script/reconcile-us
|
7
|
+
|
8
|
+
require './lib/gman/importer'
|
9
|
+
require 'yaml'
|
10
|
+
|
11
|
+
ENV["RECONCILING"] = "true"
|
12
|
+
blacklist = ["usagovQUASI"]
|
13
|
+
source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
|
14
|
+
|
15
|
+
data = open(source).read
|
16
|
+
data = data.split("__________________________________________________________________________")
|
17
|
+
data = data.last.strip
|
18
|
+
data = data.split(/\r?\n/).reject { |r| r.empty? }
|
19
|
+
|
20
|
+
domains = {}
|
21
|
+
group = ""
|
22
|
+
data.each do |row|
|
23
|
+
if row =~ /^\w/
|
24
|
+
group = row
|
25
|
+
domains[group] = []
|
26
|
+
else
|
27
|
+
domains[group].push row.sub("\.\t", "").strip
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
domains.reject! { |group,domain| blacklist.include?(group) }
|
32
|
+
importer = Gman::Importer.new(domains)
|
33
|
+
|
34
|
+
importer.logger.info "Starting with #{importer.domains.count} domains"
|
35
|
+
|
36
|
+
importer.domains.list.each do |group, domains|
|
37
|
+
domains.map! { |domain| Gman.new(domain).to_s }
|
38
|
+
domains.map! { |domain| importer.normalize_domain(domain) }
|
39
|
+
end
|
40
|
+
|
41
|
+
importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"
|
42
|
+
|
43
|
+
missing = {}
|
44
|
+
importer.domains.list.each do |group, usagovdomains|
|
45
|
+
next unless importer.current.list[group]
|
46
|
+
missing[group] = importer.current.list[group] - usagovdomains
|
47
|
+
end
|
48
|
+
|
49
|
+
missing.reject! { |key, value| value.empty? }
|
50
|
+
|
51
|
+
importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list"
|
52
|
+
puts "Here's the list of missing domains:"
|
53
|
+
puts YAML.dump(missing)
|
54
|
+
|
55
|
+
domains = importer.domains.domains
|
56
|
+
domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
|
57
|
+
domains.delete(true)
|
58
|
+
domains.delete(false)
|
59
|
+
domains.delete("locality")
|
60
|
+
|
61
|
+
importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains"
|
62
|
+
|
63
|
+
puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
|
64
|
+
puts YAML.dump(domains)
|
data/script/vendor-gov-list
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
#!/bin/sh
|
2
|
+
#
|
3
|
+
# Vendors the full list of US .gov domains from https://github.com/GSA/data
|
4
|
+
# Usage: script/vendor-gov-list
|
2
5
|
|
3
|
-
|
6
|
+
# Set up
|
7
|
+
mkdir tmp
|
8
|
+
rm -Rf tmp/gsa-data
|
4
9
|
|
5
|
-
|
10
|
+
# Vendor the last file in the dotgov-domains folder that ends in `-full.csv`
|
11
|
+
git clone https://github.com/GSA/data tmp/gsa-data
|
12
|
+
pattern="tmp/gsa-data/dotgov-domains/*-full.csv"
|
13
|
+
files=( $pattern )
|
14
|
+
cp -f "${files[@]:(-1)}" config/vendor/dotgovs.csv
|
15
|
+
|
16
|
+
# Clean up
|
17
|
+
rm -Rf tmp/gsa-data
|
data/script/vendor-public-suffix
CHANGED
@@ -22,4 +22,6 @@ PublicSuffix::List.default.each do |rule|
|
|
22
22
|
domains.push domain unless domain.nil? or domains.include? domain
|
23
23
|
end
|
24
24
|
|
25
|
-
|
25
|
+
# Note: We want to skip resolution here, because a domain like `gov.sv` may be
|
26
|
+
# a valid TLD, not have any top-level sites, and we'd still want it listed
|
27
|
+
Gman.import({"non-us gov" => domains}, :skip_resolve => true)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gman
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.0.6
|
4
|
+
version: 5.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: swot
|
@@ -230,6 +230,7 @@ files:
|
|
230
230
|
- script/dedupe
|
231
231
|
- script/profile
|
232
232
|
- script/prune
|
233
|
+
- script/reconcile-us
|
233
234
|
- script/release
|
234
235
|
- script/vendor
|
235
236
|
- script/vendor-federal-de
|