gman 4.6.5 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +0 -7
- data/config/domains.txt +283 -367
- data/gman.gemspec +1 -1
- data/lib/gman/domain_list.rb +78 -0
- data/lib/gman/importer.rb +135 -0
- data/script/alphabetize +4 -34
- data/script/cibuild +0 -1
- data/script/dedupe +10 -27
- data/script/prune +5 -2
- data/script/vendor-federal-de +14 -0
- data/script/vendor-municipal-de +23 -0
- data/script/vendor-nl +9 -4
- data/script/{build → vendor-public-suffix} +4 -6
- data/script/vendor-se +4 -38
- data/script/vendor-us +7 -67
- data/test/helper.rb +4 -2
- data/test/test_gman_country_codes.rb +3 -3
- data/test/test_gman_domains.rb +28 -0
- data/test/test_gman_identifier.rb +0 -1
- metadata +9 -8
- data/lib/gman/parser.rb +0 -59
- data/script/state-domains +0 -38
- data/script/vendor-de +0 -44
- data/test/test_domains.rb +0 -54
data/gman.gemspec
CHANGED
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
|
|
2
2
|
s.name = "gman"
|
3
3
|
s.summary = "Check if a given domain or email address belong to a governemnt entity"
|
4
4
|
s.description = "A ruby gem to check if the owner of a given email address is working for THE MAN."
|
5
|
-
s.version = '4.6.5'
|
5
|
+
s.version = '4.7.0'
|
6
6
|
s.authors = ["Ben Balter"]
|
7
7
|
s.email = "ben.balter@github.com"
|
8
8
|
s.homepage = "https://github.com/benbalter/gman"
|
@@ -0,0 +1,78 @@
|
|
1
|
+
class Gman < NaughtyOrNice
  # In-memory representation of a public-suffix formatted domain list.
  #
  # The list is a Hash of group name => Array of domains, where the group
  # name is the text of the `// comment` line that precedes the domains.
  class DomainList

    attr_accessor :list
    alias_method :to_h, :list

    # Matches a `//` comment line and captures the text following the slashes
    COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i

    # list - a Hash in the form of group => [domain1, domain2...]
    def initialize(list)
      @list = list
    end

    # Returns an Array of the group names
    def groups
      list.keys
    end

    # Returns a flat Array of every domain in every group
    def domains
      list.values.flatten
    end

    # Returns the total number of domains (duplicates are counted)
    def count
      domains.count
    end

    # Sorts the groups case-insensitively by name, then sorts and
    # de-duplicates the domains of each group in place
    def alphabetize
      @list = @list.sort_by { |group, _domains| group.downcase }.to_h
      @list.each { |_group, domains| domains.sort!.uniq! }
    end

    # Writes the list, in public suffix format, to Gman.list_path
    def write
      File.write(Gman.list_path, to_public_suffix)
    end

    # Returns the list as a public suffix formatted String: one section per
    # group (sorted case-insensitively), each a `// group` header followed by
    # its domains, with a blank line between sections
    def to_public_suffix
      sections = list.sort_by { |group, _domains| group.downcase }.map do |group, domains|
        # Domains parsed before any comment line live under the "" group and
        # get no header; they must still be separated from the next section
        header = group.to_s.empty? ? "" : "// #{group}\n"
        header + domains.join("\n")
      end
      sections.join("\n\n")
    end

    # Returns a DomainList parsed from the file at Gman.list_path
    def self.current
      # File.read closes the file handle once the contents are read
      DomainList.from_public_suffix(File.read(Gman.list_path))
    end

    # Returns a DomainList parsed from a public suffix formatted String
    # (CRLF and bare CR line endings are normalized first)
    def self.from_public_suffix(string)
      lines = string.gsub(/\r\n?/, "\n").split("\n")
      DomainList.new(array_to_hash(lines))
    end

    # Given an array of comments/domains in public suffix format
    # Converts to a hash in the form of group => [domain1, domain2...]
    #
    # Note: this is a public class method. A bare `private` does not apply to
    # `def self.` definitions, so none is used here.
    def self.array_to_hash(domains)
      group = ""
      domain_hash = {}
      domains.each do |line|
        next if line.empty?
        if (match = COMMENT_REGEX.match(line))
          group = match[1]
        else
          (domain_hash[group] ||= []) << line.downcase
        end
      end
      domain_hash
    end
  end
end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
# Utility functions for parsing and manipulating public-suffix formatted domain lists
|
2
|
+
# Only used in development and not loaded by default
|
3
|
+
require 'yaml'
|
4
|
+
require 'open-uri'
|
5
|
+
require 'net/dns'
|
6
|
+
require 'net/dns/resolver'
|
7
|
+
require_relative '../gman'
|
8
|
+
require_relative './domain_list'
|
9
|
+
|
10
|
+
# Logger is part of the standard library but is not loaded automatically
require 'logger'

class Gman < NaughtyOrNice
  # Filters a hash of group => domains and merges the surviving domains
  # into the current domain list on disk.
  class Importer

    attr_accessor :domains

    # Known false positives from vendored lists
    BLACKLIST = %w[
      business.centurytel.net
      chesnee.net
      citlink.net
      egovlink.com
      emainehosting.com
      fantasyspringsresort.com
      frontiernet.net
      hartford-hwp.com
      homepages.sover.net
      htc.net
      koasekabenaki.org
      kstrom.net
      laworkforce.net
      mississippistateparks.reserveamerica.com
      mylocalgov.com
      myweb.cebridge.net
      ncstars.org
      neagrelations.org
      qis.net
      rootsweb.com
      showcase.netins.net
      valuworld.com
      wctc.net
      webconnections.net
      webpages.charter.net
    ].freeze

    # domains - a Hash in the form of group => [domain1, domain2...]
    def initialize(domains)
      @domains = DomainList.new(domains)
    end

    # Returns the memoized Logger, which writes to STDOUT
    def logger
      @logger ||= Logger.new(STDOUT)
    end

    # Lowercases and strips the domain, then removes a leading "www." and a
    # trailing slash. The dot is escaped so that only a literal "www." prefix
    # is removed (e.g. "www2.example.gov" is left untouched).
    def normalize_domain(domain)
      domain.to_s.downcase.strip.sub(/\Awww\./, "").sub(/\/\z/, "")
    end

    # Returns true if the domain passes every check, otherwise logs the
    # reason and returns false.
    #
    # options - :skip_dupe    - don't reject domains already in the current list
    #           :skip_resolve - don't reject domains that fail the DNS check
    def valid_domain?(domain, options={})
      return false if domain.empty?
      return reject(domain, "home. regex")  if domain =~ /^home\./
      return reject(domain, "user. regex")  if domain =~ /^users?\./
      return reject(domain, "sites. regex") if domain =~ /^sites?\./
      return reject(domain, "weebly")       if domain =~ /weebly\.com$/
      return reject(domain, "govoffice")    if domain =~ /govoffice\d?\.com$/
      return reject(domain, "homestead")    if domain =~ /homestead\.com$/
      return reject(domain, "wix.com")      if domain =~ /wix\.com$/
      return reject(domain, "locality")     if domain =~ Gman::LOCALITY_REGEX
      return reject(domain, "blacklist")    if BLACKLIST.include?(domain)
      return reject(domain, "duplicate")    if !options[:skip_dupe] && current.domains.include?(domain)
      return reject(domain, "invalid")      unless PublicSuffix.valid?(".#{domain}")
      return reject(domain, "academic")     if Swot::is_academic?(domain)
      # The DNS lookup is the only network-bound check, so it runs last
      return reject(domain, "unresolvable") if !options[:skip_resolve] && !domain_resolves?(domain)
      true
    end

    # Logs why the domain was rejected and returns false, so callers can
    # `return reject(...)`
    def reject(domain, reason)
      logger.info "👎 `#{domain}`: #{reason}"
      false
    end

    # Returns the memoized DomainList read from disk
    def current
      @current ||= DomainList.current
    end

    # Normalizes and filters the incoming domains, merges the survivors into
    # the current list group by group, and writes the result to disk.
    # Exits the process with status 0 when nothing survives the filters.
    def import
      logger.info "Current: #{current.count} domains"
      logger.info "Adding: #{domains.count} domains"

      # Block parameters are named so they don't shadow the `domains` accessor
      domains.list.each do |_group, candidates|
        candidates.map!    { |domain| normalize_domain(Gman.new(domain).to_s) }
        candidates.select! { |domain| valid_domain?(domain) }
      end

      logger.info "Filtered to: #{domains.count} domains"

      if domains.count == 0
        logger.info "Nothing to add. Aborting"
        exit 0
      end

      domains.list.each do |group, additions|
        merged = (current.list[group] ||= [])
        merged.concat additions
        merged.sort! # Alphabetize
        merged.uniq! # Ensure uniqueness
      end

      logger.info "New: #{current.count} domains"

      logger.info "Writing to disk..."
      current.write
      logger.info "Fin."
    end

    # Returns the memoized DNS resolver, pointed at the 8.8.8.8 and 8.8.4.4
    # nameservers
    def resolver
      @resolver ||= begin
        resolver = Net::DNS::Resolver.new
        resolver.nameservers = ["8.8.8.8","8.8.4.4"]
        resolver
      end
    end

    # Verifies that the given domain resolves: true if the default lookup,
    # an NS lookup, or an MX lookup returns at least one answer
    def domain_resolves?(domain)
      resolver.search(domain).header.anCount > 0 ||
      resolver.search(domain, Net::DNS::NS).header.anCount > 0 ||
      resolver.search(domain, Net::DNS::MX).header.anCount > 0
    end
  end
end
|
130
|
+
|
131
|
+
class Gman < NaughtyOrNice
  # Convenience entry point: builds an Importer for the given hash of
  # group => [domain1, domain2...] and runs the import.
  def self.import(hash)
    importer = Gman::Importer.new(hash)
    importer.import
  end
end
|
data/script/alphabetize
CHANGED
@@ -5,38 +5,8 @@
|
|
5
5
|
# usage: script/alphabetize
|
6
6
|
|
7
7
|
require_relative "../lib/gman"
|
8
|
+
require_relative "../lib/gman/importer"
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
domains = domains.split("\n")
|
13
|
-
|
14
|
-
# Split list into grouped hash
|
15
|
-
group = ""
|
16
|
-
domain_hash = {}
|
17
|
-
domains.each do |line|
|
18
|
-
next if line.empty?
|
19
|
-
if match = /\/\/[\/\s]*(.*)$/i.match(line)
|
20
|
-
group = match[1]
|
21
|
-
else
|
22
|
-
domain_hash[group] = [] if domain_hash[group].nil?
|
23
|
-
domain_hash[group].push line.downcase
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
# Sort by groups
|
28
|
-
domain_hash = domain_hash.sort_by { |k,v| k.downcase }.to_h
|
29
|
-
|
30
|
-
# Sort within groups
|
31
|
-
domain_hash.each do |group, domains|
|
32
|
-
domain_hash[group].sort!
|
33
|
-
end
|
34
|
-
|
35
|
-
output = ""
|
36
|
-
domain_hash.each do |group, domains|
|
37
|
-
output << "// #{group}\n"
|
38
|
-
output << domains.join("\n")
|
39
|
-
output << "\n\n"
|
40
|
-
end
|
41
|
-
|
42
|
-
File.write Gman.list_path, output.strip
|
10
|
+
# Load the current list, alphabetize it, and write it back out
Gman::DomainList.current.tap(&:alphabetize).write
|
data/script/cibuild
CHANGED
data/script/dedupe
CHANGED
@@ -3,36 +3,19 @@
|
|
3
3
|
require 'yaml'
|
4
4
|
require 'open-uri'
|
5
5
|
require './lib/gman'
|
6
|
-
require './lib/gman/
|
6
|
+
require './lib/gman/importer'
|
7
7
|
|
8
|
-
|
9
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
10
|
-
domain_hash = Gman::Parser.array_to_hash(current)
|
11
|
-
domain_list = domain_hash.flat_map { |k,v| v }
|
8
|
+
current = Gman::DomainList.current
|
12
9
|
|
13
10
|
puts "Checking for duplicate domains in the domain list..."
|
14
|
-
puts "Current list contains #{
|
15
|
-
|
16
|
-
SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
|
17
|
-
source_hash = YAML.load(open(SOURCE).read)
|
18
|
-
source_list = source_hash.flat_map { |k,v| v }
|
19
|
-
|
20
|
-
dupes = []
|
21
|
-
domain_hash.each do |group,domains|
|
22
|
-
domains.each do |domain|
|
23
|
-
if domain_list.count(domain) > 1 && source_list.count(domain) <= 1
|
24
|
-
dupes.push(domain)
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
11
|
+
puts "Current list contains #{current.count} domains..."
|
28
12
|
|
29
|
-
|
13
|
+
# Tally every domain in a single pass, rather than re-flattening and
# rescanning the whole list once per domain
occurrences = Hash.new(0)
current.domains.each { |domain| occurrences[domain] += 1 }

dupe = current.count - occurrences.size
puts "Found #{dupe} duplicate domains"
exit 0 if dupe == 0

# Each duplicated domain is listed once, however many times it occurs
dupes = occurrences.select { |_domain, seen| seen > 1 }.keys
|
32
18
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
else
|
37
|
-
exit 0
|
38
|
-
end
|
19
|
+
puts "Duplicate domains:"
|
20
|
+
puts dupes
|
21
|
+
exit 1
|
data/script/prune
CHANGED
@@ -2,16 +2,19 @@
|
|
2
2
|
# Given an array of domains, removes them from the list
|
3
3
|
# Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
|
4
4
|
|
5
|
+
require_relative "../lib/gman"
|
6
|
+
require_relative "../lib/gman/domain_list"
|
7
|
+
|
5
8
|
# Domains to remove, with the commas shown in the example usage stripped
domains = ARGV.map { |d| d.delete(",") }

# File.read closes the file handle once the contents are read
list = File.read("./config/domains.txt")
|
9
|
-
puts "Starting list: #{
|
12
|
+
puts "Starting list: #{Gman::DomainList.current.count} domains"

domains.each do |domain|
  # An empty argument would otherwise match, and delete, every blank line
  next if domain.empty?
  # Escape the domain so its dots match literally rather than any character
  list.gsub!(/^#{Regexp.escape(domain)}$\n/, "")
end
|
14
17
|
|
15
|
-
puts "Ending list: #{
|
18
|
+
File.write "./config/domains.txt", list

# Gman::DomainList.current re-reads the list from disk, so the count is only
# meaningful once the pruned list has been written.
# NOTE(review): presumably Gman.list_path points at this same file — confirm.
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#! /usr/bin/env ruby
# Imports the domains listed in the robbi5/german-gov-domains CSV
# under the "German Federal" group

require 'csv'
require 'open-uri'
require './lib/gman'
require './lib/gman/importer'

url = "https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv"

# URI#read comes from open-uri; Kernel#open no longer accepts URLs as of Ruby 3.0
csv = URI.parse(url).read.force_encoding("iso-8859-1").encode("UTF-8")
rows = CSV.parse(csv, :headers => true)
domains = rows.map { |row| row["Domain Name"] }

Gman.import("German Federal" => domains)
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#! /usr/bin/env ruby
# Imports the "Internet" column of the mik.nrw.de municipal address export
# under the "German Municipalities" group

require 'csv'
require 'open-uri'
require './lib/gman'
require './lib/gman/importer'

url = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"

# URI#read comes from open-uri; Kernel#open no longer accepts URLs as of Ruby 3.0
csv = URI.parse(url).read.force_encoding("iso-8859-1").encode("UTF-8")

# For some reason, the header row is actually the last row
# Pop the last line off the file and prepend it at the beginning
# So that when we pass it to CSV it detects the headers properly
lines = csv.split("\n")
lines.unshift lines.pop
csv = lines.join("\n")

# Load municipal domains
data = CSV.parse(csv, :headers => true, :col_sep => ";")
domains = data.map { |row| row["Internet"] }

Gman.import("German Municipalities" => domains)
|
data/script/vendor-nl
CHANGED
@@ -1,5 +1,10 @@
|
|
1
|
-
|
1
|
+
#! /usr/bin/env ruby
# See https://github.com/github/government.github.com/pull/367#issuecomment-102108763

require 'fileutils'
# Gman.import is defined in lib/gman/importer, so both files must be loaded
require './lib/gman'
require './lib/gman/importer'

# Start from a clean mirror directory
FileUtils.rm_rf("almanak.overheid.nl")

# Mirror the site, then pull the domain part of every address containing "@"
# out of the downloaded files, keeping only unique .nl domains
domains = `wget -r -np https://almanak.overheid.nl/
grep @ -rI almanak.overheid.nl/|cut -f 2 -d @|cut -f 1 -d '"'|grep \\.nl$|sort|uniq`

Gman.import("Netherlands" => domains.split("\n"))
|
@@ -3,14 +3,13 @@
|
|
3
3
|
|
4
4
|
require "public_suffix"
|
5
5
|
require "yaml"
|
6
|
+
require_relative "../lib/gman"
|
7
|
+
require_relative "../lib/gman/importer"
|
6
8
|
|
7
9
|
# https://gist.github.com/benbalter/6147066
|
8
10
|
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
|
9
11
|
|
10
|
-
|
11
|
-
domains = YAML.load_file YAML_FILE
|
12
|
-
domains = [] unless domains
|
13
|
-
|
12
|
+
domains = []
|
14
13
|
PublicSuffix::List.default.each do |rule|
|
15
14
|
domain = nil
|
16
15
|
|
@@ -23,5 +22,4 @@ PublicSuffix::List.default.each do |rule|
|
|
23
22
|
domains.push domain unless domain.nil? or domains.include? domain
|
24
23
|
end
|
25
24
|
|
26
|
-
|
27
|
-
File.open(YAML_FILE, 'w+') {|f| f.write(domains.to_yaml)}
|
25
|
+
Gman.import("non-us gov" => domains)
|
data/script/vendor-se
CHANGED
@@ -2,52 +2,18 @@
|
|
2
2
|
|
3
3
|
require 'mechanize'
|
4
4
|
require 'csv'
|
5
|
-
require 'swot'
|
6
5
|
require './lib/gman'
|
7
|
-
require './lib/gman/
|
6
|
+
require './lib/gman/importer'
|
8
7
|
|
9
8
|
url = "http://www.myndighetsregistret.scb.se/Myndighet.aspx"
|
10
9
|
agent = Mechanize.new
|
11
10
|
page = agent.get(url)
|
12
|
-
form = page.
|
11
|
+
form = page.forms.first
|
13
12
|
# Checks a radio button on the form before it is submitted.
# NOTE(review): the block uses assignment (`=`), not comparison (`==`), so it
# is always truthy: `find` returns the first radio button and its value is
# overwritten with "Textfil". Presumably `==` was intended — confirm against
# the live form before changing, since `find` would then return nil (and
# `.check` would raise) if no button has exactly that value.
form.radiobuttons.find { |r| r.value = "Textfil" }.check
|
14
13
|
submit_button = form.buttons.find { |b| b.type == "submit" }
|
15
14
|
response = agent.submit(form, submit_button)
|
16
15
|
|
17
|
-
domains = []
|
18
16
|
rows = CSV.parse(response.content, :headers => true, :col_sep => "\t")
|
17
|
+
domains = rows.map { |row| row["Webbadress"] unless row["Namn"] =~ /UNIVERSITET/}
|
19
18
|
|
20
|
-
|
21
|
-
|
22
|
-
rows.each do |row|
|
23
|
-
next if Swot.valid?(row["Webbadress"]) # Filter out Swot'd domains
|
24
|
-
next if row["Namn"] =~ /UNIVERSITET/ # Filter out domains that are clearly edu
|
25
|
-
domains.push Gman.new(row["Webbadress"]).domain.to_s.gsub(/^www\./,"")
|
26
|
-
end
|
27
|
-
|
28
|
-
domains.reject! { |domain| domain.empty? }
|
29
|
-
domains.compact!
|
30
|
-
domains.uniq!
|
31
|
-
domains.select! { |domain| PublicSuffix.valid?(".#{domain}") }
|
32
|
-
|
33
|
-
puts "Ended up with #{domains.count} domains."
|
34
|
-
|
35
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
36
|
-
current_hash = Gman::Parser.array_to_hash(current)
|
37
|
-
|
38
|
-
current_hash["Swedish Administrative Authorities"] = domains
|
39
|
-
current_hash = current_hash.sort_by { |group, domains| group.downcase }
|
40
|
-
|
41
|
-
# PublicSuffix Formatted Output
|
42
|
-
current_group = ""
|
43
|
-
output = ""
|
44
|
-
current_hash.each do |group, domains|
|
45
|
-
if group != current_group
|
46
|
-
output << "\n\n" unless current_group.empty? # first entry
|
47
|
-
output << "// #{group}\n"
|
48
|
-
current_group = group
|
49
|
-
end
|
50
|
-
output << domains.join("\n")
|
51
|
-
end
|
52
|
-
|
53
|
-
File.open(Gman.list_path, "w") { |file| file.write output }
|
19
|
+
Gman.import("Swedish Administrative Authorities" => domains)
|