gman 4.6.5 → 4.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +0 -7
- data/config/domains.txt +283 -367
- data/gman.gemspec +1 -1
- data/lib/gman/domain_list.rb +78 -0
- data/lib/gman/importer.rb +135 -0
- data/script/alphabetize +4 -34
- data/script/cibuild +0 -1
- data/script/dedupe +10 -27
- data/script/prune +5 -2
- data/script/vendor-federal-de +14 -0
- data/script/vendor-municipal-de +23 -0
- data/script/vendor-nl +9 -4
- data/script/{build → vendor-public-suffix} +4 -6
- data/script/vendor-se +4 -38
- data/script/vendor-us +7 -67
- data/test/helper.rb +4 -2
- data/test/test_gman_country_codes.rb +3 -3
- data/test/test_gman_domains.rb +28 -0
- data/test/test_gman_identifier.rb +0 -1
- metadata +9 -8
- data/lib/gman/parser.rb +0 -59
- data/script/state-domains +0 -38
- data/script/vendor-de +0 -44
- data/test/test_domains.rb +0 -54
data/gman.gemspec
CHANGED
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
|
|
2
2
|
s.name = "gman"
|
3
3
|
s.summary = "Check if a given domain or email address belong to a governemnt entity"
|
4
4
|
s.description = "A ruby gem to check if the owner of a given email address is working for THE MAN."
|
5
|
-
s.version = '4.
|
5
|
+
s.version = '4.7.0'
|
6
6
|
s.authors = ["Ben Balter"]
|
7
7
|
s.email = "ben.balter@github.com"
|
8
8
|
s.homepage = "https://github.com/benbalter/gman"
|
@@ -0,0 +1,78 @@
|
|
1
|
+
class Gman < NaughtyOrNice
|
2
|
+
# In-memory representation of a public-suffix formatted domain list.
#
# The list is a hash of the form "group name" => [domain1, domain2, ...],
# where group names come from `// Group name` comment lines in the file.
class DomainList

  attr_accessor :list
  alias_method :to_h, :list

  # Matches a `// Group name` comment line; capture 1 is the group name
  COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i

  # list - hash of group name => array of domains
  def initialize(list)
    @list = list
  end

  # Returns an array of all group names, in hash order
  def groups
    list.keys
  end

  # Returns a flat array of every domain across all groups
  def domains
    list.values.flatten
  end

  # Returns the total number of domains across all groups
  def count
    domains.count
  end

  # Sorts groups case-insensitively by name, then sorts and de-duplicates
  # the domains within each group (the per-group arrays are modified in place)
  def alphabetize
    @list = @list.sort_by { |group, _| group.downcase }.to_h
    @list.each_value { |group_domains| group_domains.sort!.uniq! }
  end

  # Writes the list, in public suffix format, to Gman.list_path
  def write
    File.write(Gman.list_path, to_public_suffix)
  end

  # Returns the list as a public-suffix formatted string: groups sorted
  # case-insensitively, each rendered as a `// Group` header followed by one
  # domain per line, with a blank line between groups.
  #
  # Domains stored under the empty group name are emitted without a header,
  # but are still separated from the following group by a blank line.
  def to_public_suffix
    sections = list.sort_by { |group, _| group.downcase }.map do |group, group_domains|
      header = group.empty? ? "" : "// #{group}\n"
      header + group_domains.join("\n")
    end
    sections.join("\n\n")
  end

  # Returns a DomainList built from the file at Gman.list_path
  def self.current
    # File.read closes the file handle once the contents have been read
    from_public_suffix(File.read(Gman.list_path))
  end

  # Returns a DomainList parsed from a public-suffix formatted string.
  # Both Windows (\r\n) and old Mac (\r) line endings are accepted.
  def self.from_public_suffix(string)
    lines = string.gsub(/\r\n?/, "\n").split("\n")
    new(array_to_hash(lines))
  end

  # Given an array of comments/domains in public suffix format
  # Converts to a hash in the form of :group => [domain1, domain2...]
  #
  # Surrounding whitespace is ignored, blank lines are skipped, and domains
  # are downcased. Domains seen before any comment line land in the "" group.
  #
  # Note: this method is publicly callable. A bare `private` keyword does not
  # apply to `def self.` methods, so it has been dropped rather than left in
  # place suggesting otherwise.
  def self.array_to_hash(domains)
    group = ""
    domains.each_with_object({}) do |raw_line, domain_hash|
      line = raw_line.strip
      next if line.empty?

      if (match = COMMENT_REGEX.match(line))
        group = match[1]
      else
        (domain_hash[group] ||= []) << line.downcase
      end
    end
  end
end
|
78
|
+
end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
# Utility functions for parsing and manipulating public-suffix formatted domain lists
|
2
|
+
# Only used in development and not loaded by default
|
3
|
+
require 'yaml'
|
4
|
+
require 'open-uri'
|
5
|
+
require 'net/dns'
|
6
|
+
require 'net/dns/resolver'
|
7
|
+
require_relative '../gman'
|
8
|
+
require_relative './domain_list'
|
9
|
+
|
10
|
+
class Gman < NaughtyOrNice
|
11
|
+
# Filters a candidate set of domains (normalizing each and rejecting known
# false positives, duplicates, invalid, academic and unresolvable domains)
# and merges the survivors into the current domain list on disk.
class Importer

  attr_accessor :domains

  # Known false positives from vendored lists
  BLACKLIST = %w[
    business.centurytel.net
    chesnee.net
    citlink.net
    egovlink.com
    emainehosting.com
    fantasyspringsresort.com
    frontiernet.net
    hartford-hwp.com
    homepages.sover.net
    htc.net
    koasekabenaki.org
    kstrom.net
    laworkforce.net
    mississippistateparks.reserveamerica.com
    mylocalgov.com
    myweb.cebridge.net
    ncstars.org
    neagrelations.org
    qis.net
    rootsweb.com
    showcase.netins.net
    valuworld.com
    wctc.net
    webconnections.net
    webpages.charter.net
  ].freeze

  # domains - hash of group name => array of candidate domains
  def initialize(domains)
    @domains = DomainList.new(domains)
  end

  # Memoized logger writing to standard output
  def logger
    @logger ||= Logger.new(STDOUT)
  end

  # Returns the domain downcased and stripped of surrounding whitespace,
  # with a single leading "www." label and a single trailing slash removed.
  #
  # The dot after "www" is matched literally, so hosts that merely start with
  # the letters "www" (e.g. "wwwexample.gov", "www2.example.gov") are left intact.
  def normalize_domain(domain)
    domain.to_s.downcase.strip.sub(/\Awww\./, "").sub(/\/\z/, "")
  end

  # Returns true if the (already normalized) domain passes every filter.
  # Each rejection other than an empty domain is logged with its reason.
  #
  # options - :skip_dupe    => true to skip the "already in the list" check
  #           :skip_resolve => true to skip the DNS lookup
  def valid_domain?(domain, options={})
    return false if domain.empty?
    return reject(domain, "home. regex")  if domain =~ /^home\./
    return reject(domain, "user. regex")  if domain =~ /^users?\./
    return reject(domain, "sites. regex") if domain =~ /^sites?\./
    return reject(domain, "weebly")       if domain =~ /weebly\.com$/
    return reject(domain, "govoffice")    if domain =~ /govoffice\d?\.com$/
    return reject(domain, "homestead")    if domain =~ /homestead\.com$/
    return reject(domain, "wix.com")      if domain =~ /wix\.com$/
    return reject(domain, "locality")     if domain =~ Gman::LOCALITY_REGEX
    return reject(domain, "blacklist")    if BLACKLIST.include?(domain)
    return reject(domain, "duplicate")    if !options[:skip_dupe] && current.domains.include?(domain)
    return reject(domain, "invalid")      unless PublicSuffix.valid?(".#{domain}")
    return reject(domain, "academic")     if Swot::is_academic?(domain)
    # The DNS lookup goes last so the cheaper checks above run first
    return reject(domain, "unresolvable") if !options[:skip_resolve] && !domain_resolves?(domain)
    true
  end

  # Logs why the domain was rejected and returns false, so callers can
  # write `return reject(...)`
  def reject(domain, reason)
    logger.info "👎 `#{domain}`: #{reason}"
    false
  end

  # Memoized DomainList read from disk; #import merges into and writes this object
  def current
    @current ||= DomainList.current
  end

  # Normalizes and filters the candidate domains, merges the survivors into
  # the current list group by group, and writes the result to disk.
  #
  # The candidate arrays passed to the constructor are modified in place.
  # Exits the process with status 0 if no candidate survives filtering.
  def import
    logger.info "Current: #{current.count} domains"
    logger.info "Adding: #{domains.count} domains"

    domains.list.each_value do |candidates|
      candidates.map! { |domain| Gman.new(domain).to_s }
      candidates.map! { |domain| normalize_domain(domain) }
      candidates.select! { |domain| valid_domain?(domain) }
    end

    logger.info "Filtered to: #{domains.count} domains"

    if domains.count == 0
      logger.info "Nothing to add. Aborting"
      exit 0
    end

    domains.list.each do |group, additions|
      merged = (current.list[group] ||= [])
      merged.concat additions
      merged.sort! # Alphabetize
      merged.uniq! # Ensure uniqueness
    end

    logger.info "New: #{current.count} domains"

    logger.info "Writing to disk..."
    current.write
    logger.info "Fin."
  end

  # Memoized DNS resolver configured with the 8.8.8.8 / 8.8.4.4 nameservers
  def resolver
    @resolver ||= begin
      resolver = Net::DNS::Resolver.new
      resolver.nameservers = ["8.8.8.8","8.8.4.4"]
      resolver
    end
  end

  # Verifies that the given domain resolves: true if a lookup of the
  # resolver's default record type, an NS lookup, or an MX lookup returns
  # at least one answer. Lookups stop at the first one that succeeds.
  def domain_resolves?(domain)
    resolver.search(domain).header.anCount > 0 ||
    resolver.search(domain, Net::DNS::NS).header.anCount > 0 ||
    resolver.search(domain, Net::DNS::MX).header.anCount > 0
  end
end
|
129
|
+
end
|
130
|
+
|
131
|
+
class Gman < NaughtyOrNice
|
132
|
+
# Convenience entry point: builds a Gman::Importer for the given hash and
# runs its import, returning whatever Importer#import returns.
#
# hash - passed straight through to Gman::Importer.new
def self.import(hash)
  importer = Gman::Importer.new(hash)
  importer.import
end
|
135
|
+
end
|
data/script/alphabetize
CHANGED
@@ -5,38 +5,8 @@
|
|
5
5
|
# usage: script/alphabetize
|
6
6
|
|
7
7
|
require_relative "../lib/gman"
|
8
|
+
require_relative "../lib/gman/importer"
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
domains = domains.split("\n")
|
13
|
-
|
14
|
-
# Split list into grouped hash
|
15
|
-
group = ""
|
16
|
-
domain_hash = {}
|
17
|
-
domains.each do |line|
|
18
|
-
next if line.empty?
|
19
|
-
if match = /\/\/[\/\s]*(.*)$/i.match(line)
|
20
|
-
group = match[1]
|
21
|
-
else
|
22
|
-
domain_hash[group] = [] if domain_hash[group].nil?
|
23
|
-
domain_hash[group].push line.downcase
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
# Sort by groups
|
28
|
-
domain_hash = domain_hash.sort_by { |k,v| k.downcase }.to_h
|
29
|
-
|
30
|
-
# Sort within groups
|
31
|
-
domain_hash.each do |group, domains|
|
32
|
-
domain_hash[group].sort!
|
33
|
-
end
|
34
|
-
|
35
|
-
output = ""
|
36
|
-
domain_hash.each do |group, domains|
|
37
|
-
output << "// #{group}\n"
|
38
|
-
output << domains.join("\n")
|
39
|
-
output << "\n\n"
|
40
|
-
end
|
41
|
-
|
42
|
-
File.write Gman.list_path, output.strip
|
10
|
+
current = Gman::DomainList.current
|
11
|
+
current.alphabetize
|
12
|
+
current.write
|
data/script/cibuild
CHANGED
data/script/dedupe
CHANGED
@@ -3,36 +3,19 @@
|
|
3
3
|
require 'yaml'
|
4
4
|
require 'open-uri'
|
5
5
|
require './lib/gman'
|
6
|
-
require './lib/gman/
|
6
|
+
require './lib/gman/importer'
|
7
7
|
|
8
|
-
|
9
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
10
|
-
domain_hash = Gman::Parser.array_to_hash(current)
|
11
|
-
domain_list = domain_hash.flat_map { |k,v| v }
|
8
|
+
current = Gman::DomainList.current
|
12
9
|
|
13
10
|
puts "Checking for duplicate domains in the domain list..."
|
14
|
-
puts "Current list contains #{
|
15
|
-
|
16
|
-
SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
|
17
|
-
source_hash = YAML.load(open(SOURCE).read)
|
18
|
-
source_list = source_hash.flat_map { |k,v| v }
|
19
|
-
|
20
|
-
dupes = []
|
21
|
-
domain_hash.each do |group,domains|
|
22
|
-
domains.each do |domain|
|
23
|
-
if domain_list.count(domain) > 1 && source_list.count(domain) <= 1
|
24
|
-
dupes.push(domain)
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
11
|
+
puts "Current list contains #{current.count} domains..."
|
28
12
|
|
29
|
-
|
13
|
+
dupe = current.count - current.domains.uniq.count
|
14
|
+
puts "Found #{dupe} duplicate domains"
|
15
|
+
exit 0 if dupe == 0
|
30
16
|
|
31
|
-
|
17
|
+
dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
|
32
18
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
else
|
37
|
-
exit 0
|
38
|
-
end
|
19
|
+
puts "Duplicate domains:"
|
20
|
+
puts dupes
|
21
|
+
exit 1
|
data/script/prune
CHANGED
@@ -2,16 +2,19 @@
|
|
2
2
|
# Given an array of domains, removes them from the list
|
3
3
|
# Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
|
4
4
|
|
5
|
+
require_relative "../lib/gman"
|
6
|
+
require_relative "../lib/gman/domain_list"
|
7
|
+
|
5
8
|
domains = ARGV
|
6
9
|
domains = domains.clone.map { |d| d.gsub ",", "" }
|
7
10
|
|
8
11
|
list = File.open("./config/domains.txt").read
|
9
|
-
puts "Starting list: #{
|
12
|
+
puts "Starting list: #{Gman::DomainList.current.count} domains"
|
10
13
|
|
11
14
|
domains.each do |domain|
|
12
15
|
list.gsub! /^#{domain}$\n/, ""
|
13
16
|
end
|
14
17
|
|
15
|
-
puts "Ending list: #{
|
18
|
+
puts "Ending list: #{Gman::DomainList.current.count} domains"
|
16
19
|
|
17
20
|
File.write "./config/domains.txt", list
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#! /usr/bin/env ruby
# Downloads a CSV of German government domains and imports its
# "Domain Name" column into the domain list under the "German Federal" group.

require 'csv'
require 'open-uri'
require './lib/gman'
require './lib/gman/importer'

url = "https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv"

# Fetch via URI#read (added to URI objects by open-uri) instead of Kernel#open:
# calling Kernel#open with a URL is deprecated and no longer works on Ruby 3.0+.
# The payload is treated as ISO-8859-1 and transcoded to UTF-8 before parsing.
csv = URI.parse(url).read.force_encoding("iso-8859-1").encode("UTF-8")
rows = CSV.parse(csv, :headers => true)
domains = rows.map { |row| row["Domain Name"] }

Gman.import("German Federal" => domains)
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#! /usr/bin/env ruby
# Downloads a semicolon-separated CSV export of municipal addresses and imports
# its "Internet" column into the domain list under the "German Municipalities" group.

require 'csv'
require 'open-uri'
require './lib/gman'
require './lib/gman/importer'

url = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"

# Fetch via URI#read (added to URI objects by open-uri) instead of Kernel#open:
# calling Kernel#open with a URL is deprecated and no longer works on Ruby 3.0+.
# The payload is treated as ISO-8859-1 and transcoded to UTF-8 before parsing.
csv = URI.parse(url).read.force_encoding("iso-8859-1").encode("UTF-8")

# For some reason, the header row is actually the last row
# Pop the last line off the file and prepend it at the beginning
# So that when we pass it to CSV it detects the headers properly
lines = csv.split("\n")
lines.unshift lines.pop
csv = lines.join("\n")

# Load municipal domains
data = CSV.parse(csv, :headers => true, :col_sep => ";")
domains = data.map { |row| row["Internet"] }

Gman.import("German Municipalities" => domains)
|
data/script/vendor-nl
CHANGED
@@ -1,5 +1,10 @@
|
|
1
|
-
|
1
|
+
#! /usr/bin/env ruby
# See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
#
# Mirrors almanak.overheid.nl with wget, extracts the domain part of every
# e-mail address found in the mirror, and imports the results into the domain
# list under the "Netherlands" group.

require 'fileutils'
# Gman.import is defined in lib/gman/importer; without these requires the
# final line of this script raises NameError.
require './lib/gman'
require './lib/gman/importer'

# Start from a clean mirror directory so stale pages are not re-scanned
FileUtils.rm_rf("almanak.overheid.nl")
domains = `wget -r -np https://almanak.overheid.nl/
grep @ -rI almanak.overheid.nl/|cut -f 2 -d @|cut -f 1 -d '"'|grep \\.nl$|sort|uniq`

Gman.import("Netherlands" => domains.split("\n"))
|
@@ -3,14 +3,13 @@
|
|
3
3
|
|
4
4
|
require "public_suffix"
|
5
5
|
require "yaml"
|
6
|
+
require_relative "../lib/gman"
|
7
|
+
require_relative "../lib/gman/importer"
|
6
8
|
|
7
9
|
# https://gist.github.com/benbalter/6147066
|
8
10
|
REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
|
9
11
|
|
10
|
-
|
11
|
-
domains = YAML.load_file YAML_FILE
|
12
|
-
domains = [] unless domains
|
13
|
-
|
12
|
+
domains = []
|
14
13
|
PublicSuffix::List.default.each do |rule|
|
15
14
|
domain = nil
|
16
15
|
|
@@ -23,5 +22,4 @@ PublicSuffix::List.default.each do |rule|
|
|
23
22
|
domains.push domain unless domain.nil? or domains.include? domain
|
24
23
|
end
|
25
24
|
|
26
|
-
|
27
|
-
File.open(YAML_FILE, 'w+') {|f| f.write(domains.to_yaml)}
|
25
|
+
Gman.import("non-us gov" => domains)
|
data/script/vendor-se
CHANGED
@@ -2,52 +2,18 @@
|
|
2
2
|
|
3
3
|
require 'mechanize'
|
4
4
|
require 'csv'
|
5
|
-
require 'swot'
|
6
5
|
require './lib/gman'
|
7
|
-
require './lib/gman/
|
6
|
+
require './lib/gman/importer'
|
8
7
|
|
9
8
|
url = "http://www.myndighetsregistret.scb.se/Myndighet.aspx"
|
10
9
|
agent = Mechanize.new
|
11
10
|
page = agent.get(url)
|
12
|
-
form = page.
|
11
|
+
form = page.forms.first
|
13
12
|
form.radiobuttons.find { |r| r.value = "Textfil" }.check
|
14
13
|
submit_button = form.buttons.find { |b| b.type == "submit" }
|
15
14
|
response = agent.submit(form, submit_button)
|
16
15
|
|
17
|
-
domains = []
|
18
16
|
rows = CSV.parse(response.content, :headers => true, :col_sep => "\t")
|
17
|
+
domains = rows.map { |row| row["Webbadress"] unless row["Namn"] =~ /UNIVERSITET/}
|
19
18
|
|
20
|
-
|
21
|
-
|
22
|
-
rows.each do |row|
|
23
|
-
next if Swot.valid?(row["Webbadress"]) # Filter out Swot'd domains
|
24
|
-
next if row["Namn"] =~ /UNIVERSITET/ # Filter out domains that are clearly edu
|
25
|
-
domains.push Gman.new(row["Webbadress"]).domain.to_s.gsub(/^www\./,"")
|
26
|
-
end
|
27
|
-
|
28
|
-
domains.reject! { |domain| domain.empty? }
|
29
|
-
domains.compact!
|
30
|
-
domains.uniq!
|
31
|
-
domains.select! { |domain| PublicSuffix.valid?(".#{domain}") }
|
32
|
-
|
33
|
-
puts "Ended up with #{domains.count} domains."
|
34
|
-
|
35
|
-
current = Gman::Parser.file_to_array( Gman::list_path )
|
36
|
-
current_hash = Gman::Parser.array_to_hash(current)
|
37
|
-
|
38
|
-
current_hash["Swedish Administrative Authorities"] = domains
|
39
|
-
current_hash = current_hash.sort_by { |group, domains| group.downcase }
|
40
|
-
|
41
|
-
# PublicSuffix Formatted Output
|
42
|
-
current_group = ""
|
43
|
-
output = ""
|
44
|
-
current_hash.each do |group, domains|
|
45
|
-
if group != current_group
|
46
|
-
output << "\n\n" unless current_group.empty? # first entry
|
47
|
-
output << "// #{group}\n"
|
48
|
-
current_group = group
|
49
|
-
end
|
50
|
-
output << domains.join("\n")
|
51
|
-
end
|
52
|
-
|
53
|
-
File.open(Gman.list_path, "w") { |file| file.write output }
|
19
|
+
Gman.import("Swedish Administrative Authorities" => domains)
|