gman 4.6.5 → 4.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/gman.gemspec CHANGED
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
2
2
  s.name = "gman"
3
3
  s.summary = "Check if a given domain or email address belong to a governemnt entity"
4
4
  s.description = "A ruby gem to check if the owner of a given email address is working for THE MAN."
5
- s.version = '4.6.5'
5
+ s.version = '4.7.0'
6
6
  s.authors = ["Ben Balter"]
7
7
  s.email = "ben.balter@github.com"
8
8
  s.homepage = "https://github.com/benbalter/gman"
@@ -0,0 +1,78 @@
1
class Gman < NaughtyOrNice
  # In-memory form of a public-suffix formatted domain list: a hash mapping
  # a group name (the text of a `//` comment line) to an array of domains.
  class DomainList

    # Matches a `//` comment line; capture 1 is the text after the slashes
    # and any leading whitespace, which is used as the group name.
    COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i

    attr_accessor :list
    alias_method :to_h, :list

    # list - Hash of group name => Array of domain strings
    def initialize(list)
      @list = list
    end

    # Returns the Array of group names
    def groups
      list.keys
    end

    # Returns a flat Array of every domain across all groups
    def domains
      list.values.flatten
    end

    # Returns the total number of domains (duplicates included)
    def count
      domains.count
    end

    # Sorts groups case-insensitively by name, then sorts and de-duplicates
    # the domains of each group in place. Returns the re-ordered hash.
    def alphabetize
      @list = @list.sort_by { |group, _domains| group.downcase }.to_h
      # sort! always returns the array, so chaining uniq! (which may return
      # nil) is safe here because its return value is discarded
      @list.each_value { |domains| domains.sort!.uniq! }
    end

    # Writes the list, in public suffix format, to Gman.list_path
    def write
      File.write(Gman.list_path, to_public_suffix)
    end

    # Serializes the list as public suffix text: one `// group` header per
    # group followed by its domains, groups separated by a blank line and
    # ordered case-insensitively by name. Returns a String.
    def to_public_suffix
      sections = list.sort_by { |group, _domains| group.downcase }.map do |group, domains|
        # Domains parsed before any comment line live under the "" group;
        # they get no header, but must still be separated from what follows
        header = group.empty? ? [] : ["// #{group}"]
        (header + domains).join("\n")
      end
      sections.join("\n\n")
    end

    # Builds a DomainList from the file at Gman.list_path
    def self.current
      # File.read closes the handle, unlike File.open(...).read
      from_public_suffix(File.read(Gman.list_path))
    end

    # Builds a DomainList from a public suffix formatted String.
    # Line endings (\r\n, \r) are normalized before parsing.
    def self.from_public_suffix(string)
      lines = string.gsub(/\r\n?/, "\n").split("\n")
      new(array_to_hash(lines))
    end

    # Given an array of comments/domains in public suffix format
    # Converts to a hash in the form of :group => [domain1, domain2...]
    #
    # Blank lines are skipped, surrounding whitespace is ignored, and domains
    # are downcased. Domains that appear before the first comment are filed
    # under the "" group.
    #
    # NOTE: this stays publicly callable. A bare `private` does not apply to
    # `def self.` methods, so the original was never actually private.
    def self.array_to_hash(domains)
      group = ""
      domains.each_with_object({}) do |line, domain_hash|
        line = line.strip
        next if line.empty?
        if (match = COMMENT_REGEX.match(line))
          group = match[1]
        else
          (domain_hash[group] ||= []) << line.downcase
        end
      end
    end
  end
end
@@ -0,0 +1,135 @@
1
+ # Utility functions for parsing and manipulating public-suffix formatted domain lists
2
+ # Only used in development and not loaded by default
3
+ require 'yaml'
4
+ require 'open-uri'
5
+ require 'net/dns'
6
+ require 'net/dns/resolver'
7
+ require_relative '../gman'
8
+ require_relative './domain_list'
9
+
10
class Gman < NaughtyOrNice
  # Development helper that takes a hash of candidate domains
  # (group name => array of domains), filters out the ones that fail
  # `valid_domain?`, and merges the rest into the list on disk.
  class Importer

    attr_accessor :domains

    # Known false positives from vendored lists
    BLACKLIST = %w[
      business.centurytel.net
      chesnee.net
      citlink.net
      egovlink.com
      emainehosting.com
      fantasyspringsresort.com
      frontiernet.net
      hartford-hwp.com
      homepages.sover.net
      htc.net
      koasekabenaki.org
      kstrom.net
      laworkforce.net
      mississippistateparks.reserveamerica.com
      mylocalgov.com
      myweb.cebridge.net
      ncstars.org
      neagrelations.org
      qis.net
      rootsweb.com
      showcase.netins.net
      valuworld.com
      wctc.net
      webconnections.net
      webpages.charter.net
    ].freeze

    # domains - Hash of group name => Array of candidate domains
    def initialize(domains)
      @domains = DomainList.new(domains)
    end

    # Memoized logger writing to STDOUT
    def logger
      @logger ||= Logger.new(STDOUT)
    end

    # Downcases and strips the given value, then removes a leading "www."
    # and a single trailing slash. Returns a String (nil becomes "").
    def normalize_domain(domain)
      # The dot is escaped so that only a literal "www." prefix is removed;
      # an unescaped dot would also eat the "2" in "www2.example.gov"
      domain.to_s.downcase.strip.sub(/\Awww\./, "").sub(/\/\z/, "")
    end

    # Decides whether a (normalized) domain should be added to the list.
    # Every rejection is logged with its reason via `reject`.
    #
    # domain  - the domain String to check
    # options - Hash of flags:
    #           :skip_dupe    - don't reject domains already in the current list
    #           :skip_resolve - don't perform the DNS lookup
    #
    # Returns true if the domain passed every check, false otherwise.
    def valid_domain?(domain, options={})
      return false if domain.empty?
      return reject(domain, "home. regex")  if domain =~ /^home\./
      return reject(domain, "user. regex")  if domain =~ /^users?\./
      return reject(domain, "sites. regex") if domain =~ /^sites?\./
      return reject(domain, "weebly")       if domain =~ /weebly\.com$/
      return reject(domain, "govoffice")    if domain =~ /govoffice\d?\.com$/
      return reject(domain, "homestead")    if domain =~ /homestead\.com$/
      return reject(domain, "wix.com")      if domain =~ /wix\.com$/
      return reject(domain, "locality")     if domain =~ Gman::LOCALITY_REGEX
      return reject(domain, "blacklist")    if BLACKLIST.include?(domain)
      return reject(domain, "duplicate")    if !options[:skip_dupe] && current.domains.include?(domain)
      return reject(domain, "invalid")      unless PublicSuffix.valid?(".#{domain}")
      return reject(domain, "academic")     if Swot::is_academic?(domain)
      # The DNS lookup is last because it is the only check that hits the network
      return reject(domain, "unresolvable") if !options[:skip_resolve] && !domain_resolves?(domain)
      true
    end

    # Logs why a domain was rejected. Always returns false so callers can
    # `return reject(...)`.
    def reject(domain, reason)
      logger.info "👎 `#{domain}`: #{reason}"
      false
    end

    # Memoized DomainList loaded from disk
    def current
      @current ||= DomainList.current
    end

    # Normalizes and filters the candidate domains (mutating the arrays that
    # were passed in), merges the survivors into the current list group by
    # group, and writes the result to disk.
    #
    # NOTE: exits the process with status 0 when nothing survives filtering.
    def import
      logger.info "Current: #{current.count} domains"
      logger.info "Adding: #{domains.count} domains"

      domains.list.each_value do |candidates|
        candidates.map! { |domain| normalize_domain(Gman.new(domain).to_s) }
        candidates.select! { |domain| valid_domain?(domain) }
      end

      logger.info "Filtered to: #{domains.count} domains"

      if domains.count.zero?
        logger.info "Nothing to add. Aborting"
        exit 0
      end

      domains.list.each do |group, additions|
        merged = (current.list[group] ||= [])
        merged.concat additions
        merged.sort! # Alphabetize
        merged.uniq! # Ensure uniqueness
      end

      logger.info "New: #{current.count} domains"

      logger.info "Writing to disk..."
      current.write
      logger.info "Fin."
    end

    # Memoized DNS resolver pointed at the 8.8.8.8 / 8.8.4.4 nameservers
    def resolver
      @resolver ||= begin
        resolver = Net::DNS::Resolver.new
        resolver.nameservers = ["8.8.8.8","8.8.4.4"]
        resolver
      end
    end

    # Verifies that the given domain resolves: true if the default lookup,
    # an NS lookup, or an MX lookup returns at least one answer.
    # Lookups short-circuit, so later queries only run when earlier ones
    # come back empty.
    def domain_resolves?(domain)
      resolver.search(domain).header.anCount > 0 ||
      resolver.search(domain, Net::DNS::NS).header.anCount > 0 ||
      resolver.search(domain, Net::DNS::MX).header.anCount > 0
    end
  end
end
130
+
131
class Gman < NaughtyOrNice
  # Shortcut for running an import: wraps the given hash of
  # group name => domains in a Gman::Importer and returns the result
  # of its `import` call.
  def self.import(hash)
    importer = Gman::Importer.new(hash)
    importer.import
  end
end
data/script/alphabetize CHANGED
@@ -5,38 +5,8 @@
5
5
  # usage: script/alphabetize
6
6
 
7
7
  require_relative "../lib/gman"
8
+ require_relative "../lib/gman/importer"
8
9
 
9
- # Read in existing list
10
- domains = File.open(Gman.list_path).read
11
- domains = domains.gsub /\r\n?/, "\n" #normalize line endings
12
- domains = domains.split("\n")
13
-
14
- # Split list into grouped hash
15
- group = ""
16
- domain_hash = {}
17
- domains.each do |line|
18
- next if line.empty?
19
- if match = /\/\/[\/\s]*(.*)$/i.match(line)
20
- group = match[1]
21
- else
22
- domain_hash[group] = [] if domain_hash[group].nil?
23
- domain_hash[group].push line.downcase
24
- end
25
- end
26
-
27
- # Sort by groups
28
- domain_hash = domain_hash.sort_by { |k,v| k.downcase }.to_h
29
-
30
- # Sort within groups
31
- domain_hash.each do |group, domains|
32
- domain_hash[group].sort!
33
- end
34
-
35
- output = ""
36
- domain_hash.each do |group, domains|
37
- output << "// #{group}\n"
38
- output << domains.join("\n")
39
- output << "\n\n"
40
- end
41
-
42
- File.write Gman.list_path, output.strip
10
+ current = Gman::DomainList.current
11
+ current.alphabetize
12
+ current.write
data/script/cibuild CHANGED
@@ -4,4 +4,3 @@ set -e
4
4
 
5
5
  bundle exec rake test
6
6
  bundle exec script/dedupe
7
- bundle exec script/state-domains
data/script/dedupe CHANGED
@@ -3,36 +3,19 @@
3
3
  require 'yaml'
4
4
  require 'open-uri'
5
5
  require './lib/gman'
6
- require './lib/gman/parser'
6
+ require './lib/gman/importer'
7
7
 
8
-
9
- current = Gman::Parser.file_to_array( Gman::list_path )
10
- domain_hash = Gman::Parser.array_to_hash(current)
11
- domain_list = domain_hash.flat_map { |k,v| v }
8
+ current = Gman::DomainList.current
12
9
 
13
10
  puts "Checking for duplicate domains in the domain list..."
14
- puts "Current list contains #{domain_list.count} domains..."
15
-
16
- SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
17
- source_hash = YAML.load(open(SOURCE).read)
18
- source_list = source_hash.flat_map { |k,v| v }
19
-
20
- dupes = []
21
- domain_hash.each do |group,domains|
22
- domains.each do |domain|
23
- if domain_list.count(domain) > 1 && source_list.count(domain) <= 1
24
- dupes.push(domain)
25
- end
26
- end
27
- end
11
+ puts "Current list contains #{current.count} domains..."
28
12
 
29
- dupes.uniq!
13
+ dupe = current.count - current.domains.uniq.count
14
+ puts "Found #{dupe} duplicate domains"
15
+ exit 0 if dupe == 0
30
16
 
31
- puts "Found #{dupes.count} dupes!"
17
+ dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
32
18
 
33
- if dupes.count > 0
34
- puts dupes.inspect
35
- exit 1
36
- else
37
- exit 0
38
- end
19
+ puts "Duplicate domains:"
20
+ puts dupes
21
+ exit 1
data/script/prune CHANGED
@@ -2,16 +2,19 @@
2
2
  # Given an array of domains, removes them from the list
3
3
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
4
 
5
+ require_relative "../lib/gman"
6
+ require_relative "../lib/gman/domain_list"
7
+
5
8
  domains = ARGV
6
9
  domains = domains.clone.map { |d| d.gsub ",", "" }
7
10
 
8
11
  list = File.open("./config/domains.txt").read
9
- puts "Starting list: #{list.size} lines"
12
+ puts "Starting list: #{Gman::DomainList.current.count} domains"
10
13
 
11
14
  domains.each do |domain|
12
15
  list.gsub! /^#{domain}$\n/, ""
13
16
  end
14
17
 
15
- puts "Ending list: #{list.size} lines"
18
+ puts "Ending list: #{Gman::DomainList.current.count} domains"
16
19
 
17
20
  File.write "./config/domains.txt", list
@@ -0,0 +1,14 @@
1
#! /usr/bin/env ruby
# Downloads the german-gov-domains CSV and imports its "Domain Name"
# column under the "German Federal" group.

require 'csv'
require 'open-uri'
require './lib/gman'
require './lib/gman/importer'

url = "https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv"

# URI#read (provided by open-uri) fetches the body without going through
# Kernel#open, which no longer accepts URLs on newer Rubies.
# The payload is treated as ISO-8859-1 and transcoded to UTF-8.
csv = URI.parse(url).read.force_encoding("iso-8859-1").encode("UTF-8")
rows = CSV.parse(csv, :headers => true)
domains = rows.map { |row| row["Domain Name"] }

Gman.import("German Federal" => domains)
@@ -0,0 +1,23 @@
1
#! /usr/bin/env ruby
# Downloads the NRW municipal address CSV export and imports its
# "Internet" column under the "German Municipalities" group.

require 'csv'
require 'open-uri'
require './lib/gman'
require './lib/gman/importer'

url = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"

# URI#read (provided by open-uri) fetches the body without going through
# Kernel#open, which no longer accepts URLs on newer Rubies.
# The payload is treated as ISO-8859-1 and transcoded to UTF-8.
csv = URI.parse(url).read.force_encoding("iso-8859-1").encode("UTF-8")

# For some reason, the header row is actually the last row
# Pop the last line off the file and prepend it at the beginning
# So that when we pass it to CSV it detects the headers properly
lines = csv.split("\n")
lines.unshift lines.pop
csv = lines.join("\n")

# Load municipal domains
data = CSV.parse(csv, :headers => true, :col_sep => ";")
domains = data.map { |row| row["Internet"] }

Gman.import("German Municipalities" => domains)
data/script/vendor-nl CHANGED
@@ -1,5 +1,10 @@
1
#! /usr/bin/env ruby
# Mirrors almanak.overheid.nl, extracts the .nl domains found in e-mail
# addresses within the mirror, and imports them under the "Netherlands" group.
# See https://github.com/github/government.github.com/pull/367#issuecomment-102108763

require 'fileutils'
# Gman.import is defined in lib/gman/importer; without these requires the
# script fails with a NameError on its last line
require_relative '../lib/gman'
require_relative '../lib/gman/importer'

# Start from a clean mirror directory so stale pages are not re-scanned
FileUtils.rm_rf("almanak.overheid.nl")
system("wget", "-r", "-np", "https://almanak.overheid.nl/")

# The grep pattern is single-quoted for the shell so that grep receives
# `\.nl$` (a literal dot) rather than `.nl$`
domains = `grep @ -rI almanak.overheid.nl/|cut -f 2 -d @|cut -f 1 -d '"'|grep '\\.nl$'|sort|uniq`

Gman.import("Netherlands" => domains.split("\n"))
@@ -3,14 +3,13 @@
3
3
 
4
4
  require "public_suffix"
5
5
  require "yaml"
6
+ require_relative "../lib/gman"
7
+ require_relative "../lib/gman/importer"
6
8
 
7
9
  # https://gist.github.com/benbalter/6147066
8
10
  REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
9
11
 
10
- YAML_FILE = File.dirname(__FILE__) + "/../lib/domains.yml"
11
- domains = YAML.load_file YAML_FILE
12
- domains = [] unless domains
13
-
12
+ domains = []
14
13
  PublicSuffix::List.default.each do |rule|
15
14
  domain = nil
16
15
 
@@ -23,5 +22,4 @@ PublicSuffix::List.default.each do |rule|
23
22
  domains.push domain unless domain.nil? or domains.include? domain
24
23
  end
25
24
 
26
- domains = domains.sort
27
- File.open(YAML_FILE, 'w+') {|f| f.write(domains.to_yaml)}
25
+ Gman.import("non-us gov" => domains)
data/script/vendor-se CHANGED
@@ -2,52 +2,18 @@
2
2
 
3
3
  require 'mechanize'
4
4
  require 'csv'
5
- require 'swot'
6
5
  require './lib/gman'
7
- require './lib/gman/parser'
6
+ require './lib/gman/importer'
8
7
 
9
8
  url = "http://www.myndighetsregistret.scb.se/Myndighet.aspx"
10
9
  agent = Mechanize.new
11
10
  page = agent.get(url)
12
- form = page.form("form1")
11
+ form = page.forms.first
13
12
  form.radiobuttons.find { |r| r.value = "Textfil" }.check
14
13
  submit_button = form.buttons.find { |b| b.type == "submit" }
15
14
  response = agent.submit(form, submit_button)
16
15
 
17
- domains = []
18
16
  rows = CSV.parse(response.content, :headers => true, :col_sep => "\t")
17
+ domains = rows.map { |row| row["Webbadress"] unless row["Namn"] =~ /UNIVERSITET/}
19
18
 
20
- puts "Starting with #{rows.count} domains..."
21
-
22
- rows.each do |row|
23
- next if Swot.valid?(row["Webbadress"]) # Filter out Swot'd domains
24
- next if row["Namn"] =~ /UNIVERSITET/ # Filter out domains that are clearly edu
25
- domains.push Gman.new(row["Webbadress"]).domain.to_s.gsub(/^www\./,"")
26
- end
27
-
28
- domains.reject! { |domain| domain.empty? }
29
- domains.compact!
30
- domains.uniq!
31
- domains.select! { |domain| PublicSuffix.valid?(".#{domain}") }
32
-
33
- puts "Ended up with #{domains.count} domains."
34
-
35
- current = Gman::Parser.file_to_array( Gman::list_path )
36
- current_hash = Gman::Parser.array_to_hash(current)
37
-
38
- current_hash["Swedish Administrative Authorities"] = domains
39
- current_hash = current_hash.sort_by { |group, domains| group.downcase }
40
-
41
- # PublicSuffix Formatted Output
42
- current_group = ""
43
- output = ""
44
- current_hash.each do |group, domains|
45
- if group != current_group
46
- output << "\n\n" unless current_group.empty? # first entry
47
- output << "// #{group}\n"
48
- current_group = group
49
- end
50
- output << domains.join("\n")
51
- end
52
-
53
- File.open(Gman.list_path, "w") { |file| file.write output }
19
+ Gman.import("Swedish Administrative Authorities" => domains)