gman 4.6.5 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/gman.gemspec CHANGED
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
2
2
  s.name = "gman"
3
3
  s.summary = "Check if a given domain or email address belong to a governemnt entity"
4
4
  s.description = "A ruby gem to check if the owner of a given email address is working for THE MAN."
5
- s.version = '4.6.5'
5
+ s.version = '4.7.0'
6
6
  s.authors = ["Ben Balter"]
7
7
  s.email = "ben.balter@github.com"
8
8
  s.homepage = "https://github.com/benbalter/gman"
@@ -0,0 +1,78 @@
1
+ class Gman < NaughtyOrNice
2
+ class DomainList
3
+
4
+ attr_accessor :list
5
+ alias_method :to_h, :list
6
+
7
+ COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i
8
+
9
+ def initialize(list)
10
+ @list = list
11
+ end
12
+
13
+ def groups
14
+ list.keys
15
+ end
16
+
17
+ def domains
18
+ list.values.flatten
19
+ end
20
+
21
+ def count
22
+ domains.count
23
+ end
24
+
25
+ def alphabetize
26
+ @list = @list.sort_by { |k,v| k.downcase }.to_h
27
+ @list.each { |group, domains| domains.sort!.uniq! }
28
+ end
29
+
30
+ def write
31
+ File.write(Gman.list_path, to_public_suffix)
32
+ end
33
+
34
+ def to_public_suffix
35
+ current_group = ""
36
+ output = ""
37
+ list.sort_by { |group, domains| group.downcase }.each do |group, domains|
38
+ if group != current_group
39
+ output << "\n\n" unless current_group.empty? # first entry
40
+ output << "// #{group}\n"
41
+ current_group = group
42
+ end
43
+ output << domains.join("\n")
44
+ end
45
+ output
46
+ end
47
+
48
+ def self.current
49
+ current = File.open(Gman::list_path).read
50
+ DomainList.from_public_suffix(current)
51
+ end
52
+
53
+ def self.from_public_suffix(string)
54
+ string = string.gsub(/\r\n?/, "\n").split("\n")
55
+ hash = array_to_hash(string)
56
+ DomainList.new(hash)
57
+ end
58
+
59
+ private
60
+
61
+ # Given an array of comments/domains in public suffix format
62
+ # Converts to a hash in the form of :group => [domain1, domain2...]
63
+ def self.array_to_hash(domains)
64
+ group = ""
65
+ domain_hash = {}
66
+ domains.each do |line|
67
+ next if line.empty?
68
+ if match = COMMENT_REGEX.match(line)
69
+ group = match[1]
70
+ else
71
+ domain_hash[group] = [] if domain_hash[group].nil?
72
+ domain_hash[group].push line.downcase
73
+ end
74
+ end
75
+ domain_hash
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,135 @@
1
+ # Utility functions for parsing and manipulating public-suffix formatted domain lists
2
+ # Only used in development and not loaded by default
3
+ require 'yaml'
4
+ require 'open-uri'
5
+ require 'net/dns'
6
+ require 'net/dns/resolver'
7
+ require_relative '../gman'
8
+ require_relative './domain_list'
9
+
10
+ class Gman < NaughtyOrNice
11
+ class Importer
12
+
13
+ attr_accessor :domains
14
+
15
+ # Known false positives from vendored lists
16
+ BLACKLIST = %w[
17
+ business.centurytel.net
18
+ chesnee.net
19
+ citlink.net
20
+ egovlink.com
21
+ emainehosting.com
22
+ fantasyspringsresort.com
23
+ frontiernet.net
24
+ hartford-hwp.com
25
+ homepages.sover.net
26
+ htc.net
27
+ koasekabenaki.org
28
+ kstrom.net
29
+ laworkforce.net
30
+ mississippistateparks.reserveamerica.com
31
+ mylocalgov.com
32
+ myweb.cebridge.net
33
+ ncstars.org
34
+ neagrelations.org
35
+ qis.net
36
+ rootsweb.com
37
+ showcase.netins.net
38
+ valuworld.com
39
+ wctc.net
40
+ webconnections.net
41
+ webpages.charter.net
42
+ ]
43
+
44
+ def initialize(domains)
45
+ @domains = DomainList.new(domains)
46
+ end
47
+
48
+ def logger
49
+ @logger ||= Logger.new(STDOUT)
50
+ end
51
+
52
+ def normalize_domain(domain)
53
+ domain.to_s.downcase.strip.gsub(/^www./, "").gsub(/\/$/, "")
54
+ end
55
+
56
+ def valid_domain?(domain, options={})
57
+ return false if domain.empty?
58
+ return reject(domain, "home. regex") if domain =~ /^home\./
59
+ return reject(domain, "user. regex") if domain =~ /^users?\./
60
+ return reject(domain, "sites. regex") if domain =~ /^sites?\./
61
+ return reject(domain, "weebly") if domain =~ /weebly\.com$/
62
+ return reject(domain, "govoffice") if domain =~ /govoffice\d?\.com$/
63
+ return reject(domain, "homestead") if domain =~ /homestead\.com$/
64
+ return reject(domain, "wix.com") if domain =~ /wix\.com$/
65
+ return reject(domain, "locality") if domain =~ Gman::LOCALITY_REGEX
66
+ return reject(domain, "blacklist") if BLACKLIST.include?(domain)
67
+ return reject(domain, "duplicate") if !options[:skip_dupe] && current.domains.include?(domain)
68
+ return reject(domain, "invalid") unless PublicSuffix.valid?(".#{domain}")
69
+ return reject(domain, "academic") if Swot::is_academic?(domain)
70
+ return reject(domain, "unresolvable") if !options[:skip_resolve] && !domain_resolves?(domain)
71
+ true
72
+ end
73
+
74
+ def reject(domain, reason)
75
+ logger.info "👎 `#{domain}`: #{reason}"
76
+ false
77
+ end
78
+
79
+ def current
80
+ @current ||= DomainList.current
81
+ end
82
+
83
+ def import
84
+ logger.info "Current: #{Gman::DomainList.current.count} domains"
85
+ logger.info "Adding: #{domains.count} domains"
86
+
87
+ domains.list.each do |group, domains|
88
+ domains.map! { |domain| Gman.new(domain).to_s }
89
+ domains.map! { |domain| normalize_domain(domain) }
90
+ domains.select! { |domain| valid_domain?(domain) }
91
+ end
92
+
93
+ logger.info "Filtered to: #{domains.count} domains"
94
+
95
+ if domains.count == 0
96
+ logger.info "Nothing to add. Aborting"
97
+ exit 0
98
+ end
99
+
100
+ domains.list.each do |group,domains|
101
+ current.list[group] = [] if current.list[group].nil?
102
+ current.list[group].concat domains
103
+ current.list[group].sort! # Alphabetize
104
+ current.list[group].uniq! # Ensure uniqueness
105
+ end
106
+
107
+ logger.info "New: #{current.count} domains"
108
+
109
+ logger.info "Writing to disk..."
110
+ current.write
111
+ logger.info "Fin."
112
+ end
113
+
114
+ def resolver
115
+ @resolver ||= begin
116
+ resolver = Net::DNS::Resolver.new
117
+ resolver.nameservers = ["8.8.8.8","8.8.4.4"]
118
+ resolver
119
+ end
120
+ end
121
+
122
+ # Verifies that the given domain resolves (answers a default, NS, or MX query), and thus is valid
123
+ def domain_resolves?(domain)
124
+ resolver.search(domain).header.anCount > 0 ||
125
+ resolver.search(domain, Net::DNS::NS).header.anCount > 0 ||
126
+ resolver.search(domain, Net::DNS::MX).header.anCount > 0
127
+ end
128
+ end
129
+ end
130
+
131
+ class Gman < NaughtyOrNice
132
+ def self.import(hash)
133
+ Gman::Importer.new(hash).import
134
+ end
135
+ end
data/script/alphabetize CHANGED
@@ -5,38 +5,8 @@
5
5
  # usage: script/alphabetize
6
6
 
7
7
  require_relative "../lib/gman"
8
+ require_relative "../lib/gman/importer"
8
9
 
9
- # Read in existing list
10
- domains = File.open(Gman.list_path).read
11
- domains = domains.gsub /\r\n?/, "\n" #normalize line endings
12
- domains = domains.split("\n")
13
-
14
- # Split list into grouped hash
15
- group = ""
16
- domain_hash = {}
17
- domains.each do |line|
18
- next if line.empty?
19
- if match = /\/\/[\/\s]*(.*)$/i.match(line)
20
- group = match[1]
21
- else
22
- domain_hash[group] = [] if domain_hash[group].nil?
23
- domain_hash[group].push line.downcase
24
- end
25
- end
26
-
27
- # Sort by groups
28
- domain_hash = domain_hash.sort_by { |k,v| k.downcase }.to_h
29
-
30
- # Sort within groups
31
- domain_hash.each do |group, domains|
32
- domain_hash[group].sort!
33
- end
34
-
35
- output = ""
36
- domain_hash.each do |group, domains|
37
- output << "// #{group}\n"
38
- output << domains.join("\n")
39
- output << "\n\n"
40
- end
41
-
42
- File.write Gman.list_path, output.strip
10
+ current = Gman::DomainList.current
11
+ current.alphabetize
12
+ current.write
data/script/cibuild CHANGED
@@ -4,4 +4,3 @@ set -e
4
4
 
5
5
  bundle exec rake test
6
6
  bundle exec script/dedupe
7
- bundle exec script/state-domains
data/script/dedupe CHANGED
@@ -3,36 +3,19 @@
3
3
  require 'yaml'
4
4
  require 'open-uri'
5
5
  require './lib/gman'
6
- require './lib/gman/parser'
6
+ require './lib/gman/importer'
7
7
 
8
-
9
- current = Gman::Parser.file_to_array( Gman::list_path )
10
- domain_hash = Gman::Parser.array_to_hash(current)
11
- domain_list = domain_hash.flat_map { |k,v| v }
8
+ current = Gman::DomainList.current
12
9
 
13
10
  puts "Checking for duplicate domains in the domain list..."
14
- puts "Current list contains #{domain_list.count} domains..."
15
-
16
- SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
17
- source_hash = YAML.load(open(SOURCE).read)
18
- source_list = source_hash.flat_map { |k,v| v }
19
-
20
- dupes = []
21
- domain_hash.each do |group,domains|
22
- domains.each do |domain|
23
- if domain_list.count(domain) > 1 && source_list.count(domain) <= 1
24
- dupes.push(domain)
25
- end
26
- end
27
- end
11
+ puts "Current list contains #{current.count} domains..."
28
12
 
29
- dupes.uniq!
13
+ dupe = current.count - current.domains.uniq.count
14
+ puts "Found #{dupe} duplicate domains"
15
+ exit 0 if dupe == 0
30
16
 
31
- puts "Found #{dupes.count} dupes!"
17
+ dupes = current.domains.select { |domain| current.domains.count(domain) > 1 }
32
18
 
33
- if dupes.count > 0
34
- puts dupes.inspect
35
- exit 1
36
- else
37
- exit 0
38
- end
19
+ puts "Duplicate domains:"
20
+ puts dupes
21
+ exit 1
data/script/prune CHANGED
@@ -2,16 +2,19 @@
2
2
  # Given an array of domains, removes them from the list
3
3
  # Example usage: script/prune foo.invalid, bar.invalid, foo.bar.invalid
4
4
 
5
+ require_relative "../lib/gman"
6
+ require_relative "../lib/gman/domain_list"
7
+
5
8
  domains = ARGV
6
9
  domains = domains.clone.map { |d| d.gsub ",", "" }
7
10
 
8
11
  list = File.open("./config/domains.txt").read
9
- puts "Starting list: #{list.size} lines"
12
+ puts "Starting list: #{Gman::DomainList.current.count} domains"
10
13
 
11
14
  domains.each do |domain|
12
15
  list.gsub! /^#{domain}$\n/, ""
13
16
  end
14
17
 
15
- puts "Ending list: #{list.size} lines"
18
+ puts "Ending list: #{Gman::DomainList.current.count} domains"
16
19
 
17
20
  File.write "./config/domains.txt", list
@@ -0,0 +1,14 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'csv'
4
+ require 'open-uri'
5
+ require './lib/gman'
6
+ require './lib/gman/importer'
7
+
8
+ url = "https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.csv"
9
+
10
+ domains = open(url).read.force_encoding("iso-8859-1").encode("UTF-8")
11
+ domains = CSV.parse(domains, :headers => true)
12
+ domains = domains.map { |row| row["Domain Name"] }
13
+
14
+ Gman.import("German Federal" => domains)
@@ -0,0 +1,23 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'csv'
4
+ require 'open-uri'
5
+ require './lib/gman'
6
+ require './lib/gman/importer'
7
+
8
+ url = "http://www.mik.nrw.de/nc/themen-aufgaben/kommunales/kommunale-adressen.html?tx_szkommunaldb_pi1%5Bexport%5D=csv"
9
+
10
+ csv = open(url).read.force_encoding("iso-8859-1").encode("UTF-8")
11
+
12
+ # For some reason, the header row is actually the last row
13
+ # Pop the last line off the file and prepend it at the beginning
14
+ # So that when we pass it to CSV it detects the headers properly
15
+ lines = csv.split("\n")
16
+ lines.unshift lines.pop
17
+ csv = lines.join("\n")
18
+
19
+ # Load municipal domains
20
+ data = CSV.parse(csv, :headers => true, :col_sep => ";")
21
+ domains = data.map { |row| row["Internet"] }
22
+
23
+ Gman.import("German Municipalities" => domains)
data/script/vendor-nl CHANGED
@@ -1,5 +1,10 @@
1
- #!/bin/sh
1
+ #! /usr/bin/env ruby
2
+ # See https://github.com/github/government.github.com/pull/367#issuecomment-102108763
2
3
 
3
- # https://github.com/github/government.github.com/pull/367#issuecomment-102108763
4
- wget -r -np https://almanak.overheid.nl/
5
- grep @ -rI almanak.overheid.nl/|cut -f 2 -d @|cut -f 1 -d '"'|grep \\.nl$|sort|uniq
4
+ require 'fileutils'
5
+
6
+ FileUtils.rm_rf("almanak.overheid.nl")
7
+ domains = `wget -r -np https://almanak.overheid.nl/
8
+ grep @ -rI almanak.overheid.nl/|cut -f 2 -d @|cut -f 1 -d '"'|grep \\.nl$|sort|uniq`
9
+
10
+ Gman.import("Netherlands" => domains.split("\n"))
@@ -3,14 +3,13 @@
3
3
 
4
4
  require "public_suffix"
5
5
  require "yaml"
6
+ require_relative "../lib/gman"
7
+ require_relative "../lib/gman/importer"
6
8
 
7
9
  # https://gist.github.com/benbalter/6147066
8
10
  REGEX = /(\.g[ou]{1,2}(v|b|vt)|\.mil|\.gc|\.fed)(\.[a-z]{2})?$/i
9
11
 
10
- YAML_FILE = File.dirname(__FILE__) + "/../lib/domains.yml"
11
- domains = YAML.load_file YAML_FILE
12
- domains = [] unless domains
13
-
12
+ domains = []
14
13
  PublicSuffix::List.default.each do |rule|
15
14
  domain = nil
16
15
 
@@ -23,5 +22,4 @@ PublicSuffix::List.default.each do |rule|
23
22
  domains.push domain unless domain.nil? or domains.include? domain
24
23
  end
25
24
 
26
- domains = domains.sort
27
- File.open(YAML_FILE, 'w+') {|f| f.write(domains.to_yaml)}
25
+ Gman.import("non-us gov" => domains)
data/script/vendor-se CHANGED
@@ -2,52 +2,18 @@
2
2
 
3
3
  require 'mechanize'
4
4
  require 'csv'
5
- require 'swot'
6
5
  require './lib/gman'
7
- require './lib/gman/parser'
6
+ require './lib/gman/importer'
8
7
 
9
8
  url = "http://www.myndighetsregistret.scb.se/Myndighet.aspx"
10
9
  agent = Mechanize.new
11
10
  page = agent.get(url)
12
- form = page.form("form1")
11
+ form = page.forms.first
13
12
  form.radiobuttons.find { |r| r.value = "Textfil" }.check
14
13
  submit_button = form.buttons.find { |b| b.type == "submit" }
15
14
  response = agent.submit(form, submit_button)
16
15
 
17
- domains = []
18
16
  rows = CSV.parse(response.content, :headers => true, :col_sep => "\t")
17
+ domains = rows.map { |row| row["Webbadress"] unless row["Namn"] =~ /UNIVERSITET/}
19
18
 
20
- puts "Starting with #{rows.count} domains..."
21
-
22
- rows.each do |row|
23
- next if Swot.valid?(row["Webbadress"]) # Filter out Swot'd domains
24
- next if row["Namn"] =~ /UNIVERSITET/ # Filter out domains that are clearly edu
25
- domains.push Gman.new(row["Webbadress"]).domain.to_s.gsub(/^www\./,"")
26
- end
27
-
28
- domains.reject! { |domain| domain.empty? }
29
- domains.compact!
30
- domains.uniq!
31
- domains.select! { |domain| PublicSuffix.valid?(".#{domain}") }
32
-
33
- puts "Ended up with #{domains.count} domains."
34
-
35
- current = Gman::Parser.file_to_array( Gman::list_path )
36
- current_hash = Gman::Parser.array_to_hash(current)
37
-
38
- current_hash["Swedish Administrative Authorities"] = domains
39
- current_hash = current_hash.sort_by { |group, domains| group.downcase }
40
-
41
- # PublicSuffix Formatted Output
42
- current_group = ""
43
- output = ""
44
- current_hash.each do |group, domains|
45
- if group != current_group
46
- output << "\n\n" unless current_group.empty? # first entry
47
- output << "// #{group}\n"
48
- current_group = group
49
- end
50
- output << domains.join("\n")
51
- end
52
-
53
- File.open(Gman.list_path, "w") { |file| file.write output }
19
+ Gman.import("Swedish Administrative Authorities" => domains)