gman 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +6 -6
- data/README.md +0 -10
- data/Rakefile +9 -1
- data/gman.gemspec +1 -2
- data/lib/domains.txt +10031 -9909
- data/lib/gman.rb +56 -18
- data/test/helper.rb +21 -20
- data/test/test_gman.rb +83 -122
- metadata +2 -18
data/lib/gman.rb
CHANGED
@@ -2,10 +2,48 @@ require 'public_suffix'
|
|
2
2
|
require 'yaml'
|
3
3
|
require 'swot'
|
4
4
|
require "addressable/uri"
|
5
|
-
require "email_veracity"
|
6
5
|
|
7
6
|
module Gman
|
8
7
|
|
8
|
+
# Source: http://bit.ly/1n2X9iv
|
9
|
+
EMAIL_REGEX = %r{
|
10
|
+
^
|
11
|
+
(
|
12
|
+
[\w\!\#\$\%\&\'\*\+\-\/\=\?\^\`\{\|\}\~]+
|
13
|
+
\.
|
14
|
+
)
|
15
|
+
*
|
16
|
+
[\w\!\#\$\%\&\'\*\+\-\/\=\?\^\`\{\|\}\~]+
|
17
|
+
@
|
18
|
+
(
|
19
|
+
(
|
20
|
+
(
|
21
|
+
(
|
22
|
+
(
|
23
|
+
[a-z0-9]{1}
|
24
|
+
[a-z0-9\-]{0,62}
|
25
|
+
[a-z0-9]{1}
|
26
|
+
)
|
27
|
+
|
|
28
|
+
[a-z]
|
29
|
+
)
|
30
|
+
\.
|
31
|
+
)+
|
32
|
+
[a-z]{2,6}
|
33
|
+
)
|
34
|
+
|
|
35
|
+
(
|
36
|
+
\d{1,3}
|
37
|
+
\.
|
38
|
+
){3}
|
39
|
+
\d{1,3}
|
40
|
+
(
|
41
|
+
\:\d{1,5}
|
42
|
+
)?
|
43
|
+
)
|
44
|
+
$
|
45
|
+
}xi
|
46
|
+
|
9
47
|
class << self
|
10
48
|
|
11
49
|
# Normalizes and checks if a given string represents a government domain
|
@@ -16,20 +54,12 @@ module Gman
|
|
16
54
|
# "foo.gov.uk"
|
17
55
|
# "http://foo.bar.gov"
|
18
56
|
#
|
19
|
-
# check_mx - if an email is passed, check the domain for an mx record
|
20
|
-
#
|
21
57
|
# Returns boolean true if a government domain
|
22
|
-
def valid?(text
|
58
|
+
def valid?(text)
|
23
59
|
|
24
60
|
domain = get_domain text
|
25
61
|
return false unless PublicSuffix.valid?(domain)
|
26
62
|
|
27
|
-
# validate mx record
|
28
|
-
if check_mx && email?(text)
|
29
|
-
EmailVeracity::Config[:skip_lookup] = false
|
30
|
-
return false unless EmailVeracity::Address.new(text).valid?
|
31
|
-
end
|
32
|
-
|
33
63
|
# Ensure non-edu
|
34
64
|
return false if Swot::is_academic?(domain)
|
35
65
|
|
@@ -44,7 +74,7 @@ module Gman
|
|
44
74
|
# returns an instance of our custom public suffix list
|
45
75
|
# list behaves like PublicSuffix::List but is limited to our whitelisted domains
|
46
76
|
def list
|
47
|
-
@list ||= PublicSuffix::List::parse(
|
77
|
+
@list ||= PublicSuffix::List::parse(File.new(list_path, "r:utf-8"))
|
48
78
|
end
|
49
79
|
|
50
80
|
# Get the FQDN name from a URL or email address.
|
@@ -60,12 +90,16 @@ module Gman
|
|
60
90
|
if uri.host # valid https?://* URI
|
61
91
|
uri.host
|
62
92
|
elsif email?(text)
|
63
|
-
|
93
|
+
text.match(/@([\w\.\-]+)\Z/i)[1]
|
64
94
|
else # url sans http://
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
95
|
+
begin
|
96
|
+
uri = Addressable::URI.parse("http://#{text}")
|
97
|
+
# properly parse http://foo edge cases
|
98
|
+
# see https://github.com/sporkmonger/addressable/issues/145
|
99
|
+
uri.host if uri.host =~ /\./
|
100
|
+
rescue Addressable::URI::InvalidURIError
|
101
|
+
nil
|
102
|
+
end
|
69
103
|
end
|
70
104
|
end
|
71
105
|
|
@@ -86,8 +120,12 @@ module Gman
|
|
86
120
|
#
|
87
121
|
# Returns true if email, otherwise false
|
88
122
|
def email?(text)
|
89
|
-
|
90
|
-
|
123
|
+
text =~ EMAIL_REGEX
|
124
|
+
end
|
125
|
+
|
126
|
+
# Returns the absolute path to the domain list
|
127
|
+
def list_path
|
128
|
+
@list_path ||= File.join(File.dirname(__FILE__), "domains.txt")
|
91
129
|
end
|
92
130
|
end
|
93
131
|
end
|
data/test/helper.rb
CHANGED
@@ -1,20 +1,21 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'bundler'
|
3
|
-
begin
|
4
|
-
Bundler.setup(:default, :development)
|
5
|
-
rescue Bundler::BundlerError => e
|
6
|
-
$stderr.puts e.message
|
7
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
-
exit e.status_code
|
9
|
-
end
|
10
|
-
require 'test/unit'
|
11
|
-
require 'shoulda'
|
12
|
-
|
13
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
-
require 'gman'
|
16
|
-
require 'net/dns'
|
17
|
-
require 'net/dns/resolver'
|
18
|
-
|
19
|
-
|
20
|
-
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'gman'
|
16
|
+
require 'net/dns'
|
17
|
+
require 'net/dns/resolver'
|
18
|
+
require './lib/gman/parser'
|
19
|
+
|
20
|
+
class Test::Unit::TestCase
|
21
|
+
end
|
data/test/test_gman.rb
CHANGED
@@ -1,122 +1,83 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
VALID = [ "foo.gov",
|
4
|
-
"http://foo.mil",
|
5
|
-
"foo@bar.gc.ca",
|
6
|
-
"foo.gov.au",
|
7
|
-
"https://www.foo.gouv.fr",
|
8
|
-
"foo@ci.champaign.il.us",
|
9
|
-
"foo.bar.baz.gov.au",
|
10
|
-
"foo@bar.gov.uk",
|
11
|
-
".gov",
|
12
|
-
"foo.fed.us",
|
13
|
-
]
|
14
|
-
|
15
|
-
INVALID = [ "foo.bar.com",
|
16
|
-
"bar@foo.biz",
|
17
|
-
"http://www.foo.biz",
|
18
|
-
"foo.uk",
|
19
|
-
"gov",
|
20
|
-
"foo@k12.champaign.il.us",
|
21
|
-
"foo@kii.gov.by",
|
22
|
-
"foo",
|
23
|
-
"",
|
24
|
-
nil,
|
25
|
-
" ",
|
26
|
-
]
|
27
|
-
|
28
|
-
class TestGman < Test::Unit::TestCase
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
should "recognize government email addresses and domains" do
|
38
|
-
|
39
|
-
assert_equal
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
should "not
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
Gman.
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
assert_equal "
|
69
|
-
assert_equal "
|
70
|
-
assert_equal "
|
71
|
-
assert_equal
|
72
|
-
end
|
73
|
-
|
74
|
-
should "
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
end
|
79
|
-
|
80
|
-
should "
|
81
|
-
Gman.
|
82
|
-
|
83
|
-
|
84
|
-
end
|
85
|
-
|
86
|
-
should "pass any email on the list" do
|
87
|
-
Gman.list.each do |entry|
|
88
|
-
assert_equal true, Gman.valid?("foo@bar.#{entry.name}"), "foo@bar.#{entry.name} is not a valid"
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
should "pass any domain on the list" do
|
93
|
-
Gman.list.each do |entry|
|
94
|
-
assert_equal true, Gman.valid?("foo.#{entry.name}"), "foo.#{entry.name} is not a valid domain"
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
should "only contain resolvable domains" do
|
99
|
-
Gman.list.each do |entry|
|
100
|
-
assert_equal true, domain_resolves? domain
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
|
-
|
105
|
-
should "not err out on invalid domains" do
|
106
|
-
assert_equal false, Gman.valid?("foo@act.gov.au")
|
107
|
-
assert_equal "act.gov.au", Gman.get_domain("foo@act.gov.au")
|
108
|
-
assert_equal nil, Gman.domain_parts("foo@act.gov.au")
|
109
|
-
end
|
110
|
-
|
111
|
-
should "return public suffix domain" do
|
112
|
-
assert_equal PublicSuffix::Domain, Gman.domain_parts("whitehouse.gov").class
|
113
|
-
assert_equal NilClass, Gman.domain_parts("foo.bar").class
|
114
|
-
end
|
115
|
-
|
116
|
-
should "parse domain parts" do
|
117
|
-
assert_equal "gov", Gman.domain_parts("foo@bar.gov").tld
|
118
|
-
assert_equal "bar", Gman.domain_parts("foo.bar.gov").sld
|
119
|
-
assert_equal "bar", Gman.domain_parts("https://foo.bar.gov").sld
|
120
|
-
assert_equal "bar.gov", Gman.domain_parts("foo@bar.gov").domain
|
121
|
-
end
|
122
|
-
end
|
1
|
+
require File.join(File.dirname(__FILE__), 'helper')
|
2
|
+
|
3
|
+
VALID = [ "foo.gov",
|
4
|
+
"http://foo.mil",
|
5
|
+
"foo@bar.gc.ca",
|
6
|
+
"foo.gov.au",
|
7
|
+
"https://www.foo.gouv.fr",
|
8
|
+
"foo@ci.champaign.il.us",
|
9
|
+
"foo.bar.baz.gov.au",
|
10
|
+
"foo@bar.gov.uk",
|
11
|
+
".gov",
|
12
|
+
"foo.fed.us",
|
13
|
+
]
|
14
|
+
|
15
|
+
INVALID = [ "foo.bar.com",
|
16
|
+
"bar@foo.biz",
|
17
|
+
"http://www.foo.biz",
|
18
|
+
"foo.uk",
|
19
|
+
"gov",
|
20
|
+
"foo@k12.champaign.il.us",
|
21
|
+
"foo@kii.gov.by",
|
22
|
+
"foo",
|
23
|
+
"",
|
24
|
+
nil,
|
25
|
+
" ",
|
26
|
+
]
|
27
|
+
|
28
|
+
class TestGman < Test::Unit::TestCase
|
29
|
+
|
30
|
+
|
31
|
+
should "recognize government email addresses and domains" do
|
32
|
+
VALID.each do |test|
|
33
|
+
assert_equal true, Gman::valid?(test), "#{test} should be detected as a valid government domain"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
should "not recognize non-government email addresses and domains" do
|
38
|
+
INVALID.each do |test|
|
39
|
+
assert_equal false, Gman::valid?(test), "#{test} should be detected as an invalid government domain"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
should "not allow educational domains" do
|
44
|
+
assert_equal false, Gman::valid?("foo@gwu.edu")
|
45
|
+
end
|
46
|
+
|
47
|
+
should "properly parse domains from strings" do
|
48
|
+
assert_equal "github.gov", Gman::get_domain("foo@github.gov")
|
49
|
+
assert_equal "foo.github.gov", Gman::get_domain("foo.github.gov")
|
50
|
+
assert_equal "github.gov", Gman::get_domain("http://github.gov")
|
51
|
+
assert_equal "github.gov", Gman::get_domain("https://github.gov")
|
52
|
+
assert_equal ".gov", Gman::get_domain(".gov")
|
53
|
+
assert_equal nil, Gman.get_domain("foo")
|
54
|
+
end
|
55
|
+
|
56
|
+
should "not err out on invalid domains" do
|
57
|
+
assert_equal false, Gman.valid?("foo@act.gov.au")
|
58
|
+
assert_equal "act.gov.au", Gman.get_domain("foo@act.gov.au")
|
59
|
+
assert_equal nil, Gman.domain_parts("foo@act.gov.au")
|
60
|
+
end
|
61
|
+
|
62
|
+
should "return public suffix domain" do
|
63
|
+
assert_equal PublicSuffix::Domain, Gman.domain_parts("whitehouse.gov").class
|
64
|
+
assert_equal NilClass, Gman.domain_parts("foo.bar").class
|
65
|
+
end
|
66
|
+
|
67
|
+
should "parse domain parts" do
|
68
|
+
assert_equal "gov", Gman.domain_parts("foo@bar.gov").tld
|
69
|
+
assert_equal "bar", Gman.domain_parts("foo.bar.gov").sld
|
70
|
+
assert_equal "bar", Gman.domain_parts("https://foo.bar.gov").sld
|
71
|
+
assert_equal "bar.gov", Gman.domain_parts("foo@bar.gov").domain
|
72
|
+
end
|
73
|
+
|
74
|
+
should "not err out on invalid hosts" do
|
75
|
+
assert_nothing_raised do
|
76
|
+
assert_equal nil, Gman.get_domain("</@foo.com")
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
should "returns the path to domains.txt" do
|
81
|
+
assert_equal true, File.exists?(Gman.list_path)
|
82
|
+
end
|
83
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gman
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2014-01-31 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: public_suffix
|
@@ -43,22 +43,6 @@ dependencies:
|
|
43
43
|
- - ! '>='
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: '0'
|
46
|
-
- !ruby/object:Gem::Dependency
|
47
|
-
name: email_veracity
|
48
|
-
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
|
-
requirements:
|
51
|
-
- - ! '>='
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: '0'
|
54
|
-
type: :runtime
|
55
|
-
prerelease: false
|
56
|
-
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
|
-
requirements:
|
59
|
-
- - ! '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
46
|
- !ruby/object:Gem::Dependency
|
63
47
|
name: addressable
|
64
48
|
requirement: !ruby/object:Gem::Requirement
|