gman 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +6 -6
- data/README.md +0 -10
- data/Rakefile +9 -1
- data/gman.gemspec +1 -2
- data/lib/domains.txt +10031 -9909
- data/lib/gman.rb +56 -18
- data/test/helper.rb +21 -20
- data/test/test_gman.rb +83 -122
- metadata +2 -18
data/lib/gman.rb
CHANGED
@@ -2,10 +2,48 @@ require 'public_suffix'
|
|
2
2
|
require 'yaml'
|
3
3
|
require 'swot'
|
4
4
|
require "addressable/uri"
|
5
|
-
require "email_veracity"
|
6
5
|
|
7
6
|
module Gman
|
8
7
|
|
8
|
+
# Source: http://bit.ly/1n2X9iv
|
9
|
+
EMAIL_REGEX = %r{
|
10
|
+
^
|
11
|
+
(
|
12
|
+
[\w\!\#\$\%\&\'\*\+\-\/\=\?\^\`\{\|\}\~]+
|
13
|
+
\.
|
14
|
+
)
|
15
|
+
*
|
16
|
+
[\w\!\#\$\%\&\'\*\+\-\/\=\?\^\`\{\|\}\~]+
|
17
|
+
@
|
18
|
+
(
|
19
|
+
(
|
20
|
+
(
|
21
|
+
(
|
22
|
+
(
|
23
|
+
[a-z0-9]{1}
|
24
|
+
[a-z0-9\-]{0,62}
|
25
|
+
[a-z0-9]{1}
|
26
|
+
)
|
27
|
+
|
|
28
|
+
[a-z]
|
29
|
+
)
|
30
|
+
\.
|
31
|
+
)+
|
32
|
+
[a-z]{2,6}
|
33
|
+
)
|
34
|
+
|
|
35
|
+
(
|
36
|
+
\d{1,3}
|
37
|
+
\.
|
38
|
+
){3}
|
39
|
+
\d{1,3}
|
40
|
+
(
|
41
|
+
\:\d{1,5}
|
42
|
+
)?
|
43
|
+
)
|
44
|
+
$
|
45
|
+
}xi
|
46
|
+
|
9
47
|
class << self
|
10
48
|
|
11
49
|
# Normalizes and checks if a given string represents a government domain
|
@@ -16,20 +54,12 @@ module Gman
|
|
16
54
|
# "foo.gov.uk"
|
17
55
|
# "http://foo.bar.gov"
|
18
56
|
#
|
19
|
-
# check_mx - if an email is passed, check the domain for an mx record
|
20
|
-
#
|
21
57
|
# Returns boolean true if a government domain
|
22
|
-
def valid?(text
|
58
|
+
def valid?(text)
|
23
59
|
|
24
60
|
domain = get_domain text
|
25
61
|
return false unless PublicSuffix.valid?(domain)
|
26
62
|
|
27
|
-
# validate mx record
|
28
|
-
if check_mx && email?(text)
|
29
|
-
EmailVeracity::Config[:skip_lookup] = false
|
30
|
-
return false unless EmailVeracity::Address.new(text).valid?
|
31
|
-
end
|
32
|
-
|
33
63
|
# Ensure non-edu
|
34
64
|
return false if Swot::is_academic?(domain)
|
35
65
|
|
@@ -44,7 +74,7 @@ module Gman
|
|
44
74
|
# returns an instance of our custom public suffix list
|
45
75
|
# list behaves like PublicSuffix::List but is limited to our whitelisted domains
|
46
76
|
def list
|
47
|
-
@list ||= PublicSuffix::List::parse(
|
77
|
+
@list ||= PublicSuffix::List::parse(File.new(list_path, "r:utf-8"))
|
48
78
|
end
|
49
79
|
|
50
80
|
# Get the FQDN name from a URL or email address.
|
@@ -60,12 +90,16 @@ module Gman
|
|
60
90
|
if uri.host # valid https?://* URI
|
61
91
|
uri.host
|
62
92
|
elsif email?(text)
|
63
|
-
|
93
|
+
text.match(/@([\w\.\-]+)\Z/i)[1]
|
64
94
|
else # url sans http://
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
95
|
+
begin
|
96
|
+
uri = Addressable::URI.parse("http://#{text}")
|
97
|
+
# properly parse http://foo edge cases
|
98
|
+
# see https://github.com/sporkmonger/addressable/issues/145
|
99
|
+
uri.host if uri.host =~ /\./
|
100
|
+
rescue Addressable::URI::InvalidURIError
|
101
|
+
nil
|
102
|
+
end
|
69
103
|
end
|
70
104
|
end
|
71
105
|
|
@@ -86,8 +120,12 @@ module Gman
|
|
86
120
|
#
|
87
121
|
# Returns true if email, otherwise false
|
88
122
|
def email?(text)
|
89
|
-
|
90
|
-
|
123
|
+
text =~ EMAIL_REGEX
|
124
|
+
end
|
125
|
+
|
126
|
+
# Returns the absolute path to the domain list
|
127
|
+
def list_path
|
128
|
+
@list_path ||= File.join(File.dirname(__FILE__), "domains.txt")
|
91
129
|
end
|
92
130
|
end
|
93
131
|
end
|
data/test/helper.rb
CHANGED
@@ -1,20 +1,21 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'bundler'
|
3
|
-
begin
|
4
|
-
Bundler.setup(:default, :development)
|
5
|
-
rescue Bundler::BundlerError => e
|
6
|
-
$stderr.puts e.message
|
7
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
-
exit e.status_code
|
9
|
-
end
|
10
|
-
require 'test/unit'
|
11
|
-
require 'shoulda'
|
12
|
-
|
13
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
-
require 'gman'
|
16
|
-
require 'net/dns'
|
17
|
-
require 'net/dns/resolver'
|
18
|
-
|
19
|
-
|
20
|
-
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'gman'
|
16
|
+
require 'net/dns'
|
17
|
+
require 'net/dns/resolver'
|
18
|
+
require './lib/gman/parser'
|
19
|
+
|
20
|
+
class Test::Unit::TestCase
|
21
|
+
end
|
data/test/test_gman.rb
CHANGED
@@ -1,122 +1,83 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
VALID = [ "foo.gov",
|
4
|
-
"http://foo.mil",
|
5
|
-
"foo@bar.gc.ca",
|
6
|
-
"foo.gov.au",
|
7
|
-
"https://www.foo.gouv.fr",
|
8
|
-
"foo@ci.champaign.il.us",
|
9
|
-
"foo.bar.baz.gov.au",
|
10
|
-
"foo@bar.gov.uk",
|
11
|
-
".gov",
|
12
|
-
"foo.fed.us",
|
13
|
-
]
|
14
|
-
|
15
|
-
INVALID = [ "foo.bar.com",
|
16
|
-
"bar@foo.biz",
|
17
|
-
"http://www.foo.biz",
|
18
|
-
"foo.uk",
|
19
|
-
"gov",
|
20
|
-
"foo@k12.champaign.il.us",
|
21
|
-
"foo@kii.gov.by",
|
22
|
-
"foo",
|
23
|
-
"",
|
24
|
-
nil,
|
25
|
-
" ",
|
26
|
-
]
|
27
|
-
|
28
|
-
class TestGman < Test::Unit::TestCase
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
should "recognize government email addresses and domains" do
|
38
|
-
|
39
|
-
assert_equal
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
should "not
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
Gman.
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
assert_equal "
|
69
|
-
assert_equal "
|
70
|
-
assert_equal "
|
71
|
-
assert_equal
|
72
|
-
end
|
73
|
-
|
74
|
-
should "
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
end
|
79
|
-
|
80
|
-
should "
|
81
|
-
Gman.
|
82
|
-
|
83
|
-
|
84
|
-
end
|
85
|
-
|
86
|
-
should "pass any email on the list" do
|
87
|
-
Gman.list.each do |entry|
|
88
|
-
assert_equal true, Gman.valid?("foo@bar.#{entry.name}"), "foo@bar.#{entry.name} is not a valid"
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
should "pass any domain on the list" do
|
93
|
-
Gman.list.each do |entry|
|
94
|
-
assert_equal true, Gman.valid?("foo.#{entry.name}"), "foo.#{entry.name} is not a valid domain"
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
should "only contain resolvable domains" do
|
99
|
-
Gman.list.each do |entry|
|
100
|
-
assert_equal true, domain_resolves? domain
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
|
-
|
105
|
-
should "not err out on invalid domains" do
|
106
|
-
assert_equal false, Gman.valid?("foo@act.gov.au")
|
107
|
-
assert_equal "act.gov.au", Gman.get_domain("foo@act.gov.au")
|
108
|
-
assert_equal nil, Gman.domain_parts("foo@act.gov.au")
|
109
|
-
end
|
110
|
-
|
111
|
-
should "return public suffix domain" do
|
112
|
-
assert_equal PublicSuffix::Domain, Gman.domain_parts("whitehouse.gov").class
|
113
|
-
assert_equal NilClass, Gman.domain_parts("foo.bar").class
|
114
|
-
end
|
115
|
-
|
116
|
-
should "parse domain parts" do
|
117
|
-
assert_equal "gov", Gman.domain_parts("foo@bar.gov").tld
|
118
|
-
assert_equal "bar", Gman.domain_parts("foo.bar.gov").sld
|
119
|
-
assert_equal "bar", Gman.domain_parts("https://foo.bar.gov").sld
|
120
|
-
assert_equal "bar.gov", Gman.domain_parts("foo@bar.gov").domain
|
121
|
-
end
|
122
|
-
end
|
1
|
+
require File.join(File.dirname(__FILE__), 'helper')
|
2
|
+
|
3
|
+
VALID = [ "foo.gov",
|
4
|
+
"http://foo.mil",
|
5
|
+
"foo@bar.gc.ca",
|
6
|
+
"foo.gov.au",
|
7
|
+
"https://www.foo.gouv.fr",
|
8
|
+
"foo@ci.champaign.il.us",
|
9
|
+
"foo.bar.baz.gov.au",
|
10
|
+
"foo@bar.gov.uk",
|
11
|
+
".gov",
|
12
|
+
"foo.fed.us",
|
13
|
+
]
|
14
|
+
|
15
|
+
INVALID = [ "foo.bar.com",
|
16
|
+
"bar@foo.biz",
|
17
|
+
"http://www.foo.biz",
|
18
|
+
"foo.uk",
|
19
|
+
"gov",
|
20
|
+
"foo@k12.champaign.il.us",
|
21
|
+
"foo@kii.gov.by",
|
22
|
+
"foo",
|
23
|
+
"",
|
24
|
+
nil,
|
25
|
+
" ",
|
26
|
+
]
|
27
|
+
|
28
|
+
class TestGman < Test::Unit::TestCase
|
29
|
+
|
30
|
+
|
31
|
+
should "recognize government email addresses and domains" do
|
32
|
+
VALID.each do |test|
|
33
|
+
assert_equal true, Gman::valid?(test), "#{test} should be detected as a valid government domain"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
should "not recognize non-government email addresses and domains" do
|
38
|
+
INVALID.each do |test|
|
39
|
+
assert_equal false, Gman::valid?(test), "#{test} should be detected as an invalid government domain"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
should "not allow educational domains" do
|
44
|
+
assert_equal false, Gman::valid?("foo@gwu.edu")
|
45
|
+
end
|
46
|
+
|
47
|
+
should "properly parse domains from strings" do
|
48
|
+
assert_equal "github.gov", Gman::get_domain("foo@github.gov")
|
49
|
+
assert_equal "foo.github.gov", Gman::get_domain("foo.github.gov")
|
50
|
+
assert_equal "github.gov", Gman::get_domain("http://github.gov")
|
51
|
+
assert_equal "github.gov", Gman::get_domain("https://github.gov")
|
52
|
+
assert_equal ".gov", Gman::get_domain(".gov")
|
53
|
+
assert_equal nil, Gman.get_domain("foo")
|
54
|
+
end
|
55
|
+
|
56
|
+
should "not err out on invalid domains" do
|
57
|
+
assert_equal false, Gman.valid?("foo@act.gov.au")
|
58
|
+
assert_equal "act.gov.au", Gman.get_domain("foo@act.gov.au")
|
59
|
+
assert_equal nil, Gman.domain_parts("foo@act.gov.au")
|
60
|
+
end
|
61
|
+
|
62
|
+
should "return public suffix domain" do
|
63
|
+
assert_equal PublicSuffix::Domain, Gman.domain_parts("whitehouse.gov").class
|
64
|
+
assert_equal NilClass, Gman.domain_parts("foo.bar").class
|
65
|
+
end
|
66
|
+
|
67
|
+
should "parse domain parts" do
|
68
|
+
assert_equal "gov", Gman.domain_parts("foo@bar.gov").tld
|
69
|
+
assert_equal "bar", Gman.domain_parts("foo.bar.gov").sld
|
70
|
+
assert_equal "bar", Gman.domain_parts("https://foo.bar.gov").sld
|
71
|
+
assert_equal "bar.gov", Gman.domain_parts("foo@bar.gov").domain
|
72
|
+
end
|
73
|
+
|
74
|
+
should "not err out on invalid hosts" do
|
75
|
+
assert_nothing_raised do
|
76
|
+
assert_equal nil, Gman.get_domain("</@foo.com")
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
should "returns the path to domains.txt" do
|
81
|
+
assert_equal true, File.exists?(Gman.list_path)
|
82
|
+
end
|
83
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gman
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2014-01-31 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: public_suffix
|
@@ -43,22 +43,6 @@ dependencies:
|
|
43
43
|
- - ! '>='
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: '0'
|
46
|
-
- !ruby/object:Gem::Dependency
|
47
|
-
name: email_veracity
|
48
|
-
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
|
-
requirements:
|
51
|
-
- - ! '>='
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: '0'
|
54
|
-
type: :runtime
|
55
|
-
prerelease: false
|
56
|
-
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
|
-
requirements:
|
59
|
-
- - ! '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
46
|
- !ruby/object:Gem::Dependency
|
63
47
|
name: addressable
|
64
48
|
requirement: !ruby/object:Gem::Requirement
|