gman 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +6 -6
- data/README.md +0 -10
- data/Rakefile +9 -1
- data/gman.gemspec +1 -2
- data/lib/domains.txt +10031 -9909
- data/lib/gman.rb +56 -18
- data/test/helper.rb +21 -20
- data/test/test_gman.rb +83 -122
- metadata +2 -18
    
        data/lib/gman.rb
    CHANGED
    
    | @@ -2,10 +2,48 @@ require 'public_suffix' | |
| 2 2 | 
             
            require 'yaml'
         | 
| 3 3 | 
             
            require 'swot'
         | 
| 4 4 | 
             
            require "addressable/uri"
         | 
| 5 | 
            -
            require "email_veracity"
         | 
| 6 5 |  | 
| 7 6 | 
             
            module Gman
         | 
| 8 7 |  | 
| 8 | 
            +
              # Source: http://bit.ly/1n2X9iv
         | 
| 9 | 
            +
              EMAIL_REGEX = %r{
         | 
| 10 | 
            +
                    ^
         | 
| 11 | 
            +
                    (
         | 
| 12 | 
            +
                      [\w\!\#\$\%\&\'\*\+\-\/\=\?\^\`\{\|\}\~]+
         | 
| 13 | 
            +
                      \.
         | 
| 14 | 
            +
                    )
         | 
| 15 | 
            +
                    *
         | 
| 16 | 
            +
                    [\w\!\#\$\%\&\'\*\+\-\/\=\?\^\`\{\|\}\~]+
         | 
| 17 | 
            +
                    @
         | 
| 18 | 
            +
                    (
         | 
| 19 | 
            +
                      (
         | 
| 20 | 
            +
                        (
         | 
| 21 | 
            +
                          (
         | 
| 22 | 
            +
                            (
         | 
| 23 | 
            +
                              [a-z0-9]{1}
         | 
| 24 | 
            +
                              [a-z0-9\-]{0,62}
         | 
| 25 | 
            +
                              [a-z0-9]{1}
         | 
| 26 | 
            +
                            )
         | 
| 27 | 
            +
                            |
         | 
| 28 | 
            +
                            [a-z]
         | 
| 29 | 
            +
                          )
         | 
| 30 | 
            +
                          \.
         | 
| 31 | 
            +
                        )+
         | 
| 32 | 
            +
                        [a-z]{2,6}
         | 
| 33 | 
            +
                      )
         | 
| 34 | 
            +
                      |
         | 
| 35 | 
            +
                      (
         | 
| 36 | 
            +
                        \d{1,3}
         | 
| 37 | 
            +
                        \.
         | 
| 38 | 
            +
                      ){3}
         | 
| 39 | 
            +
                      \d{1,3}
         | 
| 40 | 
            +
                      (
         | 
| 41 | 
            +
                        \:\d{1,5}
         | 
| 42 | 
            +
                      )?
         | 
| 43 | 
            +
                    )
         | 
| 44 | 
            +
                    $
         | 
| 45 | 
            +
                  }xi
         | 
| 46 | 
            +
             | 
| 9 47 | 
             
              class << self
         | 
| 10 48 |  | 
| 11 49 | 
             
                # Normalizes and checks if a given string represents a government domain
         | 
| @@ -16,20 +54,12 @@ module Gman | |
| 16 54 | 
             
                #   "foo.gov.uk"
         | 
| 17 55 | 
             
                #   "http://foo.bar.gov"
         | 
| 18 56 | 
             
                #
         | 
| 19 | 
            -
                # check_mx - if an email is passed, check the domain for an mx record
         | 
| 20 | 
            -
                #
         | 
| 21 57 | 
             
                # Returns boolean true if a government domain
         | 
| 22 | 
            -
                def valid?(text | 
| 58 | 
            +
                def valid?(text)
         | 
| 23 59 |  | 
| 24 60 | 
             
                  domain = get_domain text
         | 
| 25 61 | 
             
                  return false unless PublicSuffix.valid?(domain)
         | 
| 26 62 |  | 
| 27 | 
            -
                  # validate mx record
         | 
| 28 | 
            -
                  if check_mx && email?(text)
         | 
| 29 | 
            -
                    EmailVeracity::Config[:skip_lookup] = false
         | 
| 30 | 
            -
                    return false unless EmailVeracity::Address.new(text).valid?
         | 
| 31 | 
            -
                  end
         | 
| 32 | 
            -
             | 
| 33 63 | 
             
                  # Ensure non-edu
         | 
| 34 64 | 
             
                  return false if Swot::is_academic?(domain)
         | 
| 35 65 |  | 
| @@ -44,7 +74,7 @@ module Gman | |
| 44 74 | 
             
                # returns an instance of our custom public suffix list
         | 
| 45 75 | 
             
                # list behaves like PublicSuffix::List but is limited to our whitelisted domains
         | 
| 46 76 | 
             
                def list
         | 
| 47 | 
            -
                  @list ||= PublicSuffix::List::parse( | 
| 77 | 
            +
                  @list ||= PublicSuffix::List::parse(File.new(list_path, "r:utf-8"))
         | 
| 48 78 | 
             
                end
         | 
| 49 79 |  | 
| 50 80 | 
             
                # Get the FQDN name from a URL or email address.
         | 
| @@ -60,12 +90,16 @@ module Gman | |
| 60 90 | 
             
                  if uri.host # valid https?://* URI
         | 
| 61 91 | 
             
                    uri.host
         | 
| 62 92 | 
             
                  elsif email?(text)
         | 
| 63 | 
            -
                     | 
| 93 | 
            +
                    text.match(/@([\w\.\-]+)\Z/i)[1]
         | 
| 64 94 | 
             
                  else # url sans http://
         | 
| 65 | 
            -
                     | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 95 | 
            +
                    begin
         | 
| 96 | 
            +
                      uri = Addressable::URI.parse("http://#{text}")
         | 
| 97 | 
            +
                      # properly parse http://foo edge cases
         | 
| 98 | 
            +
                      # see https://github.com/sporkmonger/addressable/issues/145
         | 
| 99 | 
            +
                      uri.host if uri.host =~ /\./
         | 
| 100 | 
            +
                    rescue Addressable::URI::InvalidURIError
         | 
| 101 | 
            +
                      nil
         | 
| 102 | 
            +
                    end
         | 
| 69 103 | 
             
                  end
         | 
| 70 104 | 
             
                end
         | 
| 71 105 |  | 
| @@ -86,8 +120,12 @@ module Gman | |
| 86 120 | 
             
                #
         | 
| 87 121 | 
             
                # Returns true if email, otherwise false
         | 
| 88 122 | 
             
                def email?(text)
         | 
| 89 | 
            -
                   | 
| 90 | 
            -
             | 
| 123 | 
            +
                  text =~ EMAIL_REGEX
         | 
| 124 | 
            +
                end
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                # Returns the absolute path to the domain list
         | 
| 127 | 
            +
                def list_path
         | 
| 128 | 
            +
                  @list_path ||= File.join(File.dirname(__FILE__), "domains.txt")
         | 
| 91 129 | 
             
                end
         | 
| 92 130 | 
             
              end
         | 
| 93 131 | 
             
            end
         | 
    
        data/test/helper.rb
    CHANGED
    
    | @@ -1,20 +1,21 @@ | |
| 1 | 
            -
            require 'rubygems'
         | 
| 2 | 
            -
            require 'bundler'
         | 
| 3 | 
            -
            begin
         | 
| 4 | 
            -
              Bundler.setup(:default, :development)
         | 
| 5 | 
            -
            rescue Bundler::BundlerError => e
         | 
| 6 | 
            -
              $stderr.puts e.message
         | 
| 7 | 
            -
              $stderr.puts "Run `bundle install` to install missing gems"
         | 
| 8 | 
            -
              exit e.status_code
         | 
| 9 | 
            -
            end
         | 
| 10 | 
            -
            require 'test/unit'
         | 
| 11 | 
            -
            require 'shoulda'
         | 
| 12 | 
            -
             | 
| 13 | 
            -
            $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
         | 
| 14 | 
            -
            $LOAD_PATH.unshift(File.dirname(__FILE__))
         | 
| 15 | 
            -
            require 'gman'
         | 
| 16 | 
            -
            require 'net/dns'
         | 
| 17 | 
            -
            require 'net/dns/resolver'
         | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 1 | 
            +
            require 'rubygems'
         | 
| 2 | 
            +
            require 'bundler'
         | 
| 3 | 
            +
            begin
         | 
| 4 | 
            +
              Bundler.setup(:default, :development)
         | 
| 5 | 
            +
            rescue Bundler::BundlerError => e
         | 
| 6 | 
            +
              $stderr.puts e.message
         | 
| 7 | 
            +
              $stderr.puts "Run `bundle install` to install missing gems"
         | 
| 8 | 
            +
              exit e.status_code
         | 
| 9 | 
            +
            end
         | 
| 10 | 
            +
            require 'test/unit'
         | 
| 11 | 
            +
            require 'shoulda'
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
         | 
| 14 | 
            +
            $LOAD_PATH.unshift(File.dirname(__FILE__))
         | 
| 15 | 
            +
            require 'gman'
         | 
| 16 | 
            +
            require 'net/dns'
         | 
| 17 | 
            +
            require 'net/dns/resolver'
         | 
| 18 | 
            +
            require './lib/gman/parser'
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            class Test::Unit::TestCase
         | 
| 21 | 
            +
            end
         | 
    
        data/test/test_gman.rb
    CHANGED
    
    | @@ -1,122 +1,83 @@ | |
| 1 | 
            -
            require 'helper'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            VALID = [  "foo.gov",
         | 
| 4 | 
            -
                        "http://foo.mil",
         | 
| 5 | 
            -
                        "foo@bar.gc.ca",
         | 
| 6 | 
            -
                        "foo.gov.au",
         | 
| 7 | 
            -
                        "https://www.foo.gouv.fr",
         | 
| 8 | 
            -
                        "foo@ci.champaign.il.us",
         | 
| 9 | 
            -
                        "foo.bar.baz.gov.au",
         | 
| 10 | 
            -
                        "foo@bar.gov.uk",
         | 
| 11 | 
            -
                        ".gov",
         | 
| 12 | 
            -
                        "foo.fed.us",
         | 
| 13 | 
            -
                    ]
         | 
| 14 | 
            -
             | 
| 15 | 
            -
            INVALID = [ "foo.bar.com",
         | 
| 16 | 
            -
                        "bar@foo.biz",
         | 
| 17 | 
            -
                        "http://www.foo.biz",
         | 
| 18 | 
            -
                        "foo.uk",
         | 
| 19 | 
            -
                        "gov",
         | 
| 20 | 
            -
                        "foo@k12.champaign.il.us",
         | 
| 21 | 
            -
                        "foo@kii.gov.by",
         | 
| 22 | 
            -
                        "foo",
         | 
| 23 | 
            -
                        "",
         | 
| 24 | 
            -
                        nil,
         | 
| 25 | 
            -
                        " ",
         | 
| 26 | 
            -
                      ]
         | 
| 27 | 
            -
             | 
| 28 | 
            -
            class TestGman < Test::Unit::TestCase
         | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
                 | 
| 33 | 
            -
             | 
| 34 | 
            -
                 | 
| 35 | 
            -
              end
         | 
| 36 | 
            -
             | 
| 37 | 
            -
              should "recognize government email addresses and domains" do
         | 
| 38 | 
            -
                 | 
| 39 | 
            -
                  assert_equal  | 
| 40 | 
            -
                end
         | 
| 41 | 
            -
              end
         | 
| 42 | 
            -
             | 
| 43 | 
            -
              should "not  | 
| 44 | 
            -
                 | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
               | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
                Gman. | 
| 51 | 
            -
             | 
| 52 | 
            -
                 | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
                 | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
               | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
                assert_equal " | 
| 69 | 
            -
                assert_equal " | 
| 70 | 
            -
                assert_equal " | 
| 71 | 
            -
                assert_equal  | 
| 72 | 
            -
              end
         | 
| 73 | 
            -
             | 
| 74 | 
            -
              should " | 
| 75 | 
            -
                 | 
| 76 | 
            -
             | 
| 77 | 
            -
                 | 
| 78 | 
            -
              end
         | 
| 79 | 
            -
             | 
| 80 | 
            -
              should " | 
| 81 | 
            -
                Gman. | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
              end
         | 
| 85 | 
            -
             | 
| 86 | 
            -
              should "pass any email on the list" do
         | 
| 87 | 
            -
                Gman.list.each do |entry|
         | 
| 88 | 
            -
                  assert_equal true, Gman.valid?("foo@bar.#{entry.name}"), "foo@bar.#{entry.name} is not a valid"
         | 
| 89 | 
            -
                end
         | 
| 90 | 
            -
              end
         | 
| 91 | 
            -
             | 
| 92 | 
            -
              should "pass any domain on the list" do
         | 
| 93 | 
            -
                Gman.list.each do |entry|
         | 
| 94 | 
            -
                  assert_equal true, Gman.valid?("foo.#{entry.name}"), "foo.#{entry.name} is not a valid domain"
         | 
| 95 | 
            -
                end
         | 
| 96 | 
            -
              end
         | 
| 97 | 
            -
             | 
| 98 | 
            -
              should "only contain resolvable domains" do
         | 
| 99 | 
            -
                Gman.list.each do |entry|
         | 
| 100 | 
            -
                  assert_equal true, domain_resolves? domain
         | 
| 101 | 
            -
                end
         | 
| 102 | 
            -
              end
         | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
              should "not err out on invalid domains" do
         | 
| 106 | 
            -
                assert_equal false, Gman.valid?("foo@act.gov.au")
         | 
| 107 | 
            -
                assert_equal "act.gov.au", Gman.get_domain("foo@act.gov.au")
         | 
| 108 | 
            -
                assert_equal nil, Gman.domain_parts("foo@act.gov.au")
         | 
| 109 | 
            -
              end
         | 
| 110 | 
            -
             | 
| 111 | 
            -
              should "return public suffix domain" do
         | 
| 112 | 
            -
                assert_equal PublicSuffix::Domain, Gman.domain_parts("whitehouse.gov").class
         | 
| 113 | 
            -
                assert_equal NilClass, Gman.domain_parts("foo.bar").class
         | 
| 114 | 
            -
              end
         | 
| 115 | 
            -
             | 
| 116 | 
            -
              should "parse domain parts" do
         | 
| 117 | 
            -
                assert_equal "gov", Gman.domain_parts("foo@bar.gov").tld
         | 
| 118 | 
            -
                assert_equal "bar", Gman.domain_parts("foo.bar.gov").sld
         | 
| 119 | 
            -
                assert_equal "bar", Gman.domain_parts("https://foo.bar.gov").sld
         | 
| 120 | 
            -
                assert_equal "bar.gov", Gman.domain_parts("foo@bar.gov").domain
         | 
| 121 | 
            -
              end
         | 
| 122 | 
            -
            end
         | 
| 1 | 
            +
            require File.join(File.dirname(__FILE__), 'helper')
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            VALID = [  "foo.gov",
         | 
| 4 | 
            +
                        "http://foo.mil",
         | 
| 5 | 
            +
                        "foo@bar.gc.ca",
         | 
| 6 | 
            +
                        "foo.gov.au",
         | 
| 7 | 
            +
                        "https://www.foo.gouv.fr",
         | 
| 8 | 
            +
                        "foo@ci.champaign.il.us",
         | 
| 9 | 
            +
                        "foo.bar.baz.gov.au",
         | 
| 10 | 
            +
                        "foo@bar.gov.uk",
         | 
| 11 | 
            +
                        ".gov",
         | 
| 12 | 
            +
                        "foo.fed.us",
         | 
| 13 | 
            +
                    ]
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            INVALID = [ "foo.bar.com",
         | 
| 16 | 
            +
                        "bar@foo.biz",
         | 
| 17 | 
            +
                        "http://www.foo.biz",
         | 
| 18 | 
            +
                        "foo.uk",
         | 
| 19 | 
            +
                        "gov",
         | 
| 20 | 
            +
                        "foo@k12.champaign.il.us",
         | 
| 21 | 
            +
                        "foo@kii.gov.by",
         | 
| 22 | 
            +
                        "foo",
         | 
| 23 | 
            +
                        "",
         | 
| 24 | 
            +
                        nil,
         | 
| 25 | 
            +
                        " ",
         | 
| 26 | 
            +
                      ]
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            class TestGman < Test::Unit::TestCase
         | 
| 29 | 
            +
             | 
| 30 | 
            +
             | 
| 31 | 
            +
              should "recognize government email addresses and domains" do
         | 
| 32 | 
            +
                VALID.each do |test|
         | 
| 33 | 
            +
                  assert_equal true, Gman::valid?(test), "#{test} should be detected as a valid government domain"
         | 
| 34 | 
            +
                end
         | 
| 35 | 
            +
              end
         | 
| 36 | 
            +
             | 
| 37 | 
            +
              should "not recognize non-government email addresses and domains" do
         | 
| 38 | 
            +
                INVALID.each do |test|
         | 
| 39 | 
            +
                  assert_equal false, Gman::valid?(test), "#{test} should be detected as an invalid government domain"
         | 
| 40 | 
            +
                end
         | 
| 41 | 
            +
              end
         | 
| 42 | 
            +
             | 
| 43 | 
            +
              should "not allow educational domains" do
         | 
| 44 | 
            +
                assert_equal false, Gman::valid?("foo@gwu.edu")
         | 
| 45 | 
            +
              end
         | 
| 46 | 
            +
             | 
| 47 | 
            +
              should "properly parse domains from strings" do
         | 
| 48 | 
            +
                assert_equal "github.gov", Gman::get_domain("foo@github.gov")
         | 
| 49 | 
            +
                assert_equal "foo.github.gov", Gman::get_domain("foo.github.gov")
         | 
| 50 | 
            +
                assert_equal "github.gov", Gman::get_domain("http://github.gov")
         | 
| 51 | 
            +
                assert_equal "github.gov", Gman::get_domain("https://github.gov")
         | 
| 52 | 
            +
                assert_equal ".gov", Gman::get_domain(".gov")
         | 
| 53 | 
            +
                assert_equal nil, Gman.get_domain("foo")
         | 
| 54 | 
            +
              end
         | 
| 55 | 
            +
             | 
| 56 | 
            +
              should "not err out on invalid domains" do
         | 
| 57 | 
            +
                assert_equal false, Gman.valid?("foo@act.gov.au")
         | 
| 58 | 
            +
                assert_equal "act.gov.au", Gman.get_domain("foo@act.gov.au")
         | 
| 59 | 
            +
                assert_equal nil, Gman.domain_parts("foo@act.gov.au")
         | 
| 60 | 
            +
              end
         | 
| 61 | 
            +
             | 
| 62 | 
            +
              should "return public suffix domain" do
         | 
| 63 | 
            +
                assert_equal PublicSuffix::Domain, Gman.domain_parts("whitehouse.gov").class
         | 
| 64 | 
            +
                assert_equal NilClass, Gman.domain_parts("foo.bar").class
         | 
| 65 | 
            +
              end
         | 
| 66 | 
            +
             | 
| 67 | 
            +
              should "parse domain parts" do
         | 
| 68 | 
            +
                assert_equal "gov", Gman.domain_parts("foo@bar.gov").tld
         | 
| 69 | 
            +
                assert_equal "bar", Gman.domain_parts("foo.bar.gov").sld
         | 
| 70 | 
            +
                assert_equal "bar", Gman.domain_parts("https://foo.bar.gov").sld
         | 
| 71 | 
            +
                assert_equal "bar.gov", Gman.domain_parts("foo@bar.gov").domain
         | 
| 72 | 
            +
              end
         | 
| 73 | 
            +
             | 
| 74 | 
            +
              should "not err out on invalid hosts" do
         | 
| 75 | 
            +
                assert_nothing_raised do
         | 
| 76 | 
            +
                  assert_equal nil, Gman.get_domain("</@foo.com")
         | 
| 77 | 
            +
                end
         | 
| 78 | 
            +
              end
         | 
| 79 | 
            +
             | 
| 80 | 
            +
              should "returns the path to domains.txt" do
         | 
| 81 | 
            +
                assert_equal true, File.exists?(Gman.list_path)
         | 
| 82 | 
            +
              end
         | 
| 83 | 
            +
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: gman
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version:  | 
| 4 | 
            +
              version: 2.0.0
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -9,7 +9,7 @@ authors: | |
| 9 9 | 
             
            autorequire: 
         | 
| 10 10 | 
             
            bindir: bin
         | 
| 11 11 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date:  | 
| 12 | 
            +
            date: 2014-01-31 00:00:00.000000000 Z
         | 
| 13 13 | 
             
            dependencies:
         | 
| 14 14 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 15 15 | 
             
              name: public_suffix
         | 
| @@ -43,22 +43,6 @@ dependencies: | |
| 43 43 | 
             
                - - ! '>='
         | 
| 44 44 | 
             
                  - !ruby/object:Gem::Version
         | 
| 45 45 | 
             
                    version: '0'
         | 
| 46 | 
            -
            - !ruby/object:Gem::Dependency
         | 
| 47 | 
            -
              name: email_veracity
         | 
| 48 | 
            -
              requirement: !ruby/object:Gem::Requirement
         | 
| 49 | 
            -
                none: false
         | 
| 50 | 
            -
                requirements:
         | 
| 51 | 
            -
                - - ! '>='
         | 
| 52 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 53 | 
            -
                    version: '0'
         | 
| 54 | 
            -
              type: :runtime
         | 
| 55 | 
            -
              prerelease: false
         | 
| 56 | 
            -
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 57 | 
            -
                none: false
         | 
| 58 | 
            -
                requirements:
         | 
| 59 | 
            -
                - - ! '>='
         | 
| 60 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 61 | 
            -
                    version: '0'
         | 
| 62 46 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 63 47 | 
             
              name: addressable
         | 
| 64 48 | 
             
              requirement: !ruby/object:Gem::Requirement
         |