gman 5.0.9 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -1
- data/Gemfile +1 -0
- data/README.md +16 -22
- data/Rakefile +3 -3
- data/bin/gman +10 -11
- data/bin/gman_filter +7 -7
- data/config/domains.txt +19 -19
- data/config/vendor/dotgovs.csv +398 -355
- data/gman.gemspec +34 -27
- data/lib/gman.rb +29 -23
- data/lib/gman/country_codes.rb +14 -15
- data/lib/gman/domain_list.rb +34 -25
- data/lib/gman/identifier.rb +39 -43
- data/lib/gman/importer.rb +111 -61
- data/lib/gman/locality.rb +22 -10
- data/lib/gman/version.rb +1 -1
- data/script/add +2 -2
- data/script/alphabetize +2 -2
- data/script/cibuild +2 -0
- data/script/dedupe +2 -2
- data/script/profile +5 -2
- data/script/prune +7 -7
- data/script/reconcile-us +26 -21
- data/script/vendor-federal-de +5 -5
- data/script/vendor-municipal-de +5 -5
- data/script/vendor-nl +12 -4
- data/script/vendor-public-suffix +8 -8
- data/script/vendor-se +8 -6
- data/script/vendor-us +7 -7
- data/test/fixtures/domains.txt +2 -0
- data/test/{obama.txt → fixtures/obama.txt} +0 -0
- data/test/helper.rb +19 -5
- data/test/test_gman.rb +43 -38
- data/test/test_gman_bin.rb +37 -43
- data/test/test_gman_country_codes.rb +10 -6
- data/test/test_gman_domains.rb +15 -10
- data/test/test_gman_filter.rb +5 -7
- data/test/test_gman_identifier.rb +36 -35
- data/test/test_gman_importer.rb +250 -0
- data/test/test_gman_locality.rb +5 -5
- metadata +28 -10
- data/lib/gman/sanctions.rb +0 -29
- data/test/test_gman_sanctions.rb +0 -20
    
        data/gman.gemspec
    CHANGED
    
    | @@ -1,36 +1,43 @@ | |
| 1 | 
            -
            require File.expand_path  | 
| 1 | 
            +
            require File.expand_path './lib/gman/version', File.dirname(__FILE__)
         | 
| 2 2 |  | 
| 3 3 | 
             
            Gem::Specification.new do |s|
         | 
| 4 | 
            -
              s.name =  | 
| 5 | 
            -
              s.summary =  | 
| 6 | 
            -
             | 
| 4 | 
            +
              s.name = 'gman'
         | 
| 5 | 
            +
              s.summary = <<-EOF
         | 
| 6 | 
            +
                Check if a given domain or email address belongs to a government entity
         | 
| 7 | 
            +
              EOF
         | 
| 8 | 
            +
              s.description = <<-EOF
         | 
| 9 | 
            +
                A ruby gem to check if the owner of a given email address is working for
         | 
| 10 | 
            +
                THE MAN.
         | 
| 11 | 
            +
              EOF
         | 
| 7 12 | 
             
              s.version = Gman::VERSION
         | 
| 8 | 
            -
              s.authors = [ | 
| 9 | 
            -
              s.email =  | 
| 10 | 
            -
              s.homepage =  | 
| 11 | 
            -
              s.licenses = [ | 
| 13 | 
            +
              s.authors = ['Ben Balter']
         | 
| 14 | 
            +
              s.email = 'ben.balter@github.com'
         | 
| 15 | 
            +
              s.homepage = 'https://github.com/benbalter/gman'
         | 
| 16 | 
            +
              s.licenses = ['MIT']
         | 
| 12 17 |  | 
| 13 | 
            -
              s.files | 
| 14 | 
            -
              s.test_files | 
| 15 | 
            -
              s.executables | 
| 16 | 
            -
             | 
| 18 | 
            +
              s.files         = `git ls-files`.split("\n")
         | 
| 19 | 
            +
              s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
         | 
| 20 | 
            +
              s.executables   = `git ls-files -- bin/*`.split("\n").map do |f|
         | 
| 21 | 
            +
                File.basename(f)
         | 
| 22 | 
            +
              end
         | 
| 23 | 
            +
              s.require_paths = ['lib']
         | 
| 17 24 |  | 
| 18 | 
            -
              s.require_paths = [ | 
| 25 | 
            +
              s.require_paths = ['lib']
         | 
| 19 26 | 
             
              s.required_ruby_version = '~> 2.0'
         | 
| 20 27 |  | 
| 21 | 
            -
              s.add_dependency( | 
| 22 | 
            -
              s.add_dependency( | 
| 23 | 
            -
              s.add_dependency( | 
| 24 | 
            -
              s.add_dependency( | 
| 25 | 
            -
             | 
| 26 | 
            -
              s.add_development_dependency( "rake", "~> 10.4" )
         | 
| 27 | 
            -
              s.add_development_dependency( "shoulda", "~> 3.5" )
         | 
| 28 | 
            -
              s.add_development_dependency( "rdoc", "~> 4.2" )
         | 
| 29 | 
            -
              s.add_development_dependency( "bundler", "~> 1.10" )
         | 
| 30 | 
            -
              s.add_development_dependency( "pry", "~> 0.10" )
         | 
| 31 | 
            -
              s.add_development_dependency( "parallel", "~> 1.6" )
         | 
| 32 | 
            -
              s.add_development_dependency( "mechanize", "~> 2.7" )
         | 
| 33 | 
            -
              s.add_development_dependency( "addressable", "~> 2.3" )
         | 
| 34 | 
            -
              s.add_development_dependency( "ruby-prof", "~> 0.15" )
         | 
| 28 | 
            +
              s.add_dependency('swot', '~> 1.0')
         | 
| 29 | 
            +
              s.add_dependency('iso_country_codes', '~> 0.6')
         | 
| 30 | 
            +
              s.add_dependency('naughty_or_nice', '~> 2.0')
         | 
| 31 | 
            +
              s.add_dependency('colored', '~> 1.2')
         | 
| 35 32 |  | 
| 33 | 
            +
              s.add_development_dependency('rake', '~> 10.4')
         | 
| 34 | 
            +
              s.add_development_dependency('shoulda', '~> 3.5')
         | 
| 35 | 
            +
              s.add_development_dependency('rdoc', '~> 4.2')
         | 
| 36 | 
            +
              s.add_development_dependency('bundler', '~> 1.10')
         | 
| 37 | 
            +
              s.add_development_dependency('pry', '~> 0.10')
         | 
| 38 | 
            +
              s.add_development_dependency('parallel', '~> 1.6')
         | 
| 39 | 
            +
              s.add_development_dependency('mechanize', '~> 2.7')
         | 
| 40 | 
            +
              s.add_development_dependency('addressable', '~> 2.3')
         | 
| 41 | 
            +
              s.add_development_dependency('ruby-prof', '~> 0.15')
         | 
| 42 | 
            +
              s.add_development_dependency('rubocop', '~> 0.37')
         | 
| 36 43 | 
             
            end
         | 
    
        data/lib/gman.rb
    CHANGED
    
    | @@ -6,30 +6,33 @@ require_relative 'gman/version' | |
| 6 6 | 
             
            require_relative 'gman/country_codes'
         | 
| 7 7 | 
             
            require_relative 'gman/locality'
         | 
| 8 8 | 
             
            require_relative 'gman/identifier'
         | 
| 9 | 
            -
            require_relative 'gman/sanctions'
         | 
| 10 9 |  | 
| 11 10 | 
             
            class Gman
         | 
| 12 | 
            -
             | 
| 13 11 | 
             
              include NaughtyOrNice
         | 
| 14 12 |  | 
| 15 13 | 
             
              class << self
         | 
| 16 14 | 
             
                # returns an instance of our custom public suffix list
         | 
| 17 | 
            -
                # list behaves like PublicSuffix::List | 
| 15 | 
            +
                # list behaves like PublicSuffix::List
         | 
| 16 | 
            +
                # but is limited to our whitelisted domains
         | 
| 18 17 | 
             
                def list
         | 
| 19 | 
            -
                   | 
| 18 | 
            +
                  @list ||= PublicSuffix::List.parse(list_contents)
         | 
| 20 19 | 
             
                end
         | 
| 21 20 |  | 
| 22 21 | 
             
                def config_path
         | 
| 23 | 
            -
                  File. | 
| 22 | 
            +
                  File.expand_path '../config', File.dirname(__FILE__)
         | 
| 24 23 | 
             
                end
         | 
| 25 24 |  | 
| 26 25 | 
             
                # Returns the absolute path to the domain list
         | 
| 27 26 | 
             
                def list_path
         | 
| 28 | 
            -
                   | 
| 27 | 
            +
                  if ENV['GMAN_STUB_DOMAINS']
         | 
| 28 | 
            +
                    File.expand_path '../test/fixtures/domains.txt', File.dirname(__FILE__)
         | 
| 29 | 
            +
                  else
         | 
| 30 | 
            +
                    File.expand_path 'domains.txt', config_path
         | 
| 31 | 
            +
                  end
         | 
| 29 32 | 
             
                end
         | 
| 30 33 |  | 
| 31 34 | 
             
                def list_contents
         | 
| 32 | 
            -
                   | 
| 35 | 
            +
                  @list_contents ||= File.new(list_path, 'r:utf-8').read
         | 
| 33 36 | 
             
                end
         | 
| 34 37 | 
             
              end
         | 
| 35 38 |  | 
| @@ -37,25 +40,28 @@ class Gman | |
| 37 40 | 
             
              #
         | 
| 38 41 | 
             
              # Returns boolean true if a government domain
         | 
| 39 42 | 
             
              def valid?
         | 
| 40 | 
            -
                 | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
                   | 
| 44 | 
            -
             | 
| 43 | 
            +
                @valid ||= begin
         | 
| 44 | 
            +
                  return false unless valid_domain?
         | 
| 45 | 
            +
                  return false if academic?
         | 
| 46 | 
            +
                  locality? || public_suffix_valid?
         | 
| 47 | 
            +
                end
         | 
| 48 | 
            +
              end
         | 
| 45 49 |  | 
| 46 | 
            -
             | 
| 47 | 
            -
                  return false if Swot::is_academic?(domain)
         | 
| 50 | 
            +
              private
         | 
| 48 51 |  | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 52 | 
            +
              def valid_domain?
         | 
| 53 | 
            +
                domain && domain.valid? && !academic?
         | 
| 54 | 
            +
              end
         | 
| 51 55 |  | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 56 | 
            +
              def academic?
         | 
| 57 | 
            +
                domain && Swot.is_academic?(domain)
         | 
| 58 | 
            +
              end
         | 
| 54 59 |  | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
                 | 
| 60 | 
            +
              # domain is on the domain list and
         | 
| 61 | 
            +
              # domain is not explicitly blacklisted and
         | 
| 62 | 
            +
              # domain matches a standard public suffix list rule
         | 
| 63 | 
            +
              def public_suffix_valid?
         | 
| 64 | 
            +
                rule = Gman.list.find(to_s)
         | 
| 65 | 
            +
                !rule.nil? && rule.type != :exception && rule.allow?(".#{domain}")
         | 
| 60 66 | 
             
              end
         | 
| 61 67 | 
             
            end
         | 
    
        data/lib/gman/country_codes.rb
    CHANGED
    
    | @@ -1,21 +1,20 @@ | |
| 1 1 | 
             
            class Gman
         | 
| 2 | 
            -
             | 
| 3 2 | 
             
              # Map last part of TLD to alpha2 country code
         | 
| 4 3 | 
             
              ALPHA2_MAP = {
         | 
| 5 | 
            -
                : | 
| 6 | 
            -
                : | 
| 7 | 
            -
                : | 
| 8 | 
            -
                : | 
| 9 | 
            -
                : | 
| 10 | 
            -
                : | 
| 11 | 
            -
                : | 
| 12 | 
            -
                : | 
| 13 | 
            -
                : | 
| 14 | 
            -
                : | 
| 15 | 
            -
                : | 
| 16 | 
            -
                : | 
| 17 | 
            -
                : | 
| 18 | 
            -
              }
         | 
| 4 | 
            +
                ac:     'sh',
         | 
| 5 | 
            +
                uk:     'gb',
         | 
| 6 | 
            +
                su:     'ru',
         | 
| 7 | 
            +
                tp:     'tl',
         | 
| 8 | 
            +
                yu:     'rs',
         | 
| 9 | 
            +
                gov:    'us',
         | 
| 10 | 
            +
                mil:    'us',
         | 
| 11 | 
            +
                org:    'us',
         | 
| 12 | 
            +
                com:    'us',
         | 
| 13 | 
            +
                net:    'us',
         | 
| 14 | 
            +
                edu:    'us',
         | 
| 15 | 
            +
                travel: 'us',
         | 
| 16 | 
            +
                info:   'us'
         | 
| 17 | 
            +
              }.freeze
         | 
| 19 18 |  | 
| 20 19 | 
             
              # Returns the two character alpha country code represented by the domain
         | 
| 21 20 | 
             
              #
         | 
    
        data/lib/gman/domain_list.rb
    CHANGED
    
    | @@ -1,13 +1,12 @@ | |
| 1 1 | 
             
            class Gman
         | 
| 2 2 | 
             
              class DomainList
         | 
| 3 | 
            -
             | 
| 4 3 | 
             
                attr_accessor :list
         | 
| 5 | 
            -
                 | 
| 4 | 
            +
                alias to_h list
         | 
| 6 5 |  | 
| 7 | 
            -
                COMMENT_REGEX =  | 
| 6 | 
            +
                COMMENT_REGEX = %r{//[/\s]*(.*)$}i
         | 
| 8 7 |  | 
| 9 8 | 
             
                def initialize(list)
         | 
| 10 | 
            -
                  @list = list.reject { | | 
| 9 | 
            +
                  @list = list.reject { |_group, domains| domains.compact.empty? }
         | 
| 11 10 | 
             
                end
         | 
| 12 11 |  | 
| 13 12 | 
             
                def groups
         | 
| @@ -15,7 +14,7 @@ class Gman | |
| 15 14 | 
             
                end
         | 
| 16 15 |  | 
| 17 16 | 
             
                def domains
         | 
| 18 | 
            -
                  list.values.flatten.sort.uniq
         | 
| 17 | 
            +
                  list.values.flatten.compact.sort.uniq
         | 
| 19 18 | 
             
                end
         | 
| 20 19 |  | 
| 21 20 | 
             
                def count
         | 
| @@ -23,18 +22,18 @@ class Gman | |
| 23 22 | 
             
                end
         | 
| 24 23 |  | 
| 25 24 | 
             
                def alphabetize
         | 
| 26 | 
            -
                  @list = @list.sort_by { |k, | 
| 27 | 
            -
                  @list.each { | | 
| 25 | 
            +
                  @list = @list.sort_by { |k, _v| k.downcase }.to_h
         | 
| 26 | 
            +
                  @list.each { |_group, domains| domains.sort!.uniq! }
         | 
| 28 27 | 
             
                end
         | 
| 29 28 |  | 
| 30 29 | 
             
                def write
         | 
| 30 | 
            +
                  alphabetize
         | 
| 31 31 | 
             
                  File.write(Gman.list_path, to_public_suffix)
         | 
| 32 32 | 
             
                end
         | 
| 33 33 |  | 
| 34 34 | 
             
                def to_public_suffix
         | 
| 35 | 
            -
                  current_group =  | 
| 36 | 
            -
                   | 
| 37 | 
            -
                  list.sort_by { |group, domains| group.downcase }.each do |group, domains|
         | 
| 35 | 
            +
                  current_group = output = ''
         | 
| 36 | 
            +
                  list.sort_by { |group, _domains| group.downcase }.each do |group, domains|
         | 
| 38 37 | 
             
                    if group != current_group
         | 
| 39 38 | 
             
                      output << "\n\n" unless current_group.empty? # first entry
         | 
| 40 39 | 
             
                      output << "// #{group}\n"
         | 
| @@ -46,7 +45,7 @@ class Gman | |
| 46 45 | 
             
                end
         | 
| 47 46 |  | 
| 48 47 | 
             
                def self.current
         | 
| 49 | 
            -
                  current = File.open(Gman | 
| 48 | 
            +
                  current = File.open(Gman.list_path).read
         | 
| 50 49 | 
             
                  DomainList.from_public_suffix(current)
         | 
| 51 50 | 
             
                end
         | 
| 52 51 |  | 
| @@ -56,23 +55,33 @@ class Gman | |
| 56 55 | 
             
                  DomainList.new(hash)
         | 
| 57 56 | 
             
                end
         | 
| 58 57 |  | 
| 59 | 
            -
                 | 
| 58 | 
            +
                def parent_domain(domain)
         | 
| 59 | 
            +
                  domains.find { |c| domain =~ /\.#{Regexp.escape(c)}$/ }
         | 
| 60 | 
            +
                end
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                class << self
         | 
| 63 | 
            +
                  private
         | 
| 60 64 |  | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
                       | 
| 70 | 
            -
             | 
| 71 | 
            -
                       | 
| 72 | 
            -
                      domain_hash[group].push line.downcase
         | 
| 65 | 
            +
                  # Given an array of comments/domains in public suffix format
         | 
| 66 | 
            +
                  # Converts to a hash in the form of :group => [domain1, domain2...]
         | 
| 67 | 
            +
                  def array_to_hash(domains)
         | 
| 68 | 
            +
                    domain_hash = {}
         | 
| 69 | 
            +
                    group = ''
         | 
| 70 | 
            +
                    domains.each do |line|
         | 
| 71 | 
            +
                      if line =~ COMMENT_REGEX
         | 
| 72 | 
            +
                        group = COMMENT_REGEX.match(line)[1]
         | 
| 73 | 
            +
                      else
         | 
| 74 | 
            +
                        safe_push(domain_hash, group, line.downcase)
         | 
| 75 | 
            +
                      end
         | 
| 73 76 | 
             
                    end
         | 
| 77 | 
            +
                    domain_hash
         | 
| 78 | 
            +
                  end
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                  def safe_push(hash, key, value)
         | 
| 81 | 
            +
                    return if value.empty?
         | 
| 82 | 
            +
                    hash[key] ||= []
         | 
| 83 | 
            +
                    hash[key].push value
         | 
| 74 84 | 
             
                  end
         | 
| 75 | 
            -
                  domain_hash
         | 
| 76 85 | 
             
                end
         | 
| 77 86 | 
             
              end
         | 
| 78 87 | 
             
            end
         | 
    
        data/lib/gman/identifier.rb
    CHANGED
    
    | @@ -1,21 +1,10 @@ | |
| 1 1 | 
             
            class Gman
         | 
| 2 | 
            -
             | 
| 3 2 | 
             
              def type
         | 
| 4 | 
            -
                 | 
| 5 | 
            -
                   | 
| 6 | 
            -
                 | 
| 7 | 
            -
             | 
| 8 | 
            -
                 | 
| 9 | 
            -
                  :cog
         | 
| 10 | 
            -
                elsif city?
         | 
| 11 | 
            -
                  :city
         | 
| 12 | 
            -
                elsif federal?
         | 
| 13 | 
            -
                  :federal
         | 
| 14 | 
            -
                elsif county?
         | 
| 15 | 
            -
                  :county
         | 
| 16 | 
            -
                elsif list_category.nil?
         | 
| 17 | 
            -
                  nil
         | 
| 18 | 
            -
                elsif list_category.include?("usagov")
         | 
| 3 | 
            +
                [:state, :district, :cog, :city, :federal, :county].each do |type|
         | 
| 4 | 
            +
                  return type if send "#{type}?"
         | 
| 5 | 
            +
                end
         | 
| 6 | 
            +
                return if list_category.nil?
         | 
| 7 | 
            +
                if list_category.include?('usagov')
         | 
| 19 8 | 
             
                  :unknown
         | 
| 20 9 | 
             
                else
         | 
| 21 10 | 
             
                  list_category.to_sym
         | 
| @@ -26,7 +15,7 @@ class Gman | |
| 26 15 | 
             
                if matches
         | 
| 27 16 | 
             
                  matches[4].upcase
         | 
| 28 17 | 
             
                elsif dotgov_listing
         | 
| 29 | 
            -
                  dotgov_listing[ | 
| 18 | 
            +
                  dotgov_listing['State']
         | 
| 30 19 | 
             
                elsif list_category
         | 
| 31 20 | 
             
                  matches = list_category.match(/usagov([A-Z]{2})/)
         | 
| 32 21 | 
             
                  matches[1] if matches
         | 
| @@ -34,80 +23,87 @@ class Gman | |
| 34 23 | 
             
              end
         | 
| 35 24 |  | 
| 36 25 | 
             
              def city
         | 
| 37 | 
            -
                dotgov_listing[ | 
| 26 | 
            +
                dotgov_listing['City'] if dotgov_listing
         | 
| 38 27 | 
             
              end
         | 
| 39 28 |  | 
| 40 29 | 
             
              def agency
         | 
| 41 | 
            -
                dotgov_listing[ | 
| 30 | 
            +
                dotgov_listing['Agency'] if federal?
         | 
| 42 31 | 
             
              end
         | 
| 43 32 |  | 
| 44 33 | 
             
              def dotgov?
         | 
| 45 | 
            -
                domain.tld ==  | 
| 34 | 
            +
                domain.tld == 'gov'
         | 
| 46 35 | 
             
              end
         | 
| 47 36 |  | 
| 48 37 | 
             
              def federal?
         | 
| 49 | 
            -
                dotgov_listing && dotgov_listing[ | 
| 38 | 
            +
                dotgov_listing && dotgov_listing['Domain Type'] == 'Federal Agency'
         | 
| 50 39 | 
             
              end
         | 
| 51 40 |  | 
| 52 41 | 
             
              def city?
         | 
| 53 42 | 
             
                if matches
         | 
| 54 | 
            -
                  %w | 
| 43 | 
            +
                  %w(ci town vil).include?(matches[3])
         | 
| 55 44 | 
             
                elsif dotgov_listing
         | 
| 56 | 
            -
                  dotgov_listing[ | 
| 45 | 
            +
                  dotgov_listing['Domain Type'] == 'City'
         | 
| 57 46 | 
             
                end
         | 
| 58 47 | 
             
              end
         | 
| 59 48 |  | 
| 60 49 | 
             
              def county?
         | 
| 61 50 | 
             
                if matches
         | 
| 62 | 
            -
                  matches[3] ==  | 
| 51 | 
            +
                  matches[3] == 'co'
         | 
| 63 52 | 
             
                elsif dotgov_listing
         | 
| 64 | 
            -
                  dotgov_listing[ | 
| 53 | 
            +
                  dotgov_listing['Domain Type'] == 'County'
         | 
| 65 54 | 
             
                end
         | 
| 66 55 | 
             
              end
         | 
| 67 56 |  | 
| 68 57 | 
             
              def state?
         | 
| 69 58 | 
             
                if matches
         | 
| 70 | 
            -
                  matches[1] ==  | 
| 59 | 
            +
                  matches[1] == 'state'
         | 
| 71 60 | 
             
                elsif dotgov_listing
         | 
| 72 | 
            -
                  dotgov_listing[ | 
| 61 | 
            +
                  dotgov_listing['Domain Type'] == 'State/Local Govt'
         | 
| 73 62 | 
             
                end
         | 
| 74 63 | 
             
              end
         | 
| 75 64 |  | 
| 76 65 | 
             
              def district?
         | 
| 77 | 
            -
                 | 
| 66 | 
            +
                matches && matches[1] == 'dst'
         | 
| 78 67 | 
             
              end
         | 
| 79 68 |  | 
| 80 69 | 
             
              def cog?
         | 
| 81 | 
            -
                 | 
| 70 | 
            +
                matches && matches[1] == 'cog'
         | 
| 82 71 | 
             
              end
         | 
| 83 72 |  | 
| 84 73 | 
             
              private
         | 
| 85 74 |  | 
| 86 75 | 
             
              def list_category
         | 
| 87 76 | 
             
                @list_category ||= begin
         | 
| 88 | 
            -
                   | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
                   | 
| 77 | 
            +
                  match = Gman.list.find(domain.to_s)
         | 
| 78 | 
            +
                  return unless match
         | 
| 79 | 
            +
                  regex = %r{// ([^\n]+)\n?[^/]*\n#{Regexp.escape(match.name)}\n}im
         | 
| 80 | 
            +
                  matches = Gman.list_contents.match(regex)
         | 
| 81 | 
            +
                  matches[1] if matches
         | 
| 93 82 | 
             
                end
         | 
| 94 83 | 
             
              end
         | 
| 95 84 |  | 
| 96 85 | 
             
              def matches
         | 
| 97 86 | 
             
                return @matches if defined? @matches
         | 
| 98 | 
            -
                @matches = domain.to_s.match( | 
| 87 | 
            +
                @matches = domain.to_s.match(Locality::REGEX)
         | 
| 99 88 | 
             
              end
         | 
| 100 89 |  | 
| 101 | 
            -
              def  | 
| 102 | 
            -
                 | 
| 90 | 
            +
              def dotgov_listing
         | 
| 91 | 
            +
                return @dotgov_listing if defined? @dotgov_listing
         | 
| 92 | 
            +
                return unless dotgov?
         | 
| 93 | 
            +
                @dotgov_listing = Gman.dotgov_list.find do |listing|
         | 
| 94 | 
            +
                  listing['Domain Name'].casecmp("#{domain.sld}.gov") == 0
         | 
| 95 | 
            +
                end
         | 
| 103 96 | 
             
              end
         | 
| 104 97 |  | 
| 105 | 
            -
               | 
| 106 | 
            -
                 | 
| 107 | 
            -
             | 
| 98 | 
            +
              class << self
         | 
| 99 | 
            +
                def dotgov_list
         | 
| 100 | 
            +
                  @dotgov_list ||= CSV.read(dotgov_list_path, headers: true)
         | 
| 101 | 
            +
                end
         | 
| 108 102 |  | 
| 109 | 
            -
             | 
| 110 | 
            -
             | 
| 111 | 
            -
                 | 
| 103 | 
            +
                private
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                def dotgov_list_path
         | 
| 106 | 
            +
                  File.join Gman.config_path, 'vendor/dotgovs.csv'
         | 
| 107 | 
            +
                end
         | 
| 112 108 | 
             
              end
         | 
| 113 109 | 
             
            end
         | 
    
        data/lib/gman/importer.rb
    CHANGED
    
    | @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            # Utility functions for parsing and manipulating public-suffix  | 
| 1 | 
            +
            # Utility functions for parsing and manipulating public-suffix domain lists
         | 
| 2 2 | 
             
            # Only used in development and not loaded by default
         | 
| 3 3 | 
             
            require 'yaml'
         | 
| 4 4 | 
             
            require 'open-uri'
         | 
| @@ -9,11 +9,10 @@ require_relative './domain_list' | |
| 9 9 |  | 
| 10 10 | 
             
            class Gman
         | 
| 11 11 | 
             
              class Importer
         | 
| 12 | 
            -
             | 
| 13 12 | 
             
                attr_accessor :domains
         | 
| 14 13 |  | 
| 15 14 | 
             
                # Known false positives from vendored lists
         | 
| 16 | 
            -
                BLACKLIST = %w | 
| 15 | 
            +
                BLACKLIST = %w(
         | 
| 17 16 | 
             
                  business.centurytel.net
         | 
| 18 17 | 
             
                  chesnee.net
         | 
| 19 18 | 
             
                  citlink.net
         | 
| @@ -39,7 +38,24 @@ class Gman | |
| 39 38 | 
             
                  wctc.net
         | 
| 40 39 | 
             
                  webconnections.net
         | 
| 41 40 | 
             
                  webpages.charter.net
         | 
| 42 | 
            -
                 | 
| 41 | 
            +
                ).freeze
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                REGEX_CHECKS = {
         | 
| 44 | 
            +
                  'home. regex'     => /^home\./,
         | 
| 45 | 
            +
                  'user. regex'     => /^users?\./,
         | 
| 46 | 
            +
                  'sites. regex'    => /^sites?\./,
         | 
| 47 | 
            +
                  'weebly'          => /weebly\.com$/,
         | 
| 48 | 
            +
                  'wordpress'       => /wordpress\.com$/,
         | 
| 49 | 
            +
                  'govoffice'       => /govoffice\d?\.com$/,
         | 
| 50 | 
            +
                  'homestead'       => /homestead\.com$/,
         | 
| 51 | 
            +
                  'wix.com'         => /wix\.com$/,
         | 
| 52 | 
            +
                  'blogspot.com'    => /blogspot\.com$/,
         | 
| 53 | 
            +
                  'tripod.com'      => /tripod\.com$/,
         | 
| 54 | 
            +
                  'squarespace.com' => /squarespace\.com$/,
         | 
| 55 | 
            +
                  'github.io'       => /github\.io$/,
         | 
| 56 | 
            +
                  'tumblr'          => /tumblr\.com$/,
         | 
| 57 | 
            +
                  'locality'        => Gman::Locality::REGEX
         | 
| 58 | 
            +
                }.freeze
         | 
| 43 59 |  | 
| 44 60 | 
             
                def initialize(domains)
         | 
| 45 61 | 
             
                  @domains = DomainList.new(domains)
         | 
| @@ -50,40 +66,21 @@ class Gman | |
| 50 66 | 
             
                end
         | 
| 51 67 |  | 
| 52 68 | 
             
                def normalize_domain(domain)
         | 
| 53 | 
            -
                  domain. | 
| 69 | 
            +
                  domain = Gman.new(domain).to_s
         | 
| 70 | 
            +
                  domain.to_s.downcase.strip.gsub(/^www./, '').gsub(%r{/$}, '')
         | 
| 54 71 | 
             
                end
         | 
| 55 72 |  | 
| 56 | 
            -
                def valid_domain?(domain, options={})
         | 
| 57 | 
            -
                  return false  | 
| 58 | 
            -
                  return  | 
| 59 | 
            -
                  return  | 
| 60 | 
            -
                  return reject(domain, "sites. regex")    if domain =~ /^sites?\./
         | 
| 61 | 
            -
                  return reject(domain, "weebly")          if domain =~ /weebly\.com$/
         | 
| 62 | 
            -
                  return reject(domain, "wordpress")       if domain =~ /wordpress\.com$/
         | 
| 63 | 
            -
                  return reject(domain, "govoffice")       if domain =~ /govoffice\d?\.com$/
         | 
| 64 | 
            -
                  return reject(domain, "homestead")       if domain =~ /homestead\.com$/
         | 
| 65 | 
            -
                  return reject(domain, "wix.com")         if domain =~ /wix\.com$/
         | 
| 66 | 
            -
                  return reject(domain, "blogspot.com")    if domain =~ /blogspot\.com$/
         | 
| 67 | 
            -
                  return reject(domain, "tripod.com")      if domain =~ /tripod\.com$/
         | 
| 68 | 
            -
                  return reject(domain, "squarespace.com") if domain =~ /squarespace\.com$/
         | 
| 69 | 
            -
                  return reject(domain, "github.io")       if domain =~ /github\.io$/
         | 
| 70 | 
            -
                  return reject(domain, "locality")        if domain =~ Gman::LOCALITY_REGEX
         | 
| 71 | 
            -
                  return reject(domain, "blacklist")       if BLACKLIST.include?(domain)
         | 
| 72 | 
            -
                  return reject(domain, "duplicate")       if !options[:skip_dupe] && current.domains.include?(domain)
         | 
| 73 | 
            -
                  return reject(domain, "invalid")         unless PublicSuffix.valid?(".#{domain}")
         | 
| 74 | 
            -
                  return reject(domain, "academic")        if Swot::is_academic?(domain)
         | 
| 75 | 
            -
             | 
| 76 | 
            -
                  if !options[:skip_dupe] && subdomain = current.domains.find { |c| domain =~ /\.#{Regexp.escape(c)}$/}
         | 
| 77 | 
            -
                    return reject(domain, "subdomain of #{subdomain}")
         | 
| 78 | 
            -
                  end
         | 
| 79 | 
            -
             | 
| 80 | 
            -
                  return reject(domain, "unresolvable") if !options[:skip_resolve] && !domain_resolves?(domain)
         | 
| 73 | 
            +
                def valid_domain?(domain, options = {})
         | 
| 74 | 
            +
                  return false unless ensure_valid(domain)
         | 
| 75 | 
            +
                  return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
         | 
| 76 | 
            +
                  return false if !options[:skip_resolve] && !ensure_resolves(domain)
         | 
| 81 77 | 
             
                  true
         | 
| 82 78 | 
             
                end
         | 
| 83 79 |  | 
| 84 | 
            -
                # if RECONCILING=true, return the reason, | 
| 80 | 
            +
                # If the RECONCILING environment variable is set, return the reason,
         | 
| 81 | 
            +
                # rather than a bool and silence log output
         | 
| 85 82 | 
             
                def reject(domain, reason)
         | 
| 86 | 
            -
                  return reason if ENV[ | 
| 83 | 
            +
                  return reason if ENV['RECONCILING']
         | 
| 87 84 | 
             
                  logger.info "👎 `#{domain}`: #{reason}"
         | 
| 88 85 | 
             
                  false
         | 
| 89 86 | 
             
                end
         | 
| @@ -92,59 +89,112 @@ class Gman | |
| 92 89 | 
             
                  @current ||= DomainList.current
         | 
| 93 90 | 
             
                end
         | 
| 94 91 |  | 
| 95 | 
            -
                def import(options | 
| 92 | 
            +
                def import(options)
         | 
| 96 93 | 
             
                  logger.info "Current: #{Gman::DomainList.current.count} domains"
         | 
| 97 94 | 
             
                  logger.info "Adding: #{domains.count} domains"
         | 
| 98 95 |  | 
| 99 | 
            -
                   | 
| 100 | 
            -
             | 
| 101 | 
            -
                    domains.map!    { |domain| normalize_domain(domain) }
         | 
| 102 | 
            -
                    domains.select! { |domain| valid_domain?(domain, options) }
         | 
| 103 | 
            -
                  end
         | 
| 104 | 
            -
             | 
| 105 | 
            -
                  logger.info "Filtered to: #{domains.count} domains"
         | 
| 96 | 
            +
                  normalize_domains!
         | 
| 97 | 
            +
                  ensure_validity!(options)
         | 
| 106 98 |  | 
| 107 99 | 
             
                  if domains.count == 0
         | 
| 108 | 
            -
                    logger.info  | 
| 100 | 
            +
                    logger.info 'Nothing to add. Aborting'
         | 
| 109 101 | 
             
                    exit 0
         | 
| 110 102 | 
             
                  end
         | 
| 111 103 |  | 
| 112 | 
            -
                   | 
| 113 | 
            -
             | 
| 114 | 
            -
             | 
| 115 | 
            -
             | 
| 116 | 
            -
             | 
| 104 | 
            +
                  add_to_current
         | 
| 105 | 
            +
                  logger.info "New: #{current.count} domains"
         | 
| 106 | 
            +
                end
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                def resolver
         | 
| 109 | 
            +
                  @resolver ||= Resolv::DNS.new(nameserver: ['8.8.8.8', '8.8.4.4'])
         | 
| 110 | 
            +
                end
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                # Verifies that the given domain resolves to an address or has an NS or MX record, and thus is valid
         | 
| 113 | 
            +
                def domain_resolves?(domain)
         | 
| 114 | 
            +
                  domain = Addressable::URI.new(host: domain).normalize.host
         | 
| 115 | 
            +
                  return true if ip?(domain)
         | 
| 116 | 
            +
                  returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
         | 
| 117 | 
            +
                end
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                private
         | 
| 120 | 
            +
             | 
| 121 | 
            +
                def ensure_regex(domain)
         | 
| 122 | 
            +
                  REGEX_CHECKS.each do |msg, regex|
         | 
| 123 | 
            +
                    return reject(domain, msg) if domain =~ regex
         | 
| 117 124 | 
             
                  end
         | 
| 125 | 
            +
                  true
         | 
| 126 | 
            +
                end
         | 
| 118 127 |  | 
| 119 | 
            -
             | 
| 128 | 
            +
                def ensure_valid(domain)
         | 
| 129 | 
            +
                  return false if domain.empty?
         | 
| 130 | 
            +
                  if BLACKLIST.include?(domain)
         | 
| 131 | 
            +
                    reject(domain, 'blacklist')
         | 
| 132 | 
            +
                  elsif !PublicSuffix.valid?(".#{domain}")
         | 
| 133 | 
            +
                    reject(domain, 'invalid')
         | 
| 134 | 
            +
                  elsif Swot.is_academic?(domain)
         | 
| 135 | 
            +
                    reject(domain, 'academic')
         | 
| 136 | 
            +
                  else
         | 
| 137 | 
            +
                    ensure_regex(domain)
         | 
| 138 | 
            +
                  end
         | 
| 139 | 
            +
                end
         | 
| 120 140 |  | 
| 121 | 
            -
             | 
| 122 | 
            -
                   | 
| 123 | 
            -
                   | 
| 141 | 
            +
                def ensure_resolves(domain)
         | 
| 142 | 
            +
                  return reject(domain, 'unresolvable') unless domain_resolves?(domain)
         | 
| 143 | 
            +
                  true
         | 
| 124 144 | 
             
                end
         | 
| 125 145 |  | 
| 126 | 
            -
                def  | 
| 127 | 
            -
                   | 
| 146 | 
            +
                def ensure_not_dupe(domain)
         | 
| 147 | 
            +
                  return true unless dupe?(domain)
         | 
| 148 | 
            +
                  if current.domains.include?(domain)
         | 
| 149 | 
            +
                    reject(domain, 'duplicate')
         | 
| 150 | 
            +
                  else
         | 
| 151 | 
            +
                    parent = current.parent_domain(domain)
         | 
| 152 | 
            +
                    reject(domain, "subdomain of #{parent}")
         | 
| 153 | 
            +
                  end
         | 
| 154 | 
            +
                end
         | 
| 155 | 
            +
             | 
| 156 | 
            +
                def dupe?(domain)
         | 
| 157 | 
            +
                  current.domains.include?(domain) || current.parent_domain(domain)
         | 
| 158 | 
            +
                end
         | 
| 159 | 
            +
             | 
| 160 | 
            +
                def normalize_domains!
         | 
| 161 | 
            +
                  domains.list.each do |_group, domains|
         | 
| 162 | 
            +
                    domains.map! { |domain| normalize_domain(domain) }
         | 
| 163 | 
            +
                    domains.uniq!
         | 
| 164 | 
            +
                  end
         | 
| 165 | 
            +
                end
         | 
| 166 | 
            +
             | 
| 167 | 
            +
                def ensure_validity!(options = {})
         | 
| 168 | 
            +
                  domains.list.each do |_group, domains|
         | 
| 169 | 
            +
                    domains.select! { |domain| valid_domain?(domain, options) }
         | 
| 170 | 
            +
                  end
         | 
| 171 | 
            +
                end
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                def add_to_current
         | 
| 174 | 
            +
                  domains.list.each do |group, domains|
         | 
| 175 | 
            +
                    current.list[group] ||= []
         | 
| 176 | 
            +
                    current.list[group].concat domains
         | 
| 177 | 
            +
                  end
         | 
| 178 | 
            +
                  current.write
         | 
| 128 179 | 
             
                end
         | 
| 129 180 |  | 
| 130 | 
            -
                def  | 
| 131 | 
            -
                   | 
| 181 | 
            +
                def ip?(domain)
         | 
| 182 | 
            +
                  resolver.getaddress(domain)
         | 
| 132 183 | 
             
                rescue Resolv::ResolvError
         | 
| 133 184 | 
             
                  false
         | 
| 134 185 | 
             
                end
         | 
| 135 186 |  | 
| 136 | 
            -
                 | 
| 137 | 
            -
             | 
| 138 | 
            -
                   | 
| 139 | 
            -
             | 
| 140 | 
            -
                   | 
| 141 | 
            -
                  resolve_without_errors { resolver.getresource(domain, Resolv::DNS::Resource::IN::MX) }
         | 
| 187 | 
            +
                def returns_record?(domain, type)
         | 
| 188 | 
            +
                  type = Object.const_get "Resolv::DNS::Resource::IN::#{type}"
         | 
| 189 | 
            +
                  resolver.getresource(domain, type)
         | 
| 190 | 
            +
                rescue Resolv::ResolvError
         | 
| 191 | 
            +
                  false
         | 
| 142 192 | 
             
                end
         | 
| 143 193 | 
             
              end
         | 
| 144 194 | 
             
            end
         | 
| 145 195 |  | 
| 146 196 | 
             
            class Gman
         | 
| 147 | 
            -
              def self.import(hash, options={})
         | 
| 197 | 
            +
              def self.import(hash, options = {})
         | 
| 148 198 | 
             
                Gman::Importer.new(hash).import(options)
         | 
| 149 199 | 
             
              end
         | 
| 150 200 | 
             
            end
         |