RubyGems - shadowbq-domainatrix - Versions diffs - 0.0.11 - Mend

shadowbq-domainatrix 0.0.11

Files changed (12) hide show

data/CHANGELOG.md +4 -0
data/README.textile +88 -0
data/lib/domainatrix/domain_parser.rb +153 -0
data/lib/domainatrix/url.rb +51 -0
data/lib/domainatrix.rb +48 -0
data/lib/effective_tld_names.dat +6868 -0
data/spec/domainatrix/domain_parser_spec.rb +157 -0
data/spec/domainatrix/url_spec.rb +64 -0
data/spec/domainatrix_spec.rb +106 -0
data/spec/spec.opts +3 -0
data/spec/spec_helper.rb +10 -0
metadata +95 -0

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,4 @@
+## Domainatrix 0.0.11 (September 22, 2012) ##
+* Update domain list
+* Add changelog

data/README.textile ADDED Viewed

@@ -0,0 +1,88 @@
+h1. Domainatrix
+"http://github.com/pauldix/domainatrix":http://github.com/pauldix/domainatrix
+h2. Summary
+A cruel mistress that uses the public suffix domain list to dominate URLs by canonicalizing, finding public suffixes, and breaking them into their domain parts.
+h2. Description
+This simple library can parse a URL into its canonical form. It uses the list of domains from "http://publicsuffix.org":http://publicsuffix.org to break the domain into its public suffix, domain, and subdomain.
+h2. Installation
+Install Default domainatrix
+<pre>
+  gem install shadowbq-domainatrix
+</pre>
+Using the custom GitHub version in a Gemfile:
+Installing a gem directly from a git repository is a feature of Bundler, not a feature of RubyGems. Gems installed this way will not show up when you run gem list.
+<pre>
+  gem 'domainatrix', :git => 'git://github.com/shadowbq/domainatrix.git'
+</pre>
+h2. Use
+<pre>
+require 'rubygems'
+require 'domainatrix'
+url = Domainatrix.parse("http://www.pauldix.net")
+url.url       # => "http://www.pauldix.net/" (the normalized url)
+url.host      # => "www.pauldix.net"
+url.public_suffix       # => "net"
+url.domain    # => "pauldix"
+url.canonical # => "net.pauldix"
+url = Domainatrix.parse("http://foo.bar.pauldix.co.uk/asdf.html?q=arg")
+url.public_suffix       # => "co.uk"
+url.domain    # => "pauldix"
+url.subdomain # => "foo.bar"
+url.path      # => "/asdf.html?q=arg"
+url.canonical # => "uk.co.pauldix.bar.foo/asdf.html?q=arg"
+url.scheme    # => "http"
+urls = Domainatrix.scan("wikipedia (http://en.wikipedia.org/wiki/Popular_culture): lol") do |match|
+         match.url # Given a block, works like 'map'
+       end
+urls # => ["http://en.wikipedia.org/wiki/Popular_culture"]
+</pre>
+h2. ALTERNATIVES
+The publicsuffix-ruby gem is a well-supported alternative.
+"https://github.com/weppos/publicsuffix-ruby":https://github.com/weppos/publicsuffix-ruby
+h2. LICENSE
+(The MIT License)
+Copyright (c) 2009:
+"Paul Dix":http://pauldix.net
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/lib/domainatrix/domain_parser.rb ADDED Viewed

@@ -0,0 +1,153 @@
module Domainatrix
  # Root of the library's exception hierarchy.
  class Error < RuntimeError; end

  # Raised when a URL or host name cannot be broken into domain parts.
  class ParseError < Error; end

  # Splits URLs into scheme, host, path and public suffix / domain / subdomain.
  #
  # The suffix rules come from a text file with one dot-separated rule per line
  # and "//" comment lines. They are stored in #public_suffixes as nested hashes
  # keyed by label, most significant label first: the rule "co.uk" becomes
  # { "uk" => { "co" => {} } }.
  class DomainParser
    include Addressable

    attr_reader :public_suffixes

    # Only http and https URLs are accepted. Anchored with \A / \z (not ^ / $)
    # so that a value containing a line break can never satisfy the check.
    VALID_SCHEMA = /\Ahttps?\z/

    # @param file_name [String] path of the suffix rule file to load
    def initialize(file_name)
      @public_suffixes = {}
      read_dat_file(file_name)
    end

    # Loads every rule of +file_name+ into #public_suffixes.
    #
    # Blank lines and lines starting with "//" are skipped. The file is opened
    # in block form so the handle is closed even if reading raises.
    #
    # @param file_name [String] path of the suffix rule file
    def read_dat_file(file_name)
      # Ruby 1.9+ is told to read UTF-8; 1.8 has no encoding-aware open mode.
      mode = RUBY_VERSION >= '1.9' ? 'r:UTF-8' : 'r'
      File.open(file_name, mode) do |dat_file|
        dat_file.each_line do |line|
          rule = line.strip
          next if rule.empty? || rule =~ /\A\/\//

          # Walk (and create on demand) one hash level per label, TLD first.
          rule.split('.').reverse.inject(@public_suffixes) do |level, label|
            level[label] ||= {}
          end
        end
      end
    end

    # Parses +url+ into a hash of its parts.
    #
    # A URL without "://" is treated as http. The URL is downcased before
    # parsing and the returned :url is the normalized form.
    #
    # @param url [String, nil]
    # @return [Hash] empty for nil/blank input, otherwise the keys :scheme,
    #   :host, :path, :url, :public_suffix, :domain, :subdomain, :ip_address
    # @raise [ParseError] if the URL cannot be parsed, is not http(s), has no
    #   host, or its host matches no suffix rule
    def parse(url)
      return {} unless url && url.strip != ''

      url = "http://#{url}" unless url[/:\/\//]
      url = url.downcase
      uri = begin
        Addressable::URI.parse(url)
      rescue Addressable::URI::InvalidURIError
        nil
      end
      raise ParseError, "URL is not parsable by Addressable::URI" unless uri

      url = uri.normalize.to_s
      raise ParseError, "URL does not have valid scheme" unless uri.scheme =~ VALID_SCHEMA
      raise ParseError, "URL does not have a valid host" if uri.host.nil?

      # Work on a copy: appending to uri.path itself would modify the URI object.
      path = uri.path.to_s.dup
      path << "?#{uri.query}" if uri.query
      path << "##{uri.fragment}" if uri.fragment

      uri_hash =
        if uri.host == 'localhost'
          # localhost has no public suffix; same keys as every other result.
          { :public_suffix => '', :domain => 'localhost', :subdomain => '', :ip_address => false }
        else
          parse_domains_from_host(uri.host)
        end

      uri_hash.merge({
        :scheme => uri.scheme,
        :host   => uri.host,
        :path   => path,
        :url    => url
      })
    end

    # Cuts the host labels into subdomain, domain and public suffix.
    #
    # @param parts [Array<String>] host labels in reversed order,
    #   e.g. ["com", "pauldix", "www"]
    # @param tld_size [Integer] number of labels belonging to the public suffix
    # @return [Array<String>] [subdomain, domain, tld]
    # @raise [ParseError] if the suffix would consume every label
    def split_domain(parts, tld_size)
      # A bare wildcard host: no suffix, no subdomain.
      return ['', '*', ''] if parts.size == 1 && tld_size == 0

      domain_parts = parts.reverse # back to natural order, eg www.pauldix.com
      if domain_parts.size - tld_size <= 0
        raise ParseError, "Invalid TLD size found for #{domain_parts.join('.')}: #{tld_size}"
      end

      tld = domain_parts.slice!(-tld_size, tld_size).join('.')
      domain = domain_parts.pop
      subdomain = domain_parts.join('.')
      [subdomain, domain, tld]
    end

    # Determines public suffix, domain and subdomain of +host+.
    #
    # A host made only of 1-3 digit labels is reported as an IP address: it is
    # returned whole as :domain with empty suffix and subdomain.
    #
    # @param host [String, nil]
    # @return [Hash] empty for nil, otherwise :public_suffix, :domain,
    #   :subdomain and :ip_address
    # @raise [ParseError] if the host has fewer than two labels, an unknown
    #   top-level label, or consists of a public suffix only
    def parse_domains_from_host(host)
      return {} unless host

      parts = host.split(".").reverse
      ip_address = false
      if host == '*'
        tld_size = 0
      elsif !parts.empty? && parts.all? { |part| part =~ /\A\d{1,3}\z/ }
        # The emptiness guard matters: with no labels at all, "every label is
        # numeric" is vacuously true and an empty host would pass as an IP.
        ip_address = true
      else
        raise ParseError, "Invalid URL" if parts.size < 2

        tld_size = public_suffix_size(parts)
      end

      if ip_address
        subdomain, domain, tld = '', host, ''
      else
        subdomain, domain, tld = split_domain(parts, tld_size)
      end
      { :public_suffix => tld, :domain => domain, :subdomain => subdomain, :ip_address => ip_address }
    end

    private

    # Counts how many leading entries of +parts+ (reversed host labels) form
    # the public suffix according to the loaded rules.
    def public_suffix_size(parts)
      main_tld = parts.first
      return 1 if main_tld == '*'

      # Punycode and new generic TLDs rule out a character check on the TLD;
      # the rule file is the only authority.
      suffixes = @public_suffixes[main_tld]
      raise ParseError, "Invalid main TLD: #{main_tld}" unless suffixes

      size = 1
      parts.each_index do |i|
        break if suffixes.empty?                 # no deeper rules (eg domain.net)

        label = parts[i + 1]                     # nil once the labels run out
        break if suffixes.key?("!#{label}")      # exception rule (eg metro.tokyo.jp)

        if suffixes.key?(label)
          # explicit deeper suffix level (eg co.uk)
          size += 1
          suffixes = suffixes[label]
        else
          # a wildcard rule (eg *.jp) claims exactly one more label
          size += 1 if suffixes.key?('*')
          break
        end
      end
      size
    end
  end
end

data/lib/domainatrix/url.rb ADDED Viewed

@@ -0,0 +1,51 @@
module Domainatrix
  # Value object holding the parts of a parsed URL.
  class Url
    attr_accessor :public_suffix, :domain, :subdomain, :path, :url, :scheme, :host, :ip_address

    # @param attrs [Hash] parts keyed by :scheme, :host, :url, :public_suffix,
    #   :domain, :subdomain, :path and :ip_address; every missing string part
    #   defaults to '' while :ip_address is stored as given
    def initialize(attrs = {})
      @scheme = attrs[:scheme] || ''
      @host = attrs[:host] || ''
      @url = attrs[:url] || ''
      @public_suffix = attrs[:public_suffix] || ''
      @domain = attrs[:domain] || ''
      @subdomain = attrs[:subdomain] || ''
      @path = attrs[:path] || ''
      @ip_address = attrs[:ip_address]
    end

    # Builds the canonical form: host labels in reversed order (public suffix,
    # domain, then subdomain labels) followed by the path, e.g.
    # "uk.co.pauldix.bar.foo/asdf.html?q=arg".
    #
    # Empty or nil parts are left out, so a URL without a public suffix yields
    # "localhost" rather than ".localhost".
    #
    # @param options [Hash] currently unused; kept so existing callers that
    #   pass it keep working
    # @return [String]
    def canonical(options = {})
      labels = @public_suffix.to_s.split(".").reverse
      labels << @domain unless @domain.nil? || @domain.empty?
      if @subdomain && !@subdomain.empty?
        labels.concat(@subdomain.split(".").reverse)
      end
      "#{labels.join('.')}#{@path}"
    end

    # @return [String] domain and public suffix joined by a dot, skipping
    #   whichever of the two is nil or empty
    def domain_with_public_suffix
      [@domain, @public_suffix].reject { |part| part.nil? || part == '' }.join('.')
    end
    alias domain_with_tld domain_with_public_suffix

    # Rebuilds the URL from scheme, subdomain, domain, public suffix and path,
    # leaving out every part that is nil or empty.
    #
    # @return [String]
    def to_s
      prefix = (@scheme.nil? || @scheme.empty?) ? '' : "#{@scheme}://"
      host_labels = [@subdomain, @domain, @public_suffix].reject do |part|
        part.nil? || part.empty?
      end
      "#{prefix}#{host_labels.join('.')}#{@path}"
    end
  end
end

data/lib/domainatrix.rb ADDED Viewed

@@ -0,0 +1,48 @@
+$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
+require 'addressable/uri'
+require 'domainatrix/domain_parser'
+require 'domainatrix/url'
+require 'uri'
+begin
+  require 'uri'
+rescue LoadError
+end
module Domainatrix
  VERSION = "0.0.11"

  # Shared parser, loaded once from the rule file that sits next to this file.
  DOMAIN_PARSER = DomainParser.new("#{File.dirname(__FILE__)}/effective_tld_names.dat")

  # Parses +url+ with the shared DOMAIN_PARSER.
  #
  # @param url [String, nil]
  # @return [Domainatrix::Url]
  def self.parse(url)
    Url.new(DOMAIN_PARSER.parse(url))
  end

  # Extracts every http/https URL from +text+ and parses it.
  #
  # Trailing punctuation picked up from the surrounding prose is stripped
  # first. Candidates that cannot be parsed are skipped instead of aborting
  # the whole scan.
  #
  # @param text [String, nil]
  # @yield [url] optional; when a block is given each parsed Url is replaced
  #   by the block's result (like +map+)
  # @return [Array] parsed Urls, or the block results; [] when text is nil
  def self.scan(text, &block)
    return [] unless text

    @schemes ||= %w(http https)
    all_trailing_clutter = /[.,:);]+$/
    # Same as above minus ")", so closing parens survive the clean-up.
    clutter_without_parens = /[.,:;]+$/

    candidate_urls = ::URI.extract(text, @schemes).map do |url|
      # If the URL has an open paren, allow closing parens.
      clutter = url.include?("(") ? clutter_without_parens : all_trailing_clutter
      url.gsub(clutter, '')
    end

    urls = candidate_urls.map do |url|
      begin
        parse(url)
      rescue Addressable::URI::InvalidURIError, ParseError
        # DomainParser reports unusable URLs as ParseError; drop the candidate.
        nil
      end
    end.compact

    urls.map!(&block) if block
    urls
  end
end