RubyGems - f1sherman-domainatrix - Versions diffs - 0.0.10 - Mend

f1sherman-domainatrix 0.0.10

Files changed (11) hide show

data/README.textile +64 -0
data/lib/domainatrix/domain_parser.rb +77 -0
data/lib/domainatrix/url.rb +33 -0
data/lib/domainatrix.rb +14 -0
data/lib/effective_tld_names.dat +5189 -0
data/spec/domainatrix/domain_parser_spec.rb +71 -0
data/spec/domainatrix/url_spec.rb +54 -0
data/spec/domainatrix_spec.rb +16 -0
data/spec/spec.opts +2 -0
data/spec/spec_helper.rb +10 -0
metadata +87 -0

data/README.textile ADDED Viewed

@@ -0,0 +1,64 @@
+h1. Domainatrix
+"http://github.com/pauldix/domainatrix":http://github.com/pauldix/domainatrix
+h2. Summary
+A cruel mistress that uses the public suffix domain list to dominate URLs by canonicalizing, finding public suffixes, and breaking them into their domain parts.
+h2. Description
+This simple library can parse a URL into its canonical form. It uses the list of domains from "http://publicsuffix.org":http://publicsuffix.org to break the domain into its public suffix, domain, and subdomain.
+h2. Installation
+<pre>
+  gem install domainatrix --source http://gemcutter.org
+</pre>
+h2. Use
+<pre>
+require 'rubygems'
+require 'domainatrix'
+url = Domainatrix.parse("http://www.pauldix.net")
+url.url       # => "http://www.pauldix.net" (the original url)
+url.public_suffix       # => "net"
+url.domain    # => "pauldix"
+url.subdomain # => "www"
+url.canonical # => "net.pauldix.www"
+url = Domainatrix.parse("http://foo.bar.pauldix.co.uk/asdf.html?q=arg")
+url.public_suffix       # => "co.uk"
+url.domain    # => "pauldix"
+url.subdomain # => "foo.bar"
+url.path      # => "/asdf.html?q=arg"
+url.canonical # => "uk.co.pauldix.bar.foo/asdf.html?q=arg"
+</pre>
+h2. LICENSE
+(The MIT License)
+Copyright (c) 2009:
+"Paul Dix":http://pauldix.net
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/lib/domainatrix/domain_parser.rb ADDED Viewed

@@ -0,0 +1,77 @@
+module Domainatrix
+  class DomainParser
+    include Addressable
+    attr_reader :public_suffixes
+    def initialize(file_name)
+      @public_suffixes = {}
+      read_dat_file(file_name)
+    end
+    def read_dat_file(file_name)
+      # If we're in 1.9, make sure we're opening it in UTF-8
+      if RUBY_VERSION >= '1.9'
+        dat_file = File.open(file_name, "r:UTF-8")
+      else
+        dat_file = File.open(file_name)
+      end
+      dat_file.each_line do |line|
+        line = line.strip
+        unless (line =~ /\/\//) || line.empty?
+          parts = line.split(".").reverse
+          sub_hash = @public_suffixes
+          parts.each do |part|
+            sub_hash = (sub_hash[part] ||= {})
+          end
+        end
+      end
+    end
+    def parse(url)
+      uri = URI.parse(url)
+      if uri.query
+        path = "#{uri.path}?#{uri.query}"
+      else
+        path = uri.path
+      end
+      parse_domains_from_host(uri.host).merge({
+        :scheme => uri.scheme,
+        :host   => uri.host,
+        :path   => path,
+        :url    => url
+      })
+    end
+    def parse_domains_from_host(host)
+      parts = host.split(".").reverse
+      public_suffix = []
+      domain = ""
+      subdomains = []
+      sub_hash = @public_suffixes
+      parts.each_index do |i|
+        part = parts[i]
+        sub_parts = sub_hash[part]
+        sub_hash = sub_parts
+        if sub_parts.has_key? "*"
+          public_suffix << part
+          public_suffix << parts[i+1]
+          domain = parts[i+2]
+          subdomains = parts.slice(i+3, parts.size)
+          break
+        elsif sub_parts.empty? || !sub_parts.has_key?(parts[i+1])
+          public_suffix << part
+          domain = parts[i+1]
+          subdomains = parts.slice(i+2, parts.size)
+          break
+        else
+          public_suffix << part
+        end
+      end
+      {:public_suffix => public_suffix.reverse.join("."), :domain => domain, :subdomain => subdomains.reverse.join(".")}
+    end
+  end
+end

data/lib/domainatrix/url.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module Domainatrix
+  class Url
+    attr_reader :public_suffix, :domain, :subdomain, :path, :url, :scheme, :host
+    def initialize(attrs = {})
+      @scheme = attrs[:scheme]
+      @host = attrs[:host]
+      @url = attrs[:url]
+      @public_suffix = attrs[:public_suffix]
+      @domain = attrs[:domain]
+      @subdomain = attrs[:subdomain]
+      @path = attrs[:path]
+    end
+    def canonical(options = {})
+      public_suffix_parts = @public_suffix.split(".")
+      url = "#{public_suffix_parts.reverse.join(".")}.#{@domain}"
+      if @subdomain && !@subdomain.empty?
+        subdomain_parts = @subdomain.split(".")
+        url << ".#{subdomain_parts.reverse.join(".")}"
+      end
+      url << @path if @path
+      url
+    end
+    def domain_with_public_suffix
+      "#{@domain}.#{@public_suffix}"
+    end
+    alias domain_with_tld domain_with_public_suffix
+  end
+end

data/lib/domainatrix.rb ADDED Viewed

@@ -0,0 +1,14 @@
+$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
+require 'addressable/uri'
+require 'domainatrix/domain_parser.rb'
+require 'domainatrix/url.rb'
+module Domainatrix
+  VERSION = "0.0.9"
+  DOMAIN_PARSER = DomainParser.new("#{File.dirname(__FILE__)}/effective_tld_names.dat")
+  def self.parse(url)
+    Url.new(DOMAIN_PARSER.parse(url))
+  end
+end