RubyGems - shadowbq-domainatrix - Versions diffs - 0.0.11 - Mend

shadowbq-domainatrix 0.0.11

Files changed (12) hide show

data/CHANGELOG.md +4 -0
data/README.textile +88 -0
data/lib/domainatrix/domain_parser.rb +153 -0
data/lib/domainatrix/url.rb +51 -0
data/lib/domainatrix.rb +48 -0
data/lib/effective_tld_names.dat +6868 -0
data/spec/domainatrix/domain_parser_spec.rb +157 -0
data/spec/domainatrix/url_spec.rb +64 -0
data/spec/domainatrix_spec.rb +106 -0
data/spec/spec.opts +3 -0
data/spec/spec_helper.rb +10 -0
metadata +95 -0

data/spec/domainatrix/domain_parser_spec.rb ADDED Viewed

@@ -0,0 +1,157 @@
+# -*- encoding : utf-8 -*-
+require File.dirname(__FILE__) + '/../spec_helper'
+describe "domain parser" do
+  before(:all) do
+    @domain_parser = Domainatrix::DomainParser.new("#{File.dirname(__FILE__)}/../../lib/effective_tld_names.dat")
+  end
+  describe "reading the dat file" do
+    it "creates a tree of the domain names" do
+      @domain_parser.public_suffixes.should be_a Hash
+    end
+    it "creates the first level of the tree" do
+      @domain_parser.public_suffixes.should have_key("com")
+    end
+    it "creates the first level of the tree even when the first doesn't appear on a line by itself" do
+      @domain_parser.public_suffixes.should have_key("uk")
+    end
+    it "creates lower levels of the tree" do
+      @domain_parser.public_suffixes["jp"].should have_key("ac")
+      @domain_parser.public_suffixes["jp"]["kawasaki"].should have_key("*")
+    end
+  end
+  describe "parsing" do
+    it "returns a hash of parts" do
+      @domain_parser.parse("http://pauldix.net").should be_a Hash
+    end
+    it "includes the original url" do
+      @domain_parser.parse("http://www.pauldix.net")[:url].should == "http://www.pauldix.net/"
+    end
+    it "includes the scheme" do
+      @domain_parser.parse("http://www.pauldix.net")[:scheme].should == "http"
+    end
+    it "includes the full host" do
+      @domain_parser.parse("http://www.pauldix.net")[:host].should == "www.pauldix.net"
+    end
+    it "parses out the path" do
+      @domain_parser.parse("http://pauldix.net/foo.html?asdf=foo#bar")[:path].should == "/foo.html?asdf=foo#bar"
+      @domain_parser.parse("http://pauldix.net/foo.html?asdf=foo")[:path].should == "/foo.html?asdf=foo"
+      @domain_parser.parse("http://pauldix.net?asdf=foo")[:path].should == "?asdf=foo"
+      @domain_parser.parse("http://pauldix.net")[:path].should == ""
+    end
+    it "parses the tld" do
+      @domain_parser.parse("http://pauldix.net")[:public_suffix].should == "net"
+      @domain_parser.parse("http://pauldix.co.uk")[:public_suffix].should == "co.uk"
+      @domain_parser.parse("http://pauldix.com.kg")[:public_suffix].should == "com.kg"
+      @domain_parser.parse("http://pauldix.com.kawasaki.jp")[:public_suffix].should == "com.kawasaki.jp"
+    end
+    it "should have the domain" do
+      @domain_parser.parse("http://pauldix.net")[:domain].should == "pauldix"
+      @domain_parser.parse("http://foo.pauldix.net")[:domain].should == "pauldix"
+      @domain_parser.parse("http://pauldix.co.uk")[:domain].should == "pauldix"
+      @domain_parser.parse("http://foo.pauldix.co.uk")[:domain].should == "pauldix"
+      @domain_parser.parse("http://pauldix.com.kg")[:domain].should == "pauldix"
+      @domain_parser.parse("http://pauldix.com.kawasaki.jp")[:domain].should == "pauldix"
+    end
+    it "should have subdomains" do
+      @domain_parser.parse("http://foo.pauldix.net")[:subdomain].should == "foo"
+      @domain_parser.parse("http://bar.foo.pauldix.co.uk")[:subdomain].should == "bar.foo"
+    end
+    it "parses a link to localhost" do
+      parsed = @domain_parser.parse("http://localhost")
+      parsed[:host].should == "localhost"
+      parsed[:url].should == "http://localhost/"
+      parsed[:domain].should == "localhost"
+      parsed[:public_suffix].should == ""
+    end
+    it "should accept wildcards" do
+      @domain_parser.parse("http://*.pauldix.net")[:subdomain].should == "*"
+      @domain_parser.parse("http://pauldix.*")[:public_suffix].should == "*"
+      @domain_parser.parse("http://pauldix.net/*")[:path].should == "/*"
+      combined = @domain_parser.parse("http://*.pauldix.*/*")
+      combined[:subdomain].should == "*"
+      combined[:domain].should == "pauldix"
+      combined[:public_suffix].should == "*"
+      combined[:path].should == "/*"
+    end
+    it "should parse a URL if it has a wildcard exception" do
+      @domain_parser.parse("http://metro.tokyo.jp")[:domain].should == "metro"
+    end
+    it "should throw an exception if the tld is not valid" do
+      lambda { @domain_parser.parse("http://pauldix.nett") }.should raise_error(Domainatrix::ParseError)
+    end
+    it "should throw an exception if the domain doesn't contain a valid host" do
+      lambda { @domain_parser.parse("http://co.jp") }.should raise_error(Domainatrix::ParseError)
+    end
+    it "should throw an exception if the domain contains an invalid character" do
+      lambda { @domain_parser.parse("http://pauldix,net") }.should raise_error(Domainatrix::ParseError)
+    end
+    it "should thrown an exception if the url is malformed" do
+      lambda { @domain_parser.parse("http:/") }.should raise_error(Domainatrix::ParseError)
+    end
+    it "parses an ip address" do
+      @domain_parser.parse("http://123.123.123.123/foo/bar")[:domain].should == "123.123.123.123"
+      @domain_parser.parse("http://123.123.123.123/foo/bar")[:path].should == "/foo/bar"
+      @domain_parser.parse("http://123.123.123.123/foo/bar")[:ip_address].should == true
+    end
+    it "parses a host with numeric domain" do
+      @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:subdomain].should == "123.123"
+      @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:domain].should == "123"
+      @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:public_suffix].should == "co.uk"
+      @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:ip_address].should == false
+    end
+    it "should not parse an invalid ip address" do
+      lambda { @domain_parser.parse("http://12345") }.should raise_error(Domainatrix::ParseError)
+    end
+    it "defaults to http if no scheme is applied" do
+      @domain_parser.parse("www.pauldix.net")[:host].should == "www.pauldix.net"
+      @domain_parser.parse("www.pauldix.net")[:scheme].should == "http"
+    end
+  end
+  describe "handling utf-8" do
+    it "handles public suffixes with utf-8" do
+      @domain_parser.parse("http://pauldix.السعوديه")[:public_suffix].should == "السعوديه"
+      @domain_parser.parse("http://pauldix.臺灣")[:public_suffix].should == "臺灣"
+      @domain_parser.parse("http://pauldix.السعوديه")[:domain].should == "pauldix"
+      @domain_parser.parse("http://pauldix.臺灣")[:domain].should == "pauldix"
+    end
+    it "handles unicode urls as puny code" do
+       input = "http://✪df.ws/fil"
+       parsed = @domain_parser.parse(input)
+       parsed[:url].should == "http://xn--df-oiy.ws/fil"
+       parsed[:host].should == "✪df.ws"
+       parsed[:path].should == "/fil"
+       parsed[:public_suffix].should == "ws"
+    end
+  end
+end

data/spec/domainatrix/url_spec.rb ADDED Viewed

@@ -0,0 +1,64 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe "url" do
+  it "has the original url" do
+    Domainatrix::Url.new(:url => "http://pauldix.net").url.should == "http://pauldix.net"
+  end
+  it "has the public_suffix" do
+    Domainatrix::Url.new(:public_suffix => "net").public_suffix.should == "net"
+  end
+  it "has the domain" do
+    Domainatrix::Url.new(:domain => "pauldix").domain.should == "pauldix"
+  end
+  it "has the subdomain" do
+    Domainatrix::Url.new(:subdomain => "foo").subdomain.should == "foo"
+  end
+  it "has the path" do
+    Domainatrix::Url.new(:path => "/asdf.html").path.should == "/asdf.html"
+  end
+  it "reports if it is an ip address" do
+    Domainatrix::Url.new(:ip_address => true).ip_address.should == true
+  end
+  it "canonicalizes the url" do
+    Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "net").canonical.should == "net.pauldix"
+    Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "net").canonical.should == "net.pauldix.foo"
+    Domainatrix::Url.new(:subdomain => "foo.bar", :domain => "pauldix", :public_suffix => "net").canonical.should == "net.pauldix.bar.foo"
+    Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "co.uk").canonical.should == "uk.co.pauldix"
+    Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "co.uk").canonical.should == "uk.co.pauldix.foo"
+    Domainatrix::Url.new(:subdomain => "foo.bar", :domain => "pauldix", :public_suffix => "co.uk").canonical.should == "uk.co.pauldix.bar.foo"
+    Domainatrix::Url.new(:subdomain => "", :domain => "pauldix", :public_suffix => "co.uk").canonical.should == "uk.co.pauldix"
+  end
+  it "canonicalizes the url with the path" do
+    Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "net", :path => "/hello").canonical.should == "net.pauldix.foo/hello"
+  end
+  it "canonicalizes the url without the path" do
+    Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "net").canonical(:include_path => false).should == "net.pauldix.foo"
+  end
+  it "combines the domain with the public_suffix" do
+    Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "net").domain_with_public_suffix.should == "pauldix.net"
+    Domainatrix::Url.new(:domain => "foo", :public_suffix => "co.uk" ).domain_with_public_suffix.should == "foo.co.uk"
+    Domainatrix::Url.new(:subdomain => "baz", :domain => "bar", :public_suffix => "com").domain_with_public_suffix.should == "bar.com"
+  end
+  it "combines the domain with the public_suffix as an alias" do
+    Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "net").domain_with_tld.should == "pauldix.net"
+    Domainatrix::Url.new(:domain => "foo", :public_suffix => "co.uk" ).domain_with_tld.should == "foo.co.uk"
+    Domainatrix::Url.new(:subdomain => "baz", :domain => "bar", :public_suffix => "com").domain_with_tld.should == "bar.com"
+  end
+  it "converts the url to a string" do
+    Domainatrix::Url.new(:scheme => "http", :subdomain => "www", :domain => "pauldix", :public_suffix => "net", :path => "/some/path").to_s.should == "http://www.pauldix.net/some/path"
+    Domainatrix::Url.new(:subdomain => "www", :domain => "pauldix", :public_suffix => "net", :path => "/some/path").to_s.should == "www.pauldix.net/some/path"
+    Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "co.uk").to_s.should == "pauldix.co.uk"
+  end
+end

data/spec/domainatrix_spec.rb ADDED Viewed

@@ -0,0 +1,106 @@
+require File.dirname(__FILE__) + '/spec_helper'
+describe Domainatrix do
+  describe ".parse" do
+    it "should convert a string into a url object" do
+      Domainatrix.parse("http://pauldix.net").should be_a Domainatrix::Url
+    end
+    it "should canonicalize" do
+      Domainatrix.parse("http://pauldix.net").canonical.should == "net.pauldix"
+      Domainatrix.parse("http://pauldix.net/foo.html").canonical.should == "net.pauldix/foo.html"
+      Domainatrix.parse("http://pauldix.net/foo.html?asdf=bar").canonical.should == "net.pauldix/foo.html?asdf=bar"
+      Domainatrix.parse("http://foo.pauldix.net").canonical.should == "net.pauldix.foo"
+      Domainatrix.parse("http://foo.bar.pauldix.net").canonical.should == "net.pauldix.bar.foo"
+      Domainatrix.parse("http://pauldix.co.uk").canonical.should == "uk.co.pauldix"
+    end
+  end
+  describe ".scan" do
+    it "parses the url found in a string" do
+      input = "HAHA. This is why Conan should stay: http://losangeles.craigslist.org/sfv/clt/1551463643.html"
+      url = Domainatrix.scan(input).first
+      url.canonical.should == "org.craigslist.losangeles/sfv/clt/1551463643.html"
+    end
+    it "handles shouting" do
+      input = "TONIGHT!!  @chelseavperetti @toddglass @dougbenson @realjeffreyross ME and Tig Notaro   http://WWW.OPCCEVENTS.ORG/"
+      url = Domainatrix.scan(input).first
+      url.should_not be_nil
+      url.url.should == "http://www.opccevents.org/"
+    end
+    it "finds multiple urls in a string" do
+      input = <<-TEXT
+      http://google.com
+      and then http://yahoo.com
+      TEXT
+      google, yahoo = Domainatrix.scan(input)
+      google.domain.should == "google"
+      yahoo.domain.should == "yahoo"
+    end
+    it "returns a map of results when given a block" do
+      input = "http://a.com https://b.com"
+      domains = Domainatrix.scan(input) do |url|
+        url.domain
+      end
+      domains.should == %w(a b)
+    end
+    it "returns an empty array when no urls are found" do
+      Domainatrix.scan("Nope").should == []
+    end
+    it "removes unlikely characters from the end of URLs" do
+      input = <<-TEXT
+      Check out http://tobtr.com/s/821921.
+      Oh, and also (http://www.google.com): Cool stuff!
+      http://fora.tv/v/c8637, is almost as good as http://example.com...
+      http://foo.com" <http://baz.com>
+      TEXT
+      urls = Domainatrix.scan(input).map {|u| u.url}
+      urls.should == %w(http://tobtr.com/s/821921 http://www.google.com/ http://fora.tv/v/c8637 http://example.com/ http://foo.com/ http://baz.com/)
+    end
+  end
+  context 'localhost with a port' do
+    subject { Domainatrix.parse('localhost:3000') }
+    its(:scheme) { should == 'http' }
+    its(:host) { should == 'localhost' }
+    its(:url) { should == 'http://localhost:3000/' }
+    its(:public_suffix) { should == '' }
+    its(:domain) { should == 'localhost' }
+    its(:subdomain) { should == '' }
+    its(:path) { should == '' }
+    its(:domain_with_tld) { should == 'localhost' }
+  end
+  context 'without a scheme' do
+    subject { Domainatrix.parse('www.pauldix.net') }
+    its(:scheme) { should == 'http' }
+    its(:host) { should == 'www.pauldix.net' }
+    its(:url) { should == 'http://www.pauldix.net/' }
+    its(:public_suffix) { should == 'net' }
+    its(:domain) { should == 'pauldix' }
+    its(:subdomain) { should == 'www' }
+    its(:path) { should == '' }
+    its(:domain_with_tld) { should == 'pauldix.net' }
+  end
+  context 'with a blank url' do
+    subject { Domainatrix.parse(nil) }
+    its(:scheme) { should == '' }
+    its(:host) { should == '' }
+    its(:url) { should == '' }
+    its(:public_suffix) { should == '' }
+    its(:domain) { should == '' }
+    its(:subdomain) { should == '' }
+    its(:path) { should == '' }
+    its(:domain_with_tld) { should == '' }
+  end
+end

data/spec/spec.opts ADDED Viewed

@@ -0,0 +1,3 @@
+--diff
+--color
+--backtrace

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require "rubygems"
+require "rspec"
+# gem install redgreen for colored test output
+begin require "redgreen" unless ENV['TM_CURRENT_LINE']; rescue LoadError; end
+path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
+$LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)
+require "#{File.dirname(__FILE__)}/../lib/domainatrix"

metadata ADDED Viewed

@@ -0,0 +1,95 @@
+--- !ruby/object:Gem::Specification
+name: shadowbq-domainatrix
+version: !ruby/object:Gem::Version
+  version: 0.0.11
+  prerelease:
+platform: ruby
+authors:
+- Paul Dix
+- Brian John
+- Shadowbq
+- Menno van der Sman
+- Wouter Broekhof
+- Wilson
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-03-21 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description:
+email:
+- shadowbq@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/domainatrix.rb
+- lib/effective_tld_names.dat
+- lib/domainatrix/domain_parser.rb
+- lib/domainatrix/url.rb
+- CHANGELOG.md
+- README.textile
+- spec/spec.opts
+- spec/spec_helper.rb
+- spec/domainatrix_spec.rb
+- spec/domainatrix/domain_parser_spec.rb
+- spec/domainatrix/url_spec.rb
+homepage: http://github.com/shadowbq/domainatrix
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 2
+summary: A cruel mistress that uses the public suffix domain list to dominate URLs
+  by canonicalizing, finding the public suffix, and breaking them into their domain
+  parts.
+test_files: []