shadowbq-domainatrix 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require File.dirname(__FILE__) + '/../spec_helper'
3
+
4
+ describe "domain parser" do
5
+ before(:all) do # shared fixture: every example in this group reads @domain_parser
6
+ @domain_parser = Domainatrix::DomainParser.new("#{File.dirname(__FILE__)}/../../lib/effective_tld_names.dat") # loads the suffix data file shipped under lib/, two directories above this spec
7
+ end
8
+
9
+ describe "reading the dat file" do
10
+ it "creates a tree of the domain names" do
11
+ @domain_parser.public_suffixes.should be_a Hash
12
+ end
13
+
14
+ it "creates the first level of the tree" do
15
+ @domain_parser.public_suffixes.should have_key("com")
16
+ end
17
+
18
+ it "creates the first level of the tree even when the first doesn't appear on a line by itself" do
19
+ @domain_parser.public_suffixes.should have_key("uk")
20
+ end
21
+
22
+ it "creates lower levels of the tree" do
23
+ @domain_parser.public_suffixes["jp"].should have_key("ac")
24
+ @domain_parser.public_suffixes["jp"]["kawasaki"].should have_key("*")
25
+ end
26
+ end
27
+
28
+ describe "parsing" do
29
+ it "returns a hash of parts" do
30
+ @domain_parser.parse("http://pauldix.net").should be_a Hash
31
+ end
32
+
33
+ it "includes the original url" do
34
+ @domain_parser.parse("http://www.pauldix.net")[:url].should == "http://www.pauldix.net/"
35
+ end
36
+
37
+ it "includes the scheme" do
38
+ @domain_parser.parse("http://www.pauldix.net")[:scheme].should == "http"
39
+ end
40
+
41
+ it "includes the full host" do
42
+ @domain_parser.parse("http://www.pauldix.net")[:host].should == "www.pauldix.net"
43
+ end
44
+
45
+ it "parses out the path" do
46
+ @domain_parser.parse("http://pauldix.net/foo.html?asdf=foo#bar")[:path].should == "/foo.html?asdf=foo#bar"
47
+ @domain_parser.parse("http://pauldix.net/foo.html?asdf=foo")[:path].should == "/foo.html?asdf=foo"
48
+ @domain_parser.parse("http://pauldix.net?asdf=foo")[:path].should == "?asdf=foo"
49
+ @domain_parser.parse("http://pauldix.net")[:path].should == ""
50
+ end
51
+
52
+ it "parses the tld" do
53
+ @domain_parser.parse("http://pauldix.net")[:public_suffix].should == "net"
54
+ @domain_parser.parse("http://pauldix.co.uk")[:public_suffix].should == "co.uk"
55
+ @domain_parser.parse("http://pauldix.com.kg")[:public_suffix].should == "com.kg"
56
+ @domain_parser.parse("http://pauldix.com.kawasaki.jp")[:public_suffix].should == "com.kawasaki.jp"
57
+ end
58
+
59
+ it "should have the domain" do
60
+ @domain_parser.parse("http://pauldix.net")[:domain].should == "pauldix"
61
+ @domain_parser.parse("http://foo.pauldix.net")[:domain].should == "pauldix"
62
+ @domain_parser.parse("http://pauldix.co.uk")[:domain].should == "pauldix"
63
+ @domain_parser.parse("http://foo.pauldix.co.uk")[:domain].should == "pauldix"
64
+ @domain_parser.parse("http://pauldix.com.kg")[:domain].should == "pauldix"
65
+ @domain_parser.parse("http://pauldix.com.kawasaki.jp")[:domain].should == "pauldix"
66
+ end
67
+
68
+ it "should have subdomains" do
69
+ @domain_parser.parse("http://foo.pauldix.net")[:subdomain].should == "foo"
70
+ @domain_parser.parse("http://bar.foo.pauldix.co.uk")[:subdomain].should == "bar.foo"
71
+ end
72
+
73
+ it "parses a link to localhost" do
74
+ parsed = @domain_parser.parse("http://localhost")
75
+ parsed[:host].should == "localhost"
76
+ parsed[:url].should == "http://localhost/"
77
+ parsed[:domain].should == "localhost"
78
+ parsed[:public_suffix].should == ""
79
+ end
80
+
81
+ it "should accept wildcards" do
82
+ @domain_parser.parse("http://*.pauldix.net")[:subdomain].should == "*"
83
+ @domain_parser.parse("http://pauldix.*")[:public_suffix].should == "*"
84
+ @domain_parser.parse("http://pauldix.net/*")[:path].should == "/*"
85
+
86
+ combined = @domain_parser.parse("http://*.pauldix.*/*")
87
+ combined[:subdomain].should == "*"
88
+ combined[:domain].should == "pauldix"
89
+ combined[:public_suffix].should == "*"
90
+ combined[:path].should == "/*"
91
+ end
92
+
93
+ it "should parse a URL if it has a wildcard exception" do
94
+ @domain_parser.parse("http://metro.tokyo.jp")[:domain].should == "metro"
95
+ end
96
+
97
+ it "should throw an exception if the tld is not valid" do
98
+ lambda { @domain_parser.parse("http://pauldix.nett") }.should raise_error(Domainatrix::ParseError)
99
+ end
100
+
101
+ it "should throw an exception if the domain doesn't contain a valid host" do
102
+ lambda { @domain_parser.parse("http://co.jp") }.should raise_error(Domainatrix::ParseError)
103
+ end
104
+
105
+ it "should throw an exception if the domain contains an invalid character" do
106
+ lambda { @domain_parser.parse("http://pauldix,net") }.should raise_error(Domainatrix::ParseError)
107
+ end
108
+
109
+ it "should thrown an exception if the url is malformed" do
110
+ lambda { @domain_parser.parse("http:/") }.should raise_error(Domainatrix::ParseError)
111
+ end
112
+
113
+ it "parses an ip address" do
114
+ @domain_parser.parse("http://123.123.123.123/foo/bar")[:domain].should == "123.123.123.123"
115
+ @domain_parser.parse("http://123.123.123.123/foo/bar")[:path].should == "/foo/bar"
116
+ @domain_parser.parse("http://123.123.123.123/foo/bar")[:ip_address].should == true
117
+ end
118
+
119
+ it "parses a host with numeric domain" do
120
+ @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:subdomain].should == "123.123"
121
+ @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:domain].should == "123"
122
+ @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:public_suffix].should == "co.uk"
123
+ @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:ip_address].should == false
124
+ end
125
+
126
+ it "should not parse an invalid ip address" do
127
+ lambda { @domain_parser.parse("http://12345") }.should raise_error(Domainatrix::ParseError)
128
+ end
129
+
130
+ it "defaults to http if no scheme is applied" do
131
+ @domain_parser.parse("www.pauldix.net")[:host].should == "www.pauldix.net"
132
+ @domain_parser.parse("www.pauldix.net")[:scheme].should == "http"
133
+ end
134
+
135
+ end
136
+
137
+ describe "handling utf-8" do
138
+
139
+ it "handles public suffixes with utf-8" do
140
+ @domain_parser.parse("http://pauldix.السعوديه")[:public_suffix].should == "السعوديه"
141
+ @domain_parser.parse("http://pauldix.臺灣")[:public_suffix].should == "臺灣"
142
+ @domain_parser.parse("http://pauldix.السعوديه")[:domain].should == "pauldix"
143
+ @domain_parser.parse("http://pauldix.臺灣")[:domain].should == "pauldix"
144
+ end
145
+
146
+ it "handles unicode urls as puny code" do
147
+ input = "http://✪df.ws/fil"
148
+ parsed = @domain_parser.parse(input)
149
+ parsed[:url].should == "http://xn--df-oiy.ws/fil" # :url is expected in punycode (ASCII) form
150
+ parsed[:host].should == "✪df.ws" # :host is expected to keep the original unicode label
151
+ parsed[:path].should == "/fil"
152
+ parsed[:public_suffix].should == "ws"
153
+ end
154
+
155
+ end
156
+
157
+ end
@@ -0,0 +1,64 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe "url" do
4
+ it "has the original url" do
5
+ Domainatrix::Url.new(:url => "http://pauldix.net").url.should == "http://pauldix.net"
6
+ end
7
+
8
+ it "has the public_suffix" do
9
+ Domainatrix::Url.new(:public_suffix => "net").public_suffix.should == "net"
10
+ end
11
+
12
+ it "has the domain" do
13
+ Domainatrix::Url.new(:domain => "pauldix").domain.should == "pauldix"
14
+ end
15
+
16
+ it "has the subdomain" do
17
+ Domainatrix::Url.new(:subdomain => "foo").subdomain.should == "foo"
18
+ end
19
+
20
+ it "has the path" do
21
+ Domainatrix::Url.new(:path => "/asdf.html").path.should == "/asdf.html"
22
+ end
23
+
24
+ it "reports if it is an ip address" do
25
+ Domainatrix::Url.new(:ip_address => true).ip_address.should == true
26
+ end
27
+
28
+ it "canonicalizes the url" do
29
+ Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "net").canonical.should == "net.pauldix"
30
+ Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "net").canonical.should == "net.pauldix.foo"
31
+ Domainatrix::Url.new(:subdomain => "foo.bar", :domain => "pauldix", :public_suffix => "net").canonical.should == "net.pauldix.bar.foo"
32
+ Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "co.uk").canonical.should == "uk.co.pauldix"
33
+ Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "co.uk").canonical.should == "uk.co.pauldix.foo"
34
+ Domainatrix::Url.new(:subdomain => "foo.bar", :domain => "pauldix", :public_suffix => "co.uk").canonical.should == "uk.co.pauldix.bar.foo"
35
+ Domainatrix::Url.new(:subdomain => "", :domain => "pauldix", :public_suffix => "co.uk").canonical.should == "uk.co.pauldix"
36
+ end
37
+
38
+ it "canonicalizes the url with the path" do
39
+ Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "net", :path => "/hello").canonical.should == "net.pauldix.foo/hello"
40
+ end
41
+
42
+ it "canonicalizes the url without the path" do
43
+ Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "net").canonical(:include_path => false).should == "net.pauldix.foo"
44
+ end
45
+
46
+ it "combines the domain with the public_suffix" do
47
+ Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "net").domain_with_public_suffix.should == "pauldix.net"
48
+ Domainatrix::Url.new(:domain => "foo", :public_suffix => "co.uk" ).domain_with_public_suffix.should == "foo.co.uk"
49
+ Domainatrix::Url.new(:subdomain => "baz", :domain => "bar", :public_suffix => "com").domain_with_public_suffix.should == "bar.com"
50
+ end
51
+
52
+ it "combines the domain with the public_suffix as an alias" do
53
+ Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "net").domain_with_tld.should == "pauldix.net"
54
+ Domainatrix::Url.new(:domain => "foo", :public_suffix => "co.uk" ).domain_with_tld.should == "foo.co.uk"
55
+ Domainatrix::Url.new(:subdomain => "baz", :domain => "bar", :public_suffix => "com").domain_with_tld.should == "bar.com"
56
+ end
57
+
58
+ it "converts the url to a string" do
59
+ Domainatrix::Url.new(:scheme => "http", :subdomain => "www", :domain => "pauldix", :public_suffix => "net", :path => "/some/path").to_s.should == "http://www.pauldix.net/some/path"
60
+ Domainatrix::Url.new(:subdomain => "www", :domain => "pauldix", :public_suffix => "net", :path => "/some/path").to_s.should == "www.pauldix.net/some/path"
61
+ Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "co.uk").to_s.should == "pauldix.co.uk"
62
+ end
63
+
64
+ end
@@ -0,0 +1,106 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ describe Domainatrix do
4
+ describe ".parse" do
5
+ it "should convert a string into a url object" do
6
+ Domainatrix.parse("http://pauldix.net").should be_a Domainatrix::Url
7
+ end
8
+
9
+ it "should canonicalize" do
10
+ Domainatrix.parse("http://pauldix.net").canonical.should == "net.pauldix"
11
+ Domainatrix.parse("http://pauldix.net/foo.html").canonical.should == "net.pauldix/foo.html"
12
+ Domainatrix.parse("http://pauldix.net/foo.html?asdf=bar").canonical.should == "net.pauldix/foo.html?asdf=bar"
13
+ Domainatrix.parse("http://foo.pauldix.net").canonical.should == "net.pauldix.foo"
14
+ Domainatrix.parse("http://foo.bar.pauldix.net").canonical.should == "net.pauldix.bar.foo"
15
+ Domainatrix.parse("http://pauldix.co.uk").canonical.should == "uk.co.pauldix"
16
+ end
17
+ end
18
+
19
+ describe ".scan" do
20
+ it "parses the url found in a string" do
21
+ input = "HAHA. This is why Conan should stay: http://losangeles.craigslist.org/sfv/clt/1551463643.html"
22
+ url = Domainatrix.scan(input).first
23
+ url.canonical.should == "org.craigslist.losangeles/sfv/clt/1551463643.html"
24
+ end
25
+
26
+ it "handles shouting" do
27
+ input = "TONIGHT!! @chelseavperetti @toddglass @dougbenson @realjeffreyross ME and Tig Notaro http://WWW.OPCCEVENTS.ORG/"
28
+ url = Domainatrix.scan(input).first
29
+ url.should_not be_nil
30
+ url.url.should == "http://www.opccevents.org/"
31
+ end
32
+
33
+
34
+ it "finds multiple urls in a string" do
35
+ input = <<-TEXT
36
+ http://google.com
37
+ and then http://yahoo.com
38
+ TEXT
39
+ google, yahoo = Domainatrix.scan(input)
40
+ google.domain.should == "google"
41
+ yahoo.domain.should == "yahoo"
42
+ end
43
+
44
+ it "returns a map of results when given a block" do
45
+ input = "http://a.com https://b.com"
46
+ domains = Domainatrix.scan(input) do |url|
47
+ url.domain
48
+ end
49
+ domains.should == %w(a b)
50
+ end
51
+
52
+ it "returns an empty array when no urls are found" do
53
+ Domainatrix.scan("Nope").should == []
54
+ end
55
+
56
+ it "removes unlikely characters from the end of URLs" do
57
+ input = <<-TEXT
58
+ Check out http://tobtr.com/s/821921.
59
+ Oh, and also (http://www.google.com): Cool stuff!
60
+ http://fora.tv/v/c8637, is almost as good as http://example.com...
61
+ http://foo.com" <http://baz.com>
62
+ TEXT
63
+
64
+ urls = Domainatrix.scan(input).map {|u| u.url}
65
+ urls.should == %w(http://tobtr.com/s/821921 http://www.google.com/ http://fora.tv/v/c8637 http://example.com/ http://foo.com/ http://baz.com/)
66
+ end
67
+
68
+ end
69
+
70
+ context 'localhost with a port' do
71
+ subject { Domainatrix.parse('localhost:3000') }
72
+ its(:scheme) { should == 'http' }
73
+ its(:host) { should == 'localhost' }
74
+ its(:url) { should == 'http://localhost:3000/' }
75
+ its(:public_suffix) { should == '' }
76
+ its(:domain) { should == 'localhost' }
77
+ its(:subdomain) { should == '' }
78
+ its(:path) { should == '' }
79
+ its(:domain_with_tld) { should == 'localhost' }
80
+ end
81
+
82
+ context 'without a scheme' do
83
+ subject { Domainatrix.parse('www.pauldix.net') }
84
+ its(:scheme) { should == 'http' }
85
+ its(:host) { should == 'www.pauldix.net' }
86
+ its(:url) { should == 'http://www.pauldix.net/' }
87
+ its(:public_suffix) { should == 'net' }
88
+ its(:domain) { should == 'pauldix' }
89
+ its(:subdomain) { should == 'www' }
90
+ its(:path) { should == '' }
91
+ its(:domain_with_tld) { should == 'pauldix.net' }
92
+ end
93
+
94
+ context 'with a blank url' do
95
+ subject { Domainatrix.parse(nil) }
96
+ its(:scheme) { should == '' }
97
+ its(:host) { should == '' }
98
+ its(:url) { should == '' }
99
+ its(:public_suffix) { should == '' }
100
+ its(:domain) { should == '' }
101
+ its(:subdomain) { should == '' }
102
+ its(:path) { should == '' }
103
+ its(:domain_with_tld) { should == '' }
104
+ end
105
+
106
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,3 @@
1
+ --diff
2
+ --color
3
+ --backtrace
@@ -0,0 +1,10 @@
1
+ require "rubygems"
2
+ require "rspec"
3
+
4
+ # gem install redgreen for colored test output (optional: the require is skipped when ENV['TM_CURRENT_LINE'] is set, and a missing gem is ignored via rescue LoadError)
5
+ begin require "redgreen" unless ENV['TM_CURRENT_LINE']; rescue LoadError; end
6
+
7
+ path = File.expand_path(File.dirname(__FILE__) + "/../lib/") # absolute path of the lib/ directory that sits beside this file's directory
8
+ $LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path) # prepend lib/ to the load path; the guard avoids adding a duplicate entry
9
+
10
+ require "#{File.dirname(__FILE__)}/../lib/domainatrix"
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: shadowbq-domainatrix
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.11
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Paul Dix
9
+ - Brian John
10
+ - Shadowbq
11
+ - Menno van der Sman
12
+ - Wouter Broekhof
13
+ - Wilson
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+ date: 2013-03-21 00:00:00.000000000 Z
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: addressable
21
+ requirement: !ruby/object:Gem::Requirement
22
+ none: false
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ type: :runtime
28
+ prerelease: false
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ - !ruby/object:Gem::Dependency
36
+ name: rspec
37
+ requirement: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ none: false
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ description:
52
+ email:
53
+ - shadowbq@gmail.com
54
+ executables: []
55
+ extensions: []
56
+ extra_rdoc_files: []
57
+ files:
58
+ - lib/domainatrix.rb
59
+ - lib/effective_tld_names.dat
60
+ - lib/domainatrix/domain_parser.rb
61
+ - lib/domainatrix/url.rb
62
+ - CHANGELOG.md
63
+ - README.textile
64
+ - spec/spec.opts
65
+ - spec/spec_helper.rb
66
+ - spec/domainatrix_spec.rb
67
+ - spec/domainatrix/domain_parser_spec.rb
68
+ - spec/domainatrix/url_spec.rb
69
+ homepage: http://github.com/shadowbq/domainatrix
70
+ licenses: []
71
+ post_install_message:
72
+ rdoc_options: []
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ none: false
83
+ requirements:
84
+ - - ! '>='
85
+ - !ruby/object:Gem::Version
86
+ version: '0'
87
+ requirements: []
88
+ rubyforge_project:
89
+ rubygems_version: 1.8.24
90
+ signing_key:
91
+ specification_version: 2
92
+ summary: A cruel mistress that uses the public suffix domain list to dominate URLs
93
+ by canonicalizing, finding the public suffix, and breaking them into their domain
94
+ parts.
95
+ test_files: []