shadowbq-domainatrix 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,157 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require File.dirname(__FILE__) + '/../spec_helper'
3
+
4
# Specs for Domainatrix::DomainParser, run against the public suffix list
# stored at lib/effective_tld_names.dat.
describe "domain parser" do
  # before(:all) runs once for the group, so every example shares one parser.
  before(:all) do
    @domain_parser = Domainatrix::DomainParser.new("#{File.dirname(__FILE__)}/../../lib/effective_tld_names.dat")
  end

  # The suffix list is expected to load into nested hashes keyed by label.
  describe "reading the dat file" do
    it "creates a tree of the domain names" do
      @domain_parser.public_suffixes.should be_a Hash
    end

    it "creates the first level of the tree" do
      @domain_parser.public_suffixes.should have_key("com")
    end

    it "creates the first level of the tree even when the first doesn't appear on a line by itself" do
      @domain_parser.public_suffixes.should have_key("uk")
    end

    it "creates lower levels of the tree" do
      @domain_parser.public_suffixes["jp"].should have_key("ac")
      @domain_parser.public_suffixes["jp"]["kawasaki"].should have_key("*")
    end
  end

  # parse is expected to return a Hash with the keys checked below:
  # :url, :scheme, :host, :path, :public_suffix, :domain, :subdomain, :ip_address.
  describe "parsing" do
    it "returns a hash of parts" do
      @domain_parser.parse("http://pauldix.net").should be_a Hash
    end

    # Note the expected :url carries a trailing slash the input did not have.
    it "includes the original url" do
      @domain_parser.parse("http://www.pauldix.net")[:url].should == "http://www.pauldix.net/"
    end

    it "includes the scheme" do
      @domain_parser.parse("http://www.pauldix.net")[:scheme].should == "http"
    end

    it "includes the full host" do
      @domain_parser.parse("http://www.pauldix.net")[:host].should == "www.pauldix.net"
    end

    # :path is expected to include the query string and the fragment.
    it "parses out the path" do
      @domain_parser.parse("http://pauldix.net/foo.html?asdf=foo#bar")[:path].should == "/foo.html?asdf=foo#bar"
      @domain_parser.parse("http://pauldix.net/foo.html?asdf=foo")[:path].should == "/foo.html?asdf=foo"
      @domain_parser.parse("http://pauldix.net?asdf=foo")[:path].should == "?asdf=foo"
      @domain_parser.parse("http://pauldix.net")[:path].should == ""
    end

    it "parses the tld" do
      @domain_parser.parse("http://pauldix.net")[:public_suffix].should == "net"
      @domain_parser.parse("http://pauldix.co.uk")[:public_suffix].should == "co.uk"
      @domain_parser.parse("http://pauldix.com.kg")[:public_suffix].should == "com.kg"
      @domain_parser.parse("http://pauldix.com.kawasaki.jp")[:public_suffix].should == "com.kawasaki.jp"
    end

    it "should have the domain" do
      @domain_parser.parse("http://pauldix.net")[:domain].should == "pauldix"
      @domain_parser.parse("http://foo.pauldix.net")[:domain].should == "pauldix"
      @domain_parser.parse("http://pauldix.co.uk")[:domain].should == "pauldix"
      @domain_parser.parse("http://foo.pauldix.co.uk")[:domain].should == "pauldix"
      @domain_parser.parse("http://pauldix.com.kg")[:domain].should == "pauldix"
      @domain_parser.parse("http://pauldix.com.kawasaki.jp")[:domain].should == "pauldix"
    end

    it "should have subdomains" do
      @domain_parser.parse("http://foo.pauldix.net")[:subdomain].should == "foo"
      @domain_parser.parse("http://bar.foo.pauldix.co.uk")[:subdomain].should == "bar.foo"
    end

    # localhost has no public suffix; the whole host is reported as the domain.
    it "parses a link to localhost" do
      parsed = @domain_parser.parse("http://localhost")
      parsed[:host].should == "localhost"
      parsed[:url].should == "http://localhost/"
      parsed[:domain].should == "localhost"
      parsed[:public_suffix].should == ""
    end

    # A literal "*" is accepted in the subdomain, public suffix and path positions.
    it "should accept wildcards" do
      @domain_parser.parse("http://*.pauldix.net")[:subdomain].should == "*"
      @domain_parser.parse("http://pauldix.*")[:public_suffix].should == "*"
      @domain_parser.parse("http://pauldix.net/*")[:path].should == "/*"

      combined = @domain_parser.parse("http://*.pauldix.*/*")
      combined[:subdomain].should == "*"
      combined[:domain].should == "pauldix"
      combined[:public_suffix].should == "*"
      combined[:path].should == "/*"
    end

    it "should parse a URL if it has a wildcard exception" do
      @domain_parser.parse("http://metro.tokyo.jp")[:domain].should == "metro"
    end

    # Failure cases: each of these inputs must raise Domainatrix::ParseError.
    it "should throw an exception if the tld is not valid" do
      lambda { @domain_parser.parse("http://pauldix.nett") }.should raise_error(Domainatrix::ParseError)
    end

    it "should throw an exception if the domain doesn't contain a valid host" do
      lambda { @domain_parser.parse("http://co.jp") }.should raise_error(Domainatrix::ParseError)
    end

    it "should throw an exception if the domain contains an invalid character" do
      lambda { @domain_parser.parse("http://pauldix,net") }.should raise_error(Domainatrix::ParseError)
    end

    it "should throw an exception if the url is malformed" do
      lambda { @domain_parser.parse("http:/") }.should raise_error(Domainatrix::ParseError)
    end

    # For a dotted-quad host the whole address is the :domain and :ip_address is true.
    it "parses an ip address" do
      @domain_parser.parse("http://123.123.123.123/foo/bar")[:domain].should == "123.123.123.123"
      @domain_parser.parse("http://123.123.123.123/foo/bar")[:path].should == "/foo/bar"
      @domain_parser.parse("http://123.123.123.123/foo/bar")[:ip_address].should == true
    end

    # Numeric labels under a real public suffix are an ordinary hostname, not an IP.
    it "parses a host with numeric domain" do
      @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:subdomain].should == "123.123"
      @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:domain].should == "123"
      @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:public_suffix].should == "co.uk"
      @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:ip_address].should == false
    end

    it "should not parse an invalid ip address" do
      lambda { @domain_parser.parse("http://12345") }.should raise_error(Domainatrix::ParseError)
    end

    it "defaults to http if no scheme is applied" do
      @domain_parser.parse("www.pauldix.net")[:host].should == "www.pauldix.net"
      @domain_parser.parse("www.pauldix.net")[:scheme].should == "http"
    end

  end

  describe "handling utf-8" do

    it "handles public suffixes with utf-8" do
      @domain_parser.parse("http://pauldix.السعوديه")[:public_suffix].should == "السعوديه"
      @domain_parser.parse("http://pauldix.臺灣")[:public_suffix].should == "臺灣"
      @domain_parser.parse("http://pauldix.السعوديه")[:domain].should == "pauldix"
      @domain_parser.parse("http://pauldix.臺灣")[:domain].should == "pauldix"
    end

    # :url is expected in punycode form while :host keeps the unicode input.
    it "handles unicode urls as puny code" do
      input = "http://✪df.ws/fil"
      parsed = @domain_parser.parse(input)
      parsed[:url].should == "http://xn--df-oiy.ws/fil"
      parsed[:host].should == "✪df.ws"
      parsed[:path].should == "/fil"
      parsed[:public_suffix].should == "ws"
    end

  end

end
@@ -0,0 +1,64 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# Specs for Domainatrix::Url: attribute readers, canonical form, the
# domain/public-suffix combination and string conversion.
describe "url" do
  # Builds the object under test from an attribute hash.
  def url_for(attributes)
    Domainatrix::Url.new(attributes)
  end

  it "has the original url" do
    built = url_for(:url => "http://pauldix.net")
    built.url.should == "http://pauldix.net"
  end

  it "has the public_suffix" do
    built = url_for(:public_suffix => "net")
    built.public_suffix.should == "net"
  end

  it "has the domain" do
    built = url_for(:domain => "pauldix")
    built.domain.should == "pauldix"
  end

  it "has the subdomain" do
    built = url_for(:subdomain => "foo")
    built.subdomain.should == "foo"
  end

  it "has the path" do
    built = url_for(:path => "/asdf.html")
    built.path.should == "/asdf.html"
  end

  it "reports if it is an ip address" do
    built = url_for(:ip_address => true)
    built.ip_address.should == true
  end

  # The canonical form lists the labels in reverse order: suffix, domain, subdomain.
  it "canonicalizes the url" do
    cases = [
      [{ :domain => "pauldix", :public_suffix => "net" }, "net.pauldix"],
      [{ :subdomain => "foo", :domain => "pauldix", :public_suffix => "net" }, "net.pauldix.foo"],
      [{ :subdomain => "foo.bar", :domain => "pauldix", :public_suffix => "net" }, "net.pauldix.bar.foo"],
      [{ :domain => "pauldix", :public_suffix => "co.uk" }, "uk.co.pauldix"],
      [{ :subdomain => "foo", :domain => "pauldix", :public_suffix => "co.uk" }, "uk.co.pauldix.foo"],
      [{ :subdomain => "foo.bar", :domain => "pauldix", :public_suffix => "co.uk" }, "uk.co.pauldix.bar.foo"],
      [{ :subdomain => "", :domain => "pauldix", :public_suffix => "co.uk" }, "uk.co.pauldix"]
    ]

    cases.each do |attributes, expected|
      url_for(attributes).canonical.should == expected
    end
  end

  it "canonicalizes the url with the path" do
    attributes = { :subdomain => "foo", :domain => "pauldix", :public_suffix => "net", :path => "/hello" }
    url_for(attributes).canonical.should == "net.pauldix.foo/hello"
  end

  # NOTE(review): no :path is supplied here, so this example would also pass
  # if :include_path were ignored -- confirm whether a path should be added.
  it "canonicalizes the url without the path" do
    attributes = { :subdomain => "foo", :domain => "pauldix", :public_suffix => "net" }
    url_for(attributes).canonical(:include_path => false).should == "net.pauldix.foo"
  end

  it "combines the domain with the public_suffix" do
    cases = [
      [{ :domain => "pauldix", :public_suffix => "net" }, "pauldix.net"],
      [{ :domain => "foo", :public_suffix => "co.uk" }, "foo.co.uk"],
      [{ :subdomain => "baz", :domain => "bar", :public_suffix => "com" }, "bar.com"]
    ]

    cases.each do |attributes, expected|
      url_for(attributes).domain_with_public_suffix.should == expected
    end
  end

  # domain_with_tld is checked against the same cases as domain_with_public_suffix.
  it "combines the domain with the public_suffix as an alias" do
    cases = [
      [{ :domain => "pauldix", :public_suffix => "net" }, "pauldix.net"],
      [{ :domain => "foo", :public_suffix => "co.uk" }, "foo.co.uk"],
      [{ :subdomain => "baz", :domain => "bar", :public_suffix => "com" }, "bar.com"]
    ]

    cases.each do |attributes, expected|
      url_for(attributes).domain_with_tld.should == expected
    end
  end

  # to_s leaves out whichever of scheme, subdomain and path were not supplied.
  it "converts the url to a string" do
    cases = [
      [{ :scheme => "http", :subdomain => "www", :domain => "pauldix", :public_suffix => "net", :path => "/some/path" }, "http://www.pauldix.net/some/path"],
      [{ :subdomain => "www", :domain => "pauldix", :public_suffix => "net", :path => "/some/path" }, "www.pauldix.net/some/path"],
      [{ :domain => "pauldix", :public_suffix => "co.uk" }, "pauldix.co.uk"]
    ]

    cases.each do |attributes, expected|
      url_for(attributes).to_s.should == expected
    end
  end

end
@@ -0,0 +1,106 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
# Specs for the module-level entry points Domainatrix.parse and Domainatrix.scan.
describe Domainatrix do
  describe ".parse" do
    it "should convert a string into a url object" do
      parsed = Domainatrix.parse("http://pauldix.net")
      parsed.should be_a Domainatrix::Url
    end

    it "should canonicalize" do
      expectations = [
        ["http://pauldix.net", "net.pauldix"],
        ["http://pauldix.net/foo.html", "net.pauldix/foo.html"],
        ["http://pauldix.net/foo.html?asdf=bar", "net.pauldix/foo.html?asdf=bar"],
        ["http://foo.pauldix.net", "net.pauldix.foo"],
        ["http://foo.bar.pauldix.net", "net.pauldix.bar.foo"],
        ["http://pauldix.co.uk", "uk.co.pauldix"]
      ]

      expectations.each do |raw, canonical|
        Domainatrix.parse(raw).canonical.should == canonical
      end
    end
  end

  # scan pulls URLs out of free text; without a block it returns the parsed
  # urls, with a block it returns the block's result for each url.
  describe ".scan" do
    it "parses the url found in a string" do
      text = "HAHA. This is why Conan should stay: http://losangeles.craigslist.org/sfv/clt/1551463643.html"
      found = Domainatrix.scan(text).first
      found.canonical.should == "org.craigslist.losangeles/sfv/clt/1551463643.html"
    end

    # An upper-case URL is still found and comes back lower-cased.
    it "handles shouting" do
      text = "TONIGHT!! @chelseavperetti @toddglass @dougbenson @realjeffreyross ME and Tig Notaro http://WWW.OPCCEVENTS.ORG/"
      found = Domainatrix.scan(text).first
      found.should_not be_nil
      found.url.should == "http://www.opccevents.org/"
    end


    it "finds multiple urls in a string" do
      text = <<-TEXT
        http://google.com
        and then http://yahoo.com
      TEXT
      first_hit, second_hit = Domainatrix.scan(text)
      first_hit.domain.should == "google"
      second_hit.domain.should == "yahoo"
    end

    it "returns a map of results when given a block" do
      text = "http://a.com https://b.com"
      mapped = Domainatrix.scan(text) { |url| url.domain }
      mapped.should == %w(a b)
    end

    it "returns an empty array when no urls are found" do
      Domainatrix.scan("Nope").should == []
    end

    # Trailing punctuation, brackets and quotes around a URL must not end up in it.
    it "removes unlikely characters from the end of URLs" do
      text = <<-TEXT
        Check out http://tobtr.com/s/821921.
        Oh, and also (http://www.google.com): Cool stuff!
        http://fora.tv/v/c8637, is almost as good as http://example.com...
        http://foo.com" <http://baz.com>
      TEXT

      expected = [
        "http://tobtr.com/s/821921",
        "http://www.google.com/",
        "http://fora.tv/v/c8637",
        "http://example.com/",
        "http://foo.com/",
        "http://baz.com/"
      ]
      Domainatrix.scan(text).map { |found| found.url }.should == expected
    end

  end

  # Each context below generates one example per [attribute, expected] pair,
  # in the listed order, against the parsed subject.
  context 'localhost with a port' do
    subject { Domainatrix.parse('localhost:3000') }

    [
      [:scheme,          'http'],
      [:host,            'localhost'],
      [:url,             'http://localhost:3000/'],
      [:public_suffix,   ''],
      [:domain,          'localhost'],
      [:subdomain,       ''],
      [:path,            ''],
      [:domain_with_tld, 'localhost']
    ].each do |attribute, expected|
      its(attribute) { should == expected }
    end
  end

  context 'without a scheme' do
    subject { Domainatrix.parse('www.pauldix.net') }

    [
      [:scheme,          'http'],
      [:host,            'www.pauldix.net'],
      [:url,             'http://www.pauldix.net/'],
      [:public_suffix,   'net'],
      [:domain,          'pauldix'],
      [:subdomain,       'www'],
      [:path,            ''],
      [:domain_with_tld, 'pauldix.net']
    ].each do |attribute, expected|
      its(attribute) { should == expected }
    end
  end

  # Parsing nil is expected to give a url whose listed attributes are all "".
  context 'with a blank url' do
    subject { Domainatrix.parse(nil) }

    [
      :scheme,
      :host,
      :url,
      :public_suffix,
      :domain,
      :subdomain,
      :path,
      :domain_with_tld
    ].each do |attribute|
      its(attribute) { should == '' }
    end
  end

end
data/spec/spec.opts ADDED
@@ -0,0 +1,3 @@
1
+ --diff
2
+ --color
3
+ --backtrace
@@ -0,0 +1,10 @@
1
+ require "rubygems"
2
+ require "rspec"
3
+
4
# Optional: `gem install redgreen` for colored test output. The require is
# skipped when TM_CURRENT_LINE is set, and a missing gem is ignored.
unless ENV['TM_CURRENT_LINE']
  begin
    require "redgreen"
  rescue LoadError
    # redgreen is not installed; carry on with plain output.
  end
end

# Put the gem's lib directory at the front of the load path, once only.
spec_dir = File.dirname(__FILE__)
lib_dir = File.expand_path(spec_dir + "/../lib/")
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

# Load the library under test.
require "#{spec_dir}/../lib/domainatrix"
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: shadowbq-domainatrix
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.11
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Paul Dix
9
+ - Brian John
10
+ - Shadowbq
11
+ - Menno van der Sman
12
+ - Wouter Broekhof
13
+ - Wilson
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+ date: 2013-03-21 00:00:00.000000000 Z
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: addressable
21
+ requirement: !ruby/object:Gem::Requirement
22
+ none: false
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ type: :runtime
28
+ prerelease: false
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ - !ruby/object:Gem::Dependency
36
+ name: rspec
37
+ requirement: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ none: false
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ description:
52
+ email:
53
+ - shadowbq@gmail.com
54
+ executables: []
55
+ extensions: []
56
+ extra_rdoc_files: []
57
+ files:
58
+ - lib/domainatrix.rb
59
+ - lib/effective_tld_names.dat
60
+ - lib/domainatrix/domain_parser.rb
61
+ - lib/domainatrix/url.rb
62
+ - CHANGELOG.md
63
+ - README.textile
64
+ - spec/spec.opts
65
+ - spec/spec_helper.rb
66
+ - spec/domainatrix_spec.rb
67
+ - spec/domainatrix/domain_parser_spec.rb
68
+ - spec/domainatrix/url_spec.rb
69
+ homepage: http://github.com/shadowbq/domainatrix
70
+ licenses: []
71
+ post_install_message:
72
+ rdoc_options: []
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ none: false
83
+ requirements:
84
+ - - ! '>='
85
+ - !ruby/object:Gem::Version
86
+ version: '0'
87
+ requirements: []
88
+ rubyforge_project:
89
+ rubygems_version: 1.8.24
90
+ signing_key:
91
+ specification_version: 2
92
+ summary: A cruel mistress that uses the public suffix domain list to dominate URLs
93
+ by canonicalizing, finding the public suffix, and breaking them into their domain
94
+ parts.
95
+ test_files: []