f1sherman-domainatrix 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +64 -0
- data/lib/domainatrix/domain_parser.rb +77 -0
- data/lib/domainatrix/url.rb +33 -0
- data/lib/domainatrix.rb +14 -0
- data/lib/effective_tld_names.dat +5189 -0
- data/spec/domainatrix/domain_parser_spec.rb +71 -0
- data/spec/domainatrix/url_spec.rb +54 -0
- data/spec/domainatrix_spec.rb +16 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +10 -0
- metadata +87 -0
data/README.textile
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
h1. Domainatrix
|
2
|
+
|
3
|
+
"http://github.com/pauldix/domainatrix":http://github.com/pauldix/domainatrix
|
4
|
+
|
5
|
+
h2. Summary
|
6
|
+
|
7
|
+
A cruel mistress that uses the public suffix domain list to dominate URLs by canonicalizing, finding public suffixes, and breaking them into their domain parts.
|
8
|
+
|
9
|
+
h2. Description
|
10
|
+
|
11
|
+
This simple library can parse a URL into its canonical form. It uses the list of domains from "http://publicsuffix.org":http://publicsuffix.org to break the domain into its public suffix, domain, and subdomain.
|
12
|
+
|
13
|
+
h2. Installation
|
14
|
+
|
15
|
+
<pre>
|
16
|
+
gem install domainatrix --source http://gemcutter.org
|
17
|
+
</pre>
|
18
|
+
|
19
|
+
h2. Use
|
20
|
+
|
21
|
+
<pre>
|
22
|
+
require 'rubygems'
|
23
|
+
require 'domainatrix'
|
24
|
+
|
25
|
+
url = Domainatrix.parse("http://www.pauldix.net")
|
26
|
+
url.url # => "http://www.pauldix.net" (the original url)
|
27
|
+
url.public_suffix # => "net"
|
28
|
+
url.domain # => "pauldix"
|
29
|
+
url.canonical # => "net.pauldix.www"
|
30
|
+
|
31
|
+
url = Domainatrix.parse("http://foo.bar.pauldix.co.uk/asdf.html?q=arg")
|
32
|
+
url.public_suffix # => "co.uk"
|
33
|
+
url.domain # => "pauldix"
|
34
|
+
url.subdomain # => "foo.bar"
|
35
|
+
url.path # => "/asdf.html?q=arg"
|
36
|
+
url.canonical # => "uk.co.pauldix.bar.foo/asdf.html?q=arg"
|
37
|
+
</pre>
|
38
|
+
|
39
|
+
h2. LICENSE
|
40
|
+
|
41
|
+
(The MIT License)
|
42
|
+
|
43
|
+
Copyright (c) 2009:
|
44
|
+
|
45
|
+
"Paul Dix":http://pauldix.net
|
46
|
+
|
47
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
48
|
+
a copy of this software and associated documentation files (the
|
49
|
+
'Software'), to deal in the Software without restriction, including
|
50
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
51
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
52
|
+
permit persons to whom the Software is furnished to do so, subject to
|
53
|
+
the following conditions:
|
54
|
+
|
55
|
+
The above copyright notice and this permission notice shall be
|
56
|
+
included in all copies or substantial portions of the Software.
|
57
|
+
|
58
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
59
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
60
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
61
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
62
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
63
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
64
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Domainatrix
  # Splits a URL's host into public suffix, domain and subdomain using a
  # public suffix list read from a .dat file (one rule per line, "//" marks
  # comment lines).
  class DomainParser
    # Mixing in Addressable makes the bare +URI+ constant used in #parse
    # resolve to Addressable::URI.
    include Addressable

    # Nested hash of suffix rules keyed label by label, right-most label
    # first: the rule "co.uk" is stored as {"uk" => {"co" => {}}}.
    # Wildcard rules appear as a "*" key and exception rules as a key that
    # starts with "!".
    attr_reader :public_suffixes

    # file_name - path of the public suffix list to load.
    def initialize(file_name)
      @public_suffixes = {}
      read_dat_file(file_name)
    end

    # Loads every rule of the given suffix list into #public_suffixes.
    # Blank lines and lines containing "//" are skipped.
    def read_dat_file(file_name)
      # If we're in 1.9+, make sure we're opening it in UTF-8
      mode = RUBY_VERSION >= '1.9' ? "r:UTF-8" : "r"

      # Block form so the file handle is closed even if reading raises.
      File.open(file_name, mode) do |dat_file|
        dat_file.each_line do |line|
          line = line.strip
          next if line.empty? || line =~ %r{//}

          sub_hash = @public_suffixes
          line.split(".").reverse.each do |part|
            sub_hash = (sub_hash[part] ||= {})
          end
        end
      end
    end

    # Parses +url+ and returns a hash with the keys :public_suffix, :domain,
    # :subdomain, :scheme, :host, :path (path plus "?query" when a query is
    # present) and :url (the string that was passed in).
    def parse(url)
      uri = URI.parse(url)
      path = uri.query ? "#{uri.path}?#{uri.query}" : uri.path

      parse_domains_from_host(uri.host).merge({
        :scheme => uri.scheme,
        :host   => uri.host,
        :path   => path,
        :url    => url
      })
    end

    # Breaks +host+ into {:public_suffix, :domain, :subdomain}, each a
    # dot-joined string in normal (left-to-right) order.
    #
    # A nil or empty host yields three empty strings. A host whose right-most
    # label is not in the list is handled with that single label as its
    # public suffix. A host that is nothing but a public suffix gets an empty
    # :domain.
    def parse_domains_from_host(host)
      parts = host.to_s.split(".").reverse
      public_suffix = []
      domain = ""
      subdomains = []
      sub_hash = @public_suffixes

      parts.each_index do |i|
        part = parts[i]
        next_part = parts[i + 1]

        # An unlisted label has no rules below it; treat it as a leaf.
        sub_hash = sub_hash[part] || {}
        public_suffix << part

        # A wildcard swallows the next label into the suffix, unless there is
        # no next label or an exception rule ("!label") exempts it.
        wildcard = next_part && sub_hash.has_key?("*") &&
                   !sub_hash.has_key?("!#{next_part}")

        if wildcard
          public_suffix << next_part
          domain = parts[i + 2].to_s
          subdomains = parts[(i + 3)..-1] || []
          break
        elsif !sub_hash.has_key?(next_part)
          # No deeper rule matches, so the suffix ends here.
          domain = next_part.to_s
          subdomains = parts[(i + 2)..-1] || []
          break
        end
      end

      {
        :public_suffix => public_suffix.reverse.join("."),
        :domain        => domain,
        :subdomain     => subdomains.reverse.join(".")
      }
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Domainatrix
  # Value object holding the pieces of a parsed URL.
  class Url
    attr_reader :public_suffix, :domain, :subdomain, :path, :url, :scheme, :host

    # attrs - hash that may carry :scheme, :host, :url, :public_suffix,
    #         :domain, :subdomain and :path; missing keys are left nil.
    def initialize(attrs = {})
      @scheme, @host, @url = attrs.values_at(:scheme, :host, :url)
      @public_suffix, @domain, @subdomain =
        attrs.values_at(:public_suffix, :domain, :subdomain)
      @path = attrs[:path]
    end

    # Returns the canonical form: the public suffix labels reversed, then
    # the domain, then the reversed subdomain labels (if any), then the path
    # (if any). The +options+ argument is accepted but not used.
    def canonical(options = {})
      canonical_url = "#{reverse_labels(@public_suffix)}.#{@domain}"

      has_subdomain = @subdomain && !@subdomain.empty?
      canonical_url << ".#{reverse_labels(@subdomain)}" if has_subdomain

      canonical_url << @path if @path
      canonical_url
    end

    # Returns the domain followed by the public suffix, e.g. "pauldix.co.uk".
    def domain_with_public_suffix
      "#{@domain}.#{@public_suffix}"
    end
    alias_method :domain_with_tld, :domain_with_public_suffix

    private

    # Reverses the order of the dot-separated labels in +dotted+.
    def reverse_labels(dotted)
      dotted.split(".").reverse.join(".")
    end
  end
end
|
data/lib/domainatrix.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
2
|
+
|
3
|
+
require 'addressable/uri'
|
4
|
+
require 'domainatrix/domain_parser.rb'
|
5
|
+
require 'domainatrix/url.rb'
|
6
|
+
|
7
|
+
# Entry point of the library: holds one shared DomainParser and exposes
# Domainatrix.parse.
module Domainatrix
  # NOTE(review): the package header shows this gem as 0.0.10 while the
  # constant still reads 0.0.9 - confirm which one is intended.
  VERSION = "0.0.9"

  # Parser built once at load time from the suffix list that sits next to
  # this file.
  DOMAIN_PARSER = DomainParser.new("#{File.dirname(__FILE__)}/effective_tld_names.dat")

  class << self
    # Parses +url+ with the shared parser and wraps the resulting
    # attributes in a Url.
    def parse(url)
      parsed_attrs = DOMAIN_PARSER.parse(url)
      Url.new(parsed_attrs)
    end
  end
end
|