f1sherman-domainatrix 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile ADDED
@@ -0,0 +1,64 @@
1
+ h1. Domainatrix
2
+
3
+ "http://github.com/pauldix/domainatrix":http://github.com/pauldix/domainatrix
4
+
5
+ h2. Summary
6
+
7
+ A cruel mistress that uses the public suffix domain list to dominate URLs by canonicalizing, finding public suffixes, and breaking them into their domain parts.
8
+
9
+ h2. Description
10
+
11
+ This simple library can parse a URL into its canonical form. It uses the list of domains from "http://publicsuffix.org":http://publicsuffix.org to break the domain into its public suffix, domain, and subdomain.
12
+
13
+ h2. Installation
14
+
15
+ <pre>
16
+ gem install domainatrix --source http://gemcutter.org
17
+ </pre>
18
+
19
+ h2. Use
20
+
21
+ <pre>
22
+ require 'rubygems'
23
+ require 'domainatrix'
24
+
25
+ url = Domainatrix.parse("http://www.pauldix.net")
26
+ url.url # => "http://www.pauldix.net" (the original url)
27
+ url.public_suffix # => "net"
28
+ url.domain # => "pauldix"
29
+ url.canonical # => "net.pauldix.www" (the "www" subdomain is kept, reversed after the domain)
30
+
31
+ url = Domainatrix.parse("http://foo.bar.pauldix.co.uk/asdf.html?q=arg")
32
+ url.public_suffix # => "co.uk"
33
+ url.domain # => "pauldix"
34
+ url.subdomain # => "foo.bar"
35
+ url.path # => "/asdf.html?q=arg"
36
+ url.canonical # => "uk.co.pauldix.bar.foo/asdf.html?q=arg"
37
+ </pre>
38
+
39
+ h2. LICENSE
40
+
41
+ (The MIT License)
42
+
43
+ Copyright (c) 2009:
44
+
45
+ "Paul Dix":http://pauldix.net
46
+
47
+ Permission is hereby granted, free of charge, to any person obtaining
48
+ a copy of this software and associated documentation files (the
49
+ 'Software'), to deal in the Software without restriction, including
50
+ without limitation the rights to use, copy, modify, merge, publish,
51
+ distribute, sublicense, and/or sell copies of the Software, and to
52
+ permit persons to whom the Software is furnished to do so, subject to
53
+ the following conditions:
54
+
55
+ The above copyright notice and this permission notice shall be
56
+ included in all copies or substantial portions of the Software.
57
+
58
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
59
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
60
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
61
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
62
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
63
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
64
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,77 @@
1
module Domainatrix
  # Breaks host names into public suffix, domain and subdomain using a
  # public-suffix list file (one rule per line, "//" marks a comment line).
  #
  # The rules are kept in +public_suffixes+ as a tree of nested hashes keyed
  # by label, starting from the right-most label: the rule "co.uk" is stored
  # as { "uk" => { "co" => {} } } and "*.ck" as { "ck" => { "*" => {} } }.
  #
  # Rules are stored verbatim: a rule whose first label starts with "!" is
  # kept under that literal label and gets no special treatment here.
  #
  # Addressable::URI is referenced explicitly instead of mixing the
  # Addressable namespace into this class just to resolve the URI constant.
  class DomainParser
    # The nested-hash rule tree built by #read_dat_file.
    attr_reader :public_suffixes

    # Builds a parser from the public-suffix list stored at +file_name+.
    def initialize(file_name)
      @public_suffixes = {}
      read_dat_file(file_name)
    end

    # Reads the rule file at +file_name+ and merges every rule into
    # +public_suffixes+. Blank lines and lines containing "//" are skipped.
    #
    # The block form of File.open is used so the handle is closed even if
    # reading raises part-way through.
    def read_dat_file(file_name)
      # On 1.9+ the file is opened explicitly as UTF-8.
      mode = RUBY_VERSION >= '1.9' ? "r:UTF-8" : "r"

      File.open(file_name, mode) do |dat_file|
        dat_file.each_line do |line|
          line = line.strip
          next if line.empty? || line.include?("//")

          # Walk (and create as needed) one tree level per label, right to left.
          sub_hash = @public_suffixes
          line.split(".").reverse.each do |part|
            sub_hash = (sub_hash[part] ||= {})
          end
        end
      end
    end

    # Parses +url+ and returns a hash with the keys :public_suffix, :domain,
    # :subdomain, :scheme, :host, :path and :url. :path includes the query
    # string when the URL has one; :url is the string that was passed in.
    def parse(url)
      uri = Addressable::URI.parse(url)
      path = uri.query ? "#{uri.path}?#{uri.query}" : uri.path

      parse_domains_from_host(uri.host).merge(
        :scheme => uri.scheme,
        :host => uri.host,
        :path => path,
        :url => url
      )
    end

    # Splits +host+ into its parts and returns
    # { :public_suffix => String, :domain => String, :subdomain => String }.
    #
    # Parts that are not present come back as empty strings: a nil or empty
    # host yields three empty strings, and a host that is only a public
    # suffix ("com", "co.uk") yields an empty domain and subdomain.
    #
    # A right-most label that is not in the rule tree is treated as a
    # one-label public suffix, so "foo.example.unknown" gives the suffix
    # "unknown" and the domain "example".
    def parse_domains_from_host(host)
      parts = host.to_s.split(".").reverse
      public_suffix = []
      domain = ""
      subdomains = []
      sub_hash = @public_suffixes

      parts.each_with_index do |part, i|
        # Only the first label can be missing from the tree (later labels are
        # visited only after has_key? confirmed them); fall back to an empty
        # rule set so that label alone becomes the suffix.
        sub_hash = sub_hash[part] || {}
        public_suffix << part

        if sub_hash.has_key?("*")
          # Wildcard rule: the next label, if any, also belongs to the suffix.
          public_suffix << parts[i + 1] if parts[i + 1]
          domain = parts[i + 2].to_s
          # A range past the end of the array yields nil, hence the fallback.
          subdomains = parts[(i + 3)..-1] || []
          break
        elsif sub_hash.empty? || !sub_hash.has_key?(parts[i + 1])
          # No deeper rule matches: the suffix ends here.
          domain = parts[i + 1].to_s
          subdomains = parts[(i + 2)..-1] || []
          break
        end
      end

      {
        :public_suffix => public_suffix.reverse.join("."),
        :domain => domain,
        :subdomain => subdomains.reverse.join(".")
      }
    end
  end
end
@@ -0,0 +1,33 @@
1
module Domainatrix
  # Value object holding the pieces of a parsed URL.
  class Url
    attr_reader :public_suffix, :domain, :subdomain, :path, :url, :scheme, :host

    # Copies the recognised keys (:scheme, :host, :url, :public_suffix,
    # :domain, :subdomain, :path) out of +attrs+; keys that are absent leave
    # the corresponding reader returning nil.
    def initialize(attrs = {})
      @scheme = attrs[:scheme]
      @host = attrs[:host]
      @url = attrs[:url]
      @public_suffix = attrs[:public_suffix]
      @domain = attrs[:domain]
      @subdomain = attrs[:subdomain]
      @path = attrs[:path]
    end

    # Returns the canonical form of the URL: the host labels in reverse
    # order (public suffix, then domain, then subdomain) joined with ".",
    # followed by the path, e.g. "uk.co.pauldix.bar.foo/asdf.html?q=arg".
    #
    # Missing or empty components are skipped, so a Url built without a
    # public suffix, domain or subdomain neither raises nor leaves a
    # dangling "." in the result.
    #
    # +options+ is accepted for interface compatibility; it is not read.
    def canonical(options = {})
      labels = reversed_labels(@public_suffix)
      labels << @domain unless @domain.to_s.empty?
      labels.concat(reversed_labels(@subdomain))

      canonical_url = labels.join(".")
      canonical_url << @path if @path
      canonical_url
    end

    # Returns the domain followed by the public suffix, e.g. "pauldix.co.uk".
    # A missing or empty component is left out along with its separator.
    def domain_with_public_suffix
      [@domain, @public_suffix].reject { |piece| piece.to_s.empty? }.join(".")
    end
    alias domain_with_tld domain_with_public_suffix

    private

    # Splits a dotted name into labels, right-most first; nil gives [].
    def reversed_labels(dotted_name)
      dotted_name.to_s.split(".").reverse
    end
  end
end
@@ -0,0 +1,14 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
2
+
3
+ require 'addressable/uri'
4
+ require 'domainatrix/domain_parser.rb'
5
+ require 'domainatrix/url.rb'
6
+
7
# Library entry point: holds the version string, a parser built once at load
# time, and the module-level +parse+ helper.
module Domainatrix
  # NOTE(review): confirm this string matches the version the gem is released as.
  VERSION = "0.0.9"

  # Parser built from the effective_tld_names.dat file that sits in the same
  # directory as this source file.
  DOMAIN_PARSER = DomainParser.new("#{File.dirname(__FILE__)}/effective_tld_names.dat")

  class << self
    # Runs +url+ through DOMAIN_PARSER and wraps the resulting attribute
    # hash in a Url.
    def parse(url)
      parsed_attributes = DOMAIN_PARSER.parse(url)
      Url.new(parsed_attributes)
    end
  end
end