shadowbq-domainatrix 0.0.11

data/CHANGELOG.md ADDED
@@ -0,0 +1,4 @@
+ ## Domainatrix 0.0.11 (September 22, 2012) ##
+
+ * Update domain list
+ * Add changelog
data/README.textile ADDED
@@ -0,0 +1,88 @@
+ h1. Domainatrix
+
+ "http://github.com/pauldix/domainatrix":http://github.com/pauldix/domainatrix
+
+ h2. Summary
+
+ A cruel mistress that uses the public suffix domain list to dominate URLs by canonicalizing, finding public suffixes, and breaking them into their domain parts.
+
+ h2. Description
+
+ This simple library can parse a URL into its canonical form. It uses the list of domains from "http://publicsuffix.org":http://publicsuffix.org to break the domain into its public suffix, domain, and subdomain.
+
+ h2. Installation
+
+ Install the default gem:
+
+ <pre>
+ gem install shadowbq-domainatrix
+ </pre>
+
+ Using the custom GitHub version in a @Gemfile@:
+
+ Installing a gem directly from a git repository is a feature of Bundler, not a feature of RubyGems. Gems installed this way will not show up when you run @gem list@.
+
+ <pre>
+ gem 'domainatrix', :git => 'git://github.com/shadowbq/domainatrix.git'
+ </pre>
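+
+ After @bundle install@ the gem is available to your application even though it does not appear in @gem list@. A minimal sketch of that workflow (the @Gemfile@ below is illustrative, not part of this repository):
+
+ <pre>
+ # Gemfile
+ source 'https://rubygems.org'
+ gem 'domainatrix', :git => 'git://github.com/shadowbq/domainatrix.git'
+
+ # shell:  bundle install
+
+ # application code
+ require 'bundler/setup'
+ require 'domainatrix'
+ </pre>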
+
+ h2. Use
+
+ <pre>
+ require 'rubygems'
+ require 'domainatrix'
+
+ url = Domainatrix.parse("http://www.pauldix.net")
+ url.url # => "http://www.pauldix.net/" (the original url)
+ url.host # => "www.pauldix.net"
+ url.public_suffix # => "net"
+ url.domain # => "pauldix"
+ url.canonical # => "net.pauldix"
+
+ url = Domainatrix.parse("http://foo.bar.pauldix.co.uk/asdf.html?q=arg")
+ url.public_suffix # => "co.uk"
+ url.domain # => "pauldix"
+ url.subdomain # => "foo.bar"
+ url.path # => "/asdf.html?q=arg"
+ url.canonical # => "uk.co.pauldix.bar.foo/asdf.html?q=arg"
+ url.scheme # => "http"
+
+ urls = Domainatrix.scan("wikipedia (http://en.wikipedia.org/wiki/Popular_culture): lol") do |match|
+   match.url # Given a block, works like 'map'
+ end
+ urls # => ["http://en.wikipedia.org/wiki/Popular_culture"]
+ </pre>
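+
+ The parsed object is a @Domainatrix::Url@ and exposes a few more helpers (see the class further down in this listing). A small illustrative sketch:
+
+ <pre>
+ url = Domainatrix.parse("http://foo.bar.pauldix.co.uk/asdf.html")
+ url.domain_with_public_suffix # => "pauldix.co.uk"
+ url.domain_with_tld           # => "pauldix.co.uk" (alias)
+ url.to_s                      # => "http://foo.bar.pauldix.co.uk/asdf.html"
+ </pre>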
+
+ h2. ALTERNATIVES
+
+ The publicsuffix-ruby gem is well supported:
+
+ "https://github.com/weppos/publicsuffix-ruby":https://github.com/weppos/publicsuffix-ruby
+
+ h2. LICENSE
+
+ (The MIT License)
+
+ Copyright (c) 2009:
+
+ "Paul Dix":http://pauldix.net
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ 'Software'), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,153 @@
+ module Domainatrix
+   class Error < RuntimeError; end
+   class ParseError < Error; end
+
+   class DomainParser
+     include Addressable
+
+     attr_reader :public_suffixes
+     VALID_SCHEMA = /^http[s]{0,1}$/
+
+     def initialize(file_name)
+       @public_suffixes = {}
+       read_dat_file(file_name)
+     end
+
+     def read_dat_file(file_name)
+       # If we're in 1.9, make sure we're opening it in UTF-8
+       if RUBY_VERSION >= '1.9'
+         dat_file = File.open(file_name, "r:UTF-8")
+       else
+         dat_file = File.open(file_name)
+       end
+
+       dat_file.each_line do |line|
+         line = line.strip
+         unless (line =~ /^\/\//) || line.empty?
+           parts = line.split(".").reverse
+
+           sub_hash = @public_suffixes
+           parts.each do |part|
+             sub_hash = (sub_hash[part] ||= {})
+           end
+         end
+       end
+     end
+
+     def parse(url)
+       return {} unless url && url.strip != ''
+
+       url = "http://#{url}" unless url[/:\/\//]
+       url = url.downcase
+
+       uri = begin
+         Addressable::URI.parse(url)
+       rescue Addressable::URI::InvalidURIError
+         nil
+       end
+
+       raise ParseError, "URL is not parsable by Addressable::URI" if not uri
+       url = uri.normalize.to_s
+       raise ParseError, "URL does not have valid scheme" unless uri.scheme =~ VALID_SCHEMA
+       raise ParseError, "URL does not have a valid host" if uri.host.nil?
+
+       path = uri.path
+       path << "?#{uri.query}" if uri.query
+       path << "##{uri.fragment}" if uri.fragment
+
+       if uri.host == 'localhost'
+         uri_hash = { :public_suffix => '', :domain => 'localhost', :subdomain => '' }
+       else
+         uri_hash = parse_domains_from_host(uri.host || uri.basename)
+       end
+
+       uri_hash.merge({
+         :scheme => uri.scheme,
+         :host => uri.host,
+         :path => path,
+         :url => url
+       })
+     end
+
+     def split_domain(parts, tld_size)
+       if parts.size == 1 and tld_size == 0
+         subdomain = ''
+         domain = '*'
+         tld = ''
+       else
+         # parts are host split on . reversed, eg com.pauldix.www
+         domain_parts = parts.reverse
+         if domain_parts.size - tld_size <= 0
+           raise ParseError, "Invalid TLD size found for #{domain_parts.join('.')}: #{tld_size}"
+         end
+
+         tld = domain_parts.slice!(-tld_size, tld_size).join('.')
+         domain = domain_parts.pop
+         subdomain = domain_parts.join('.')
+       end
+
+       [subdomain, domain, tld]
+     end
+
+     def parse_domains_from_host(host)
+       return {} unless host
+
+       parts = host.split(".").reverse
+       ip_address = false
+
+       if host == '*'
+         tld_size = 0
+       elsif !parts.map { |part| part.match(/^\d{1,3}$/) }.include?(nil)
+         # host is an ip address
+         ip_address = true
+       else
+         main_tld = parts.first
+         tld_size = 1
+         raise ParseError, "Invalid URL" if parts.size < 2
+
+         if main_tld != '*'
+
+           # Punycode and new arbitrary TLDs invalidate this check, so just use the DAT file
+           # raise ParseError, "Invalid characters for TLD" unless main_tld =~ /^[a-z]{2,}/
+
+           if not current_suffixes = @public_suffixes[main_tld]
+             raise ParseError, "Invalid main TLD: #{main_tld}"
+           end
+
+           parts.each_with_index do |part, i|
+             if current_suffixes.empty?
+               # no extra rules found (eg domain.net)
+               break
+             else
+               if current_suffixes.has_key?("!#{parts[i+1]}")
+                 # exception tld domain found (eg metro.tokyo.jp)
+                 break
+               elsif current_suffixes.has_key?(parts[i+1])
+                 # valid extra domain level found (eg co.uk)
+                 tld_size += 1
+                 current_suffixes = current_suffixes[parts[i+1]]
+               elsif current_suffixes.has_key?('*')
+                 # wildcard domain level (eg *.jp)
+                 tld_size += 1
+                 break
+               else
+                 # no extra rules found (eg domain.net)
+                 break
+               end # if current_suffixes
+             end # if current_suffixes.empty?
+           end # parts .. do
+         end # if main_tld
+       end # if host
+
+       if ip_address
+         subdomain, domain, tld = '', host, ''
+       else
+         subdomain, domain, tld = split_domain(parts, tld_size)
+       end
+
+       {:public_suffix => tld, :domain => domain, :subdomain => subdomain, :ip_address => ip_address}
+     end # def
+
+   end # class
+ end # module
+
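The nested hash built by @read_dat_file@ is what @parse_domains_from_host@ walks to measure the public suffix. A minimal sketch of that structure (the @suffixes.dat@ file and its two rules are hypothetical, not the bundled list):

<pre>
# Suppose suffixes.dat contains just two rules:
#   uk
#   co.uk
parser = Domainatrix::DomainParser.new("suffixes.dat")
parser.public_suffixes # => { "uk" => { "co" => {} } }

# parse_domains_from_host("www.pauldix.co.uk") reverses the host into
# ["uk", "co", "pauldix", "www"], finds "co" under "uk", ends with
# tld_size = 2, and split_domain returns ["www", "pauldix", "co.uk"]:
parser.parse("http://www.pauldix.co.uk")[:public_suffix] # => "co.uk"
</pre>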
@@ -0,0 +1,51 @@
+ module Domainatrix
+   class Url
+
+     attr_accessor :public_suffix, :domain, :subdomain, :path, :url, :scheme, :host, :ip_address
+
+     def initialize(attrs = {})
+       @scheme = attrs[:scheme] || ''
+       @host = attrs[:host] || ''
+       @url = attrs[:url] || ''
+       @public_suffix = attrs[:public_suffix] || ''
+       @domain = attrs[:domain] || ''
+       @subdomain = attrs[:subdomain] || ''
+       @path = attrs[:path] || ''
+       @ip_address = attrs[:ip_address]
+     end
+
+     def canonical(options = {})
+       public_suffix_parts = @public_suffix.split(".")
+       url = "#{public_suffix_parts.reverse.join(".")}.#{@domain}"
+       if @subdomain && !@subdomain.empty?
+         subdomain_parts = @subdomain.split(".")
+         url << ".#{subdomain_parts.reverse.join(".")}"
+       end
+       url << @path if @path
+
+       url
+     end
+
+     def domain_with_public_suffix
+       [@domain, @public_suffix].compact.reject { |s| s == '' }.join('.')
+     end
+     alias domain_with_tld domain_with_public_suffix
+
+     def to_s
+       if @scheme.nil? || @scheme.empty?
+         scheme = ''
+       else
+         scheme = "#{@scheme}://"
+       end
+
+       parts = []
+       parts << @subdomain if @subdomain and !@subdomain.empty?
+       parts << @domain if @domain and !@domain.empty?
+       parts << @public_suffix if @public_suffix and !@public_suffix.empty?
+
+       "#{scheme}#{parts.join('.')}#{@path}"
+     end
+
+   end
+ end
@@ -0,0 +1,48 @@
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
+
+ require 'addressable/uri'
+ require 'domainatrix/domain_parser'
+ require 'domainatrix/url'
+ require 'uri'
+
+ module Domainatrix
+
+   VERSION = "0.0.11"
+   DOMAIN_PARSER = DomainParser.new("#{File.dirname(__FILE__)}/effective_tld_names.dat")
+
+   def self.parse(url)
+     Url.new(DOMAIN_PARSER.parse(url))
+   end
+
+   def self.scan(text, &block)
+     return [] unless text
+     @schemes ||= %w(http https)
+     all_trailing_clutter = /[.,:);]+$/
+     # leaves ")" alone so a URL written inside parentheses keeps its closing paren
+     clutter_without_parens = /[.,:;]+$/
+
+     candidate_urls = ::URI.extract(text, @schemes)
+     candidate_urls.map! do |url|
+       # If the URL has an open paren, allow closing parens.
+       if url.include?("(")
+         url.gsub(clutter_without_parens, '')
+       else
+         url.gsub(all_trailing_clutter, '')
+       end
+     end
+
+     urls = candidate_urls.map do |url|
+       begin
+         parse(url)
+       rescue Addressable::URI::InvalidURIError
+       end
+     end.compact
+
+     urls.map!(&block) if block
+     urls
+   end
+ end
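
Without a block, @Domainatrix.scan@ returns the parsed @Url@ objects directly. A small sketch based on the code above (output values assume the bundled public suffix list):

<pre>
require 'domainatrix'

urls = Domainatrix.scan("see http://en.wikipedia.org/wiki/Popular_culture.")
urls.first.host          # => "en.wikipedia.org"
urls.first.domain        # => "wikipedia"
urls.first.public_suffix # => "org"
</pre>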