shadowbq-domainatrix 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +4 -0
- data/README.textile +88 -0
- data/lib/domainatrix/domain_parser.rb +153 -0
- data/lib/domainatrix/url.rb +51 -0
- data/lib/domainatrix.rb +48 -0
- data/lib/effective_tld_names.dat +6868 -0
- data/spec/domainatrix/domain_parser_spec.rb +157 -0
- data/spec/domainatrix/url_spec.rb +64 -0
- data/spec/domainatrix_spec.rb +106 -0
- data/spec/spec.opts +3 -0
- data/spec/spec_helper.rb +10 -0
- metadata +95 -0
data/CHANGELOG.md
ADDED
data/README.textile
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
h1. Domainatrix
|
2
|
+
|
3
|
+
"http://github.com/pauldix/domainatrix":http://github.com/pauldix/domainatrix
|
4
|
+
|
5
|
+
h2. Summary
|
6
|
+
|
7
|
+
A cruel mistress that uses the public suffix domain list to dominate URLs by canonicalizing, finding public suffixes, and breaking them into their domain parts.
|
8
|
+
|
9
|
+
h2. Description
|
10
|
+
|
11
|
+
This simple library can parse a URL into its canonical form. It uses the list of domains from "http://publicsuffix.org":http://publicsuffix.org to break the domain into its public suffix, domain, and subdomain.
|
12
|
+
|
13
|
+
h2. Installation
|
14
|
+
|
15
|
+
Install the default domainatrix gem:
|
16
|
+
|
17
|
+
<pre>
|
18
|
+
gem install shadowbq-domainatrix
|
19
|
+
</pre>
|
20
|
+
|
21
|
+
Using a custom GitHub version in a Gemfile:
|
22
|
+
|
23
|
+
Installing a gem directly from a git repository is a feature of Bundler, not a feature of RubyGems. Gems installed this way will not show up when you run gem list.
|
24
|
+
|
25
|
+
<pre>
|
26
|
+
gem 'domainatrix', :git => 'git://github.com/shadowbq/domainatrix.git'
|
27
|
+
</pre>
|
28
|
+
|
29
|
+
|
30
|
+
h2. Use
|
31
|
+
|
32
|
+
<pre>
|
33
|
+
require 'rubygems'
|
34
|
+
require 'domainatrix'
|
35
|
+
|
36
|
+
url = Domainatrix.parse("http://www.pauldix.net")
|
37
|
+
url.url # => "http://www.pauldix.net/" (the normalized url)
|
38
|
+
url.host # => "www.pauldix.net"
|
39
|
+
url.public_suffix # => "net"
|
40
|
+
url.domain # => "pauldix"
|
41
|
+
url.canonical # => "net.pauldix"
|
42
|
+
|
43
|
+
url = Domainatrix.parse("http://foo.bar.pauldix.co.uk/asdf.html?q=arg")
|
44
|
+
url.public_suffix # => "co.uk"
|
45
|
+
url.domain # => "pauldix"
|
46
|
+
url.subdomain # => "foo.bar"
|
47
|
+
url.path # => "/asdf.html?q=arg"
|
48
|
+
url.canonical # => "uk.co.pauldix.bar.foo/asdf.html?q=arg"
|
49
|
+
url.scheme #=> "http"
|
50
|
+
|
51
|
+
urls = Domainatrix.scan("wikipedia (http://en.wikipedia.org/wiki/Popular_culture): lol") do |match|
|
52
|
+
match.url # Given a block, works like 'map'
|
53
|
+
end
|
54
|
+
urls # => ["http://en.wikipedia.org/wiki/Popular_culture"]
|
55
|
+
</pre>
|
56
|
+
|
57
|
+
h2. ALTERNATIVES
|
58
|
+
|
59
|
+
The publicsuffix-ruby gem is a well-supported alternative.
|
60
|
+
|
61
|
+
"https://github.com/weppos/publicsuffix-ruby":https://github.com/weppos/publicsuffix-ruby
|
62
|
+
|
63
|
+
h2. LICENSE
|
64
|
+
|
65
|
+
(The MIT License)
|
66
|
+
|
67
|
+
Copyright (c) 2009:
|
68
|
+
|
69
|
+
"Paul Dix":http://pauldix.net
|
70
|
+
|
71
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
72
|
+
a copy of this software and associated documentation files (the
|
73
|
+
'Software'), to deal in the Software without restriction, including
|
74
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
75
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
76
|
+
permit persons to whom the Software is furnished to do so, subject to
|
77
|
+
the following conditions:
|
78
|
+
|
79
|
+
The above copyright notice and this permission notice shall be
|
80
|
+
included in all copies or substantial portions of the Software.
|
81
|
+
|
82
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
83
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
84
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
85
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
86
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
87
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
88
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@@ -0,0 +1,153 @@
|
|
1
|
+
module Domainatrix
  # Base class for every error raised by this library.
  class Error < RuntimeError; end

  # Raised when a URL or host cannot be broken into its domain parts.
  class ParseError < Error; end

  # Splits URLs into public suffix, domain and subdomain using a public
  # suffix data file.
  #
  # The rules are stored in +public_suffixes+ as a nested hash keyed by
  # domain labels read from right to left, e.g. the rule "co.uk" is stored
  # as { "uk" => { "co" => {} } }.
  class DomainParser
    include Addressable

    attr_reader :public_suffixes

    # Only http and https URLs are accepted. Anchored with \A/\z so the
    # whole scheme has to match, not just one line of it.
    VALID_SCHEMA = /\Ahttps?\z/

    # file_name - path of the public suffix data file to load.
    def initialize(file_name)
      @public_suffixes = {}
      read_dat_file(file_name)
    end

    # Loads the suffix rules from +file_name+ into +public_suffixes+.
    # Blank lines and lines starting with "//" are skipped.
    def read_dat_file(file_name)
      # If we're in 1.9+, make sure we're opening it in UTF-8
      mode = RUBY_VERSION >= '1.9' ? 'r:UTF-8' : 'r'

      # Block form so the file handle is closed even if reading fails.
      File.open(file_name, mode) do |dat_file|
        dat_file.each_line do |line|
          line = line.strip
          next if line.empty? || line =~ %r{\A//}

          sub_hash = @public_suffixes
          line.split(".").reverse.each do |part|
            sub_hash = (sub_hash[part] ||= {})
          end
        end
      end
    end

    # Parses +url+ into a hash with the keys :public_suffix, :domain,
    # :subdomain, :scheme, :host, :path and :url (plus :ip_address for
    # every host except "localhost").
    #
    # A URL without "://" is treated as http. The URL is downcased before
    # parsing. Returns {} for nil or blank input.
    #
    # Raises ParseError when Addressable cannot parse the URL, when the
    # scheme is not http/https, when there is no host, or when the host
    # cannot be split into domain parts.
    def parse(url)
      return {} unless url && url.strip != ''

      url = "http://#{url}" unless url[/:\/\//]
      url = url.downcase

      uri = begin
        Addressable::URI.parse(url)
      rescue Addressable::URI::InvalidURIError
        nil
      end

      raise ParseError, "URL is not parsable by Addressable::URI" unless uri
      url = uri.normalize.to_s
      raise ParseError, "URL does not have valid scheme" unless uri.scheme =~ VALID_SCHEMA
      raise ParseError, "URL does not have a valid host" if uri.host.nil?

      # Work on a copy: appending to uri.path directly would modify the
      # string held by the URI object itself.
      path = uri.path.to_s.dup
      path << "?#{uri.query}" if uri.query
      path << "##{uri.fragment}" if uri.fragment

      if uri.host == 'localhost'
        uri_hash = { :public_suffix => '', :domain => 'localhost', :subdomain => '' }
      else
        uri_hash = parse_domains_from_host(uri.host)
      end

      uri_hash.merge({
        :scheme => uri.scheme,
        :host   => uri.host,
        :path   => path,
        :url    => url
      })
    end

    # Splits reversed host labels into [subdomain, domain, tld].
    #
    # parts    - host labels in reverse order, eg ["com", "pauldix", "www"].
    # tld_size - number of labels that belong to the public suffix.
    #
    # A single label with tld_size 0 yields the wildcard domain "*".
    # Raises ParseError when no label would be left over for the domain.
    def split_domain(parts, tld_size)
      return ['', '*', ''] if parts.size == 1 && tld_size == 0

      # parts are host split on . reversed, eg com.pauldix.www
      domain_parts = parts.reverse
      if domain_parts.size - tld_size <= 0
        raise ParseError, "Invalid TLD size found for #{domain_parts.join('.')}: #{tld_size}"
      end

      tld       = domain_parts.slice!(-tld_size, tld_size).join('.')
      domain    = domain_parts.pop
      subdomain = domain_parts.join('.')

      [subdomain, domain, tld]
    end

    # Breaks +host+ into a hash with :public_suffix, :domain, :subdomain
    # and :ip_address. Returns {} when +host+ is nil.
    #
    # A host whose labels are all 1-3 digit numbers is reported as an IP
    # address: the whole host becomes the domain and the suffix is empty.
    #
    # Raises ParseError for single-label hosts (other than "*") and for
    # top-level labels that are missing from the suffix data.
    def parse_domains_from_host(host)
      return {} unless host

      parts = host.split(".").reverse
      ip_address = false
      tld_size = nil

      if host == '*'
        tld_size = 0
      elsif parts.all? { |part| part =~ /\A\d{1,3}\z/ }
        # host is an ip address
        ip_address = true
      else
        raise ParseError, "Invalid URL" if parts.size < 2
        tld_size = public_suffix_size(parts)
      end

      if ip_address
        subdomain, domain, tld = '', host, ''
      else
        subdomain, domain, tld = split_domain(parts, tld_size)
      end

      {:public_suffix => tld, :domain => domain, :subdomain => subdomain, :ip_address => ip_address}
    end

    private

    # Counts how many of the reversed host labels in +parts+ make up the
    # public suffix, walking the nested rule hash one label at a time.
    def public_suffix_size(parts)
      main_tld = parts.first
      tld_size = 1
      return tld_size if main_tld == '*'

      # PunyCode and new arbitrary TLDs make a character check on the TLD
      # unreliable, so the data file is the only authority here.
      current_suffixes = @public_suffixes[main_tld]
      raise ParseError, "Invalid main TLD: #{main_tld}" unless current_suffixes

      parts.each_index do |i|
        # no extra rules found (eg domain.net)
        break if current_suffixes.empty?

        next_label = parts[i + 1]
        if current_suffixes.has_key?("!#{next_label}")
          # exception tld domain found (eg metro.tokyo.jp)
          break
        elsif current_suffixes.has_key?(next_label)
          # valid extra domain level found (eg co.uk)
          tld_size += 1
          current_suffixes = current_suffixes[next_label]
        elsif current_suffixes.has_key?('*')
          # wildcard domain level (eg *.jp)
          tld_size += 1
          break
        else
          # no extra rules found (eg domain.net)
          break
        end
      end

      tld_size
    end
  end
end
|
153
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Domainatrix
  # Holds the parts of a parsed URL and renders them in different forms.
  class Url

    attr_accessor :public_suffix, :domain, :subdomain, :path, :url, :scheme, :host, :ip_address

    # attrs - hash with any of the keys :scheme, :host, :url,
    #         :public_suffix, :domain, :subdomain, :path and :ip_address.
    #         Missing string parts default to ''; :ip_address is stored
    #         as given.
    def initialize(attrs = {})
      @scheme        = attrs[:scheme] || ''
      @host          = attrs[:host] || ''
      @url           = attrs[:url] || ''
      @public_suffix = attrs[:public_suffix] || ''
      @domain        = attrs[:domain] || ''
      @subdomain     = attrs[:subdomain] || ''
      @path          = attrs[:path] || ''
      @ip_address    = attrs[:ip_address]
    end

    # Returns the host with its labels reversed (public suffix first, then
    # domain, then subdomain) followed by the path,
    # eg "uk.co.pauldix.bar.foo/asdf.html?q=arg".
    #
    # Empty parts are left out, so a host without a public suffix such as
    # "localhost" yields "localhost" rather than ".localhost".
    #
    # options - accepted for backward compatibility; not used.
    def canonical(options = {})
      labels = reversed_labels(@public_suffix)
      labels << @domain unless blank?(@domain)
      labels.concat(reversed_labels(@subdomain))

      "#{labels.join('.')}#{@path}"
    end

    # Returns the domain and public suffix joined by a dot,
    # eg "pauldix.co.uk". Empty parts are skipped.
    def domain_with_public_suffix
      [@domain, @public_suffix].compact.reject { |s| s == '' }.join('.')
    end
    alias domain_with_tld domain_with_public_suffix

    # Rebuilds the URL string as scheme + subdomain.domain.suffix + path.
    # Empty parts (including an empty scheme) are skipped.
    def to_s
      scheme = blank?(@scheme) ? '' : "#{@scheme}://"
      parts = [@subdomain, @domain, @public_suffix].reject { |part| blank?(part) }

      "#{scheme}#{parts.join('.')}#{@path}"
    end

    private

    # True when +value+ is nil or an empty string.
    def blank?(value)
      value.nil? || value.empty?
    end

    # Splits a dotted name into labels and returns them in reverse order.
    def reversed_labels(name)
      blank?(name) ? [] : name.split(".").reverse
    end

  end
end
|
data/lib/domainatrix.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
2
|
+
|
3
|
+
require 'addressable/uri'
|
4
|
+
require 'domainatrix/domain_parser'
|
5
|
+
require 'domainatrix/url'
|
6
|
+
require 'uri'
|
7
|
+
|
8
|
+
begin
|
9
|
+
require 'uri'
|
10
|
+
rescue LoadError
|
11
|
+
end
|
12
|
+
|
13
|
+
# Entry points of the library: parse a single URL or scan free text for
# URLs.
module Domainatrix

  VERSION = "0.0.11"

  # Shared parser, loaded once from the suffix data file that sits next to
  # this source file.
  DOMAIN_PARSER = DomainParser.new("#{File.dirname(__FILE__)}/effective_tld_names.dat")

  # Parses +url+ and returns a Domainatrix::Url built from the result of
  # DOMAIN_PARSER.parse. Errors raised by the parser are not rescued here.
  def self.parse(url)
    Url.new(DOMAIN_PARSER.parse(url))
  end

  # Extracts the http/https URLs found in +text+ and parses each of them.
  #
  # Trailing punctuation is removed from every candidate first. A closing
  # paren is kept when the URL itself contains an opening paren.
  # Candidates that cannot be parsed are skipped.
  #
  # Returns an array of Domainatrix::Url objects or, when a block is
  # given, the block's result for each of them (like map). Returns [] when
  # +text+ is nil.
  def self.scan(text, &block)
    return [] unless text
    @schemes ||= %w(http https)
    all_trailing_clutter   = /[.,:);]+\z/
    # Same set minus ")" so a balanced closing paren survives.
    clutter_without_parens = /[.,:;]+\z/

    candidate_urls = ::URI.extract(text, @schemes).map do |url|
      # If the URL has an open paren, allow closing parens.
      clutter = url.include?("(") ? clutter_without_parens : all_trailing_clutter
      url.sub(clutter, '')
    end

    urls = candidate_urls.map do |url|
      begin
        parse(url)
      rescue Error, Addressable::URI::InvalidURIError
        # The parser reports failures as ParseError (a Domainatrix::Error);
        # one unparsable candidate must not abort the whole scan.
        nil
      end
    end.compact

    urls.map!(&block) if block
    urls
  end
end
|