public_suffix_service 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +38 -0
- data/LICENSE.rdoc +25 -0
- data/README.rdoc +99 -0
- data/Rakefile +132 -0
- data/lib/public_suffix_service.rb +90 -0
- data/lib/public_suffix_service/definitions.dat +4449 -0
- data/lib/public_suffix_service/domain.rb +230 -0
- data/lib/public_suffix_service/errors.rb +25 -0
- data/lib/public_suffix_service/rule.rb +294 -0
- data/lib/public_suffix_service/rule_list.rb +246 -0
- data/lib/public_suffix_service/version.rb +30 -0
- data/public_suffix_service.gemspec +33 -0
- data/test/acceptance_test.rb +26 -0
- data/test/public_suffix_service/domain_test.rb +141 -0
- data/test/public_suffix_service/rule_list_test.rb +182 -0
- data/test/public_suffix_service/rule_test.rb +215 -0
- data/test/public_suffix_service_test.rb +62 -0
- data/test/test_helper.rb +9 -0
- metadata +100 -0
@@ -0,0 +1,230 @@
|
|
1
|
+
#
|
2
|
+
# = Public Suffix Service
|
3
|
+
#
|
4
|
+
# Domain Name parser based on the Public Suffix List
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: PublicSuffixService
|
9
|
+
# Author:: Simone Carletti <weppos@weppos.net>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
|
17
|
+
module PublicSuffixService
|
18
|
+
|
19
|
+
class Domain

  # Gets the Top Level Domain part, aka the extension.
  #
  # Returns a String if tld is set, nil otherwise.
  attr_reader :tld

  # Gets the Second Level Domain part, aka the domain part.
  #
  # Returns a String if sld is set, nil otherwise.
  attr_reader :sld

  # Gets the Third Level Domain part, aka the subdomain part.
  #
  # Returns a String if trd is set, nil otherwise.
  attr_reader :trd

  # Initializes a new domain from its parts.
  #
  # args  - Up to three positional values assigned, in this order,
  #         to tld, sld and trd. Parts that are not given stay nil.
  # block - Optional block. When given, it is called with the new instance.
  #
  # Examples
  #
  #   PublicSuffixService::Domain.new("com", "google", "www")
  #
  def initialize(*args, &block)
    @tld, @sld, @trd = args
    yield(self) if block_given?
  end

  # Gets a String representation of this object.
  #
  # Returns a String with the domain name.
  def to_s
    name
  end

  # Gets the parts of this object, from the leftmost to the rightmost.
  #
  # Returns an Array with [trd, sld, tld]. Parts that are not set are nil.
  def to_a
    [trd, sld, tld]
  end

  # Gets the domain name.
  #
  # Examples
  #
  #   PublicSuffixService::Domain.new("com", "google").name
  #   # => "google.com"
  #
  #   PublicSuffixService::Domain.new("com", "google", "www").name
  #   # => "www.google.com"
  #
  # Returns a String with the domain name.
  def name
    # Parts that are not set are skipped, so no stray dots are produced.
    to_a.compact.join(".")
  end

  # Returns a domain-like representation of this object
  # if the object is a <tt>domain?</tt>,
  # <tt>nil</tt> otherwise.
  def domain
    return unless domain?
    [sld, tld].join(".")
  end

  # Returns a subdomain-like representation of this object
  # if the object is a <tt>subdomain?</tt>,
  # <tt>nil</tt> otherwise.
  def subdomain
    return unless subdomain?
    [trd, sld, tld].join(".")
  end

  # Gets the rule matching this domain in the default PublicSuffixService::RuleList.
  #
  # Returns an instance of PublicSuffixService::Rule::Base if a rule matches current domain,
  # nil if no rule is found.
  def rule
    RuleList.default.find(name)
  end

  # Checks whether <tt>self</tt> looks like a domain.
  #
  # This method doesn't actually validate the domain.
  # It only checks whether the instance contains
  # a value for the <tt>tld</tt> and <tt>sld</tt> attributes.
  # If you also want to validate the domain, use <tt>#valid_domain?</tt> instead.
  #
  # Examples
  #
  #   PublicSuffixService::Domain.new("com").domain?
  #   # => false
  #
  #   PublicSuffixService::Domain.new("com", "google").domain?
  #   # => true
  #
  #   PublicSuffixService::Domain.new("com", "google", "www").domain?
  #   # => true
  #
  #   # This is an invalid domain, but returns true
  #   # because this method doesn't validate the content.
  #   PublicSuffixService::Domain.new("zip", "google").domain?
  #   # => true
  #
  # Returns true if this instance looks like a domain.
  def domain?
    !(tld.nil? || sld.nil?)
  end

  # Checks whether <tt>self</tt> looks like a subdomain.
  #
  # This method doesn't actually validate the subdomain.
  # It only checks whether the instance contains
  # a value for the <tt>tld</tt>, <tt>sld</tt> and <tt>trd</tt> attributes.
  # If you also want to validate the domain, use <tt>#valid_subdomain?</tt> instead.
  #
  # Examples
  #
  #   PublicSuffixService::Domain.new("com").subdomain?
  #   # => false
  #
  #   PublicSuffixService::Domain.new("com", "google").subdomain?
  #   # => false
  #
  #   PublicSuffixService::Domain.new("com", "google", "www").subdomain?
  #   # => true
  #
  #   # This is an invalid domain, but returns true
  #   # because this method doesn't validate the content.
  #   PublicSuffixService::Domain.new("zip", "google", "www").subdomain?
  #   # => true
  #
  # Returns true if this instance looks like a subdomain.
  def subdomain?
    !(tld.nil? || sld.nil? || trd.nil?)
  end

  # Checks whether <tt>self</tt> is exclusively a domain,
  # and not a subdomain.
  #
  # Returns true if tld and sld are set and trd is not.
  def is_a_domain?
    domain? && !subdomain?
  end

  # Checks whether <tt>self</tt> is exclusively a subdomain.
  #
  # Returns the same value as <tt>#subdomain?</tt>.
  def is_a_subdomain?
    subdomain?
  end

  # Checks whether <tt>self</tt> is valid
  # according to default <tt>RuleList</tt>.
  #
  # Note: this method triggers a new rule lookup in the default RuleList,
  # which is a quite intensive task.
  #
  # Returns true if this instance is valid.
  def valid?
    !rule.nil?
  end

  # Checks whether <tt>self</tt> looks like a domain and validates
  # according to default <tt>RuleList</tt>.
  #
  # See also <tt>Domain#domain?</tt> and <tt>Domain#valid?</tt>.
  #
  # Examples
  #
  # The results below assume the default RuleList contains a rule
  # matching "com" and no rule matching "zip".
  #
  #   PublicSuffixService::Domain.new("com").valid_domain?
  #   # => false
  #
  #   PublicSuffixService::Domain.new("com", "google").valid_domain?
  #   # => true
  #
  #   PublicSuffixService::Domain.new("com", "google", "www").valid_domain?
  #   # => true
  #
  #   # This is an invalid domain
  #   PublicSuffixService::Domain.new("zip", "google").valid_domain?
  #   # => false
  #
  # Returns true if this instance looks like a domain and is valid.
  def valid_domain?
    domain? && valid?
  end

  # Checks whether <tt>self</tt> looks like a subdomain and validates
  # according to default <tt>RuleList</tt>.
  #
  # See also <tt>Domain#subdomain?</tt> and <tt>Domain#valid?</tt>.
  #
  # Examples
  #
  # The results below assume the default RuleList contains a rule
  # matching "com" and no rule matching "zip".
  #
  #   PublicSuffixService::Domain.new("com").valid_subdomain?
  #   # => false
  #
  #   PublicSuffixService::Domain.new("com", "google").valid_subdomain?
  #   # => false
  #
  #   PublicSuffixService::Domain.new("com", "google", "www").valid_subdomain?
  #   # => true
  #
  #   # This is an invalid domain
  #   PublicSuffixService::Domain.new("zip", "google", "www").valid_subdomain?
  #   # => false
  #
  # Returns true if this instance looks like a subdomain and is valid.
  def valid_subdomain?
    subdomain? && valid?
  end

end
|
229
|
+
|
230
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#
|
2
|
+
# = Public Suffix Service
|
3
|
+
#
|
4
|
+
# Domain Name parser based on the Public Suffix List
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: PublicSuffixService
|
9
|
+
# Author:: Simone Carletti <weppos@weppos.net>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
|
17
|
+
module PublicSuffixService

  # Root of the library error hierarchy; a StandardError subclass,
  # so a bare <tt>rescue</tt> catches it.
  class Error < StandardError; end

  # Specialization of Error.
  # NOTE(review): presumably raised when a domain cannot be parsed
  # or is not valid - the raising code is not in this file, confirm there.
  class InvalidDomain < Error; end

end
|
@@ -0,0 +1,294 @@
|
|
1
|
+
#
|
2
|
+
# = Public Suffix Service
|
3
|
+
#
|
4
|
+
# Domain Name parser based on the Public Suffix List
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: PublicSuffixService
|
9
|
+
# Author:: Simone Carletti <weppos@weppos.net>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
|
17
|
+
module PublicSuffixService
|
18
|
+
|
19
|
+
class Rule

  # Takes the <tt>name</tt> of the rule, detects the specific rule class
  # and creates a new instance of that class.
  # The <tt>name</tt> becomes the rule value.
  #
  # name - The rule String definition
  #
  # Examples
  #
  #   PublicSuffixService::Rule.factory("ar")
  #   # => #<PublicSuffixService::Rule::Normal>
  #
  #   PublicSuffixService::Rule.factory("*.ar")
  #   # => #<PublicSuffixService::Rule::Wildcard>
  #
  #   PublicSuffixService::Rule.factory("!congresodelalengua3.ar")
  #   # => #<PublicSuffixService::Rule::Exception>
  #
  # Returns a new instance of the rule class selected
  # by the first character of <tt>name</tt>.
  def self.factory(name)
    # The first character alone selects the rule type.
    klass = case name.to_s[0..0]
      when "*" then "Wildcard"
      when "!" then "Exception"
      else          "Normal"
    end
    const_get(klass).new(name)
  end


  #
  # = Abstract rule class
  #
  # This represents the base class for a Rule definition
  # in the {Public Suffix List}[http://publicsuffix.org].
  #
  # This is intended to be an Abstract class
  # and you shouldn't create a direct instance. The only purpose
  # of this class is to expose a common interface
  # for all the available subclasses.
  #
  # * PublicSuffixService::Rule::Normal
  # * PublicSuffixService::Rule::Exception
  # * PublicSuffixService::Rule::Wildcard
  #
  # == Properties
  #
  # A rule is composed by 4 properties:
  #
  # name    - The name of the rule, corresponding to the rule definition
  #           in the public suffix list
  # value   - The value, a normalized version of the rule name.
  #           The normalization process depends on rule type.
  # type    - The rule type (:normal, :wildcard, :exception)
  # labels  - The canonicalized rule name
  #
  # Here's an example
  #
  #   PublicSuffixService::Rule.factory("*.google.com")
  #   #<PublicSuffixService::Rule::Wildcard:0x1015c14b0
  #       @labels=["com", "google"],
  #       @name="*.google.com",
  #       @type=:wildcard,
  #       @value="google.com"
  #   >
  #
  # == Rule Creation
  #
  # The best way to create a new rule is passing the rule name
  # to the <tt>PublicSuffixService::Rule.factory</tt> method.
  #
  #   PublicSuffixService::Rule.factory("com")
  #   # => PublicSuffixService::Rule::Normal
  #
  #   PublicSuffixService::Rule.factory("*.com")
  #   # => PublicSuffixService::Rule::Wildcard
  #
  # This method will detect the rule type and create an instance
  # from the proper rule class.
  #
  # == Rule Usage
  #
  # A rule describes the composition of a domain name
  # and explains how to tokenize the domain name
  # into tld, sld and trd.
  #
  # To use a rule, you first need to be sure the domain you want to tokenize
  # can be handled by the current rule.
  # You can use the <tt>#match?</tt> method.
  #
  #   rule = PublicSuffixService::Rule.factory("com")
  #
  #   rule.match?("google.com")
  #   # => true
  #
  #   rule.match?("google.it")
  #   # => false
  #
  # Rule order is significant. A domain can match more than one rule.
  # See the {Public Suffix Documentation}[http://publicsuffix.org/format/]
  # to learn more about rule priority.
  #
  # When you have the right rule, you can use it to tokenize the domain name.
  #
  #   rule = PublicSuffixService::Rule.factory("com")
  #
  #   rule.decompose("google.com")
  #   # => ["google", "com"]
  #
  #   rule.decompose("www.google.com")
  #   # => ["www.google", "com"]
  #
  class Base

    attr_reader :name, :value, :type, :labels

    # Initializes a new rule with name and value.
    # If value is nil, name also becomes the value for this rule.
    #
    # name  - The rule definition; it is converted to a String.
    # value - The normalized rule String (default: nil).
    def initialize(name, value = nil)
      @name   = name.to_s
      @value  = value || @name
      # The type is the downcased last segment of the class name.
      @type   = self.class.name.split("::").last.downcase.to_sym
      @labels = domain_to_labels(@value)
    end

    # Checks whether this rule is equal to <tt>other</tt>.
    #
    # other - An other PublicSuffixService::Rule::Base to compare.
    #
    # Returns true if other is an instance of the class of this rule
    # (or of a subclass) and has the same name, false otherwise.
    def ==(other)
      return false unless other.is_a?(self.class)
      equal?(other) || name == other.name
    end
    alias :eql? :==

    # Gets the hash code of this rule, derived from the rule name only,
    # so that two rules that are <tt>eql?</tt> always share the same code
    # and behave as the same key in a Hash or in <tt>Array#uniq</tt>.
    #
    # Returns an Integer.
    def hash
      name.hash
    end


    # Checks whether this rule matches <tt>domain</tt>.
    #
    # domain - A string with the domain name to check.
    #
    # Returns a true if this rule matches domain,
    # false otherwise.
    def match?(domain)
      # The rule matches when all its labels, compared from the rightmost,
      # are found at the same positions in the domain labels.
      odiff(labels, domain_to_labels(domain)).empty?
    end

    # Gets the length of this rule for comparison.
    # The length usually matches the number of rule <tt>parts</tt>.
    # Subclasses might actually override this method.
    #
    # Returns an Integer with the number of parts.
    def length
      parts.length
    end

    # Raises NotImplementedError.
    def parts
      raise NotImplementedError
    end

    # Raises NotImplementedError.
    def decompose(domain)
      raise NotImplementedError
    end


    private

      # Splits the domain on dots and reverses the labels,
      # so that the rightmost label comes first.
      #
      # Returns an Array of String labels.
      def domain_to_labels(domain)
        domain.to_s.split(".").reverse
      end

      # Gets the elements of <tt>one</tt> left after removing
      # the leading elements that are equal, position by position,
      # to the elements of <tt>two</tt>.
      #
      # Returns an Array, empty when two starts with all the elements of one.
      def odiff(one, two)
        common = one.zip(two).take_while { |left, right| left == right }.size
        one[common..-1]
      end

      # Gets the rule parts as a regular expression source
      # where every part is matched literally.
      #
      # Returns a String.
      def escaped_parts
        parts.map { |part| Regexp.escape(part) }.join('\.')
      end

      # Splits domain into what comes before the suffix and the suffix.
      # The whole string must match: \A and \z are used instead of ^ and $
      # so that a single line of a multi-line string can't match.
      #
      # domain         - The domain name to split; converted to a String.
      # suffix_pattern - A String with the regular expression source
      #                  describing the suffix.
      #
      # Returns an Array with [left, suffix], [nil, nil] if domain doesn't match.
      def tokenize(domain, suffix_pattern)
        match = /\A(.*)\.(#{suffix_pattern})\z/.match(domain.to_s)
        match ? [match[1], match[2]] : [nil, nil]
      end

  end

  class Normal < Base

    # Initializes a new rule where the value is the name itself.
    #
    # name - The rule definition; converted to a String like in the
    #        other rule types, so that a Symbol is accepted as well.
    def initialize(name)
      super(name, name.to_s)
    end

    # dot-split rule value and returns all rule parts
    # in the order they appear in the value.
    #
    # Returns an Array with the domain parts.
    def parts
      @parts ||= @value.split(".")
    end

    # Decomposes the domain according to rule properties.
    #
    # domain - A String with the domain name to parse
    #
    # Return an Array with [trd + sld, tld],
    # [nil, nil] if the domain doesn't end with the rule parts.
    def decompose(domain)
      tokenize(domain, escaped_parts)
    end

  end

  class Wildcard < Base

    # Initializes a new rule where the value is the name
    # without its first two characters (the leading "*.").
    #
    # name - The rule definition; converted to a String.
    def initialize(name)
      super(name, name.to_s[2..-1])
    end

    # dot-split rule value and returns all rule parts
    # in the order they appear in the value.
    #
    # Returns an Array with the domain parts.
    def parts
      @parts ||= @value.split(".")
    end

    # Gets the length of this rule for comparison.
    #
    # Returns an Integer with the number of parts plus one.
    def length
      parts.length + 1 # * counts as 1
    end

    # Decomposes the domain according to rule properties.
    #
    # domain - A String with the domain name to parse
    #
    # Return an Array with [trd + sld, tld], where tld includes
    # the label matched by the wildcard;
    # [nil, nil] if the domain doesn't match.
    def decompose(domain)
      tokenize(domain, '.*?\.' + escaped_parts)
    end

  end

  class Exception < Base

    # Initializes a new rule where the value is the name
    # without its first character (the leading "!").
    #
    # name - The rule definition; converted to a String.
    def initialize(name)
      super(name, name.to_s[1..-1])
    end

    # dot-split rule value and returns all rule parts
    # in the order they appear in the value.
    # The leftmost label is not considered a label.
    #
    # See http://publicsuffix.org/format/:
    # If the prevailing rule is a exception rule,
    # modify it by removing the leftmost label.
    #
    # Returns an Array with the domain parts.
    def parts
      @parts ||= @value.split(".")[1..-1]
    end

    # Decomposes the domain according to rule properties.
    #
    # domain - A String with the domain name to parse
    #
    # Return an Array with [trd + sld, tld],
    # [nil, nil] if the domain doesn't end with the rule parts.
    def decompose(domain)
      tokenize(domain, escaped_parts)
    end

  end

end
|
293
|
+
|
294
|
+
end
|