postrank-uri 1.0.8 → 1.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/postrank-uri.rb +21 -6
- data/lib/postrank-uri/version.rb +1 -1
- data/spec/postrank-uri_spec.rb +35 -3
- metadata +2 -2
data/lib/postrank-uri.rb
CHANGED
@@ -6,6 +6,21 @@ require 'digest/md5'
|
|
6
6
|
require 'nokogiri'
|
7
7
|
require 'yaml'
|
8
8
|
|
9
|
+
module Addressable
  class URI
    # Builds "<domain>.<public_suffix>" for this URI from the parts reported
    # by Domainatrix, with the domain label lower-cased (the public suffix is
    # used exactly as Domainatrix returns it).
    #
    # Returns a String such as "example.com", or nil when:
    #   * Domainatrix.parse raises (any StandardError is treated as
    #     "no domain available"), or
    #   * the parsed domain label or public suffix is nil or empty, so a
    #     half-built value such as "localhost." is never returned.
    def domain
      begin
        parsed = Domainatrix.parse(self)
      rescue StandardError
        # Deliberate best-effort: an unparseable URI simply has no domain.
        return nil
      end

      suffix = parsed.public_suffix
      label  = parsed.domain

      # Both parts are required to form a meaningful result.
      return nil if suffix.nil? || suffix.empty?
      return nil if label.nil? || label.empty?

      label.downcase + "." + suffix
    end
  end
end
|
23
|
+
|
9
24
|
module PostRank
|
10
25
|
module URI
|
11
26
|
|
@@ -87,7 +102,7 @@ module PostRank
|
|
87
102
|
urls = []
|
88
103
|
Nokogiri.HTML(text).search('a').each do |a|
|
89
104
|
begin
|
90
|
-
url = clean(a.attr('href'),
|
105
|
+
url = clean(a.attr('href'), :raw => true)
|
91
106
|
if url.host.empty?
|
92
107
|
next if host.nil?
|
93
108
|
url.host = host
|
@@ -113,13 +128,13 @@ module PostRank
|
|
113
128
|
end
|
114
129
|
end
|
115
130
|
|
116
|
-
def clean(uri,
|
131
|
+
# Passes +uri+ through unescape, then c18n, then normalize.
#
# opts[:raw] - when truthy, the value produced by normalize is returned
#              untouched; otherwise its #to_s form is returned.
def clean(uri, opts = {})
  cleaned = normalize(c18n(unescape(uri)))
  return cleaned if opts[:raw]

  cleaned.to_s
end
|
120
135
|
|
121
|
-
def hash(uri)
|
122
|
-
Digest::MD5.hexdigest(clean(uri))
|
136
|
+
# Returns the MD5 hex digest of +uri+.
#
# opts[:skip_clean] - when truthy, +uri+ is digested exactly as given;
#                     otherwise it is first passed through clean.
def hash(uri, opts = {})
  digest_input = if opts[:skip_clean]
    uri
  else
    clean(uri)
  end

  Digest::MD5.hexdigest(digest_input)
end
|
124
139
|
|
125
140
|
def normalize(uri)
|
@@ -158,7 +173,7 @@ module PostRank
|
|
158
173
|
embedded = uri.query_values['u']
|
159
174
|
end
|
160
175
|
|
161
|
-
uri = clean(embedded,
|
176
|
+
uri = clean(embedded, :raw => true) if embedded
|
162
177
|
uri
|
163
178
|
end
|
164
179
|
|
data/lib/postrank-uri/version.rb
CHANGED
data/spec/postrank-uri_spec.rb
CHANGED
@@ -177,8 +177,8 @@ describe PostRank::URI do
|
|
177
177
|
end
|
178
178
|
|
179
179
|
context "hash" do
|
180
|
-
def h(uri)
|
181
|
-
PostRank::URI.hash(uri)
|
180
|
+
def h(uri, opts = {})
|
181
|
+
PostRank::URI.hash(uri, opts)
|
182
182
|
end
|
183
183
|
|
184
184
|
it "should compute MD5 hash of the normalized URI" do
|
@@ -189,6 +189,13 @@ describe PostRank::URI do
|
|
189
189
|
h('everburning.com/feed/post/1').should == hash
|
190
190
|
h('everburning.com/feed/post/1/').should == hash
|
191
191
|
end
|
192
|
+
|
193
|
+
it "should not clean the URI if requested" do
|
194
|
+
hash = '55fae8910d312b7878a3201ed653b881'
|
195
|
+
|
196
|
+
h('http://everburning.com/feed/post/1', :skip_clean => true).should == hash
|
197
|
+
h('everburning.com/feed/post/1', :skip_clean => true).should_not == hash
|
198
|
+
end
|
192
199
|
end
|
193
200
|
|
194
201
|
context "extract" do
|
@@ -265,6 +272,31 @@ describe PostRank::URI do
|
|
265
272
|
i.last.should == 'link to stuff'
|
266
273
|
end
|
267
274
|
end
|
268
|
-
end
|
269
275
|
|
276
|
+
context 'domain extraction' do
  # Maps each input URL to the value expected from #domain on the result of
  # PostRank::URI.clean(url, :raw => true); nil marks inputs for which no
  # domain is expected.
  expectations = {
    "http://alex.pages.example.com"            => "example.com",
    "alex.pages.example.com"                   => "example.com",
    "http://example.com/2011/04/01/blah"       => "example.com",
    "http://example.com"                       => "example.com",
    "example.com"                              => "example.com",
    "ExampLe.com"                              => "example.com",
    "ExampLe.com:3000"                         => "example.com",
    "http://alex.pages.example.COM"            => "example.com",
    "http://www.example.ag.it/2011/04/01/blah" => "example.ag.it",
    "ftp://www.example.com/2011/04/01/blah"    => nil,
    "http://com"                               => nil,
    "http://alex.pages.examplecom"             => nil,
    "example"                                  => nil,
    "http://127.0.0.1"                         => nil,
    "localhost"                                => nil
  }

  # One example per table entry, so each failure names the offending URL.
  expectations.each do |url, expected_result|
    it "should extract #{expected_result.inspect} from #{url}" do
      cleaned = PostRank::URI.clean(url, :raw => true)
      cleaned.domain.should == expected_result
    end
  end
end
|
301
|
+
end
|
270
302
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: postrank-uri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.0.8
|
5
|
+
version: 1.0.9
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Ilya Grigorik
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
13
|
+
date: 2011-04-08 00:00:00 -04:00
|
14
14
|
default_executable:
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|