postrank-uri 1.0.8 → 1.0.9

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions exactly as they appear in their respective public registries.
data/lib/postrank-uri.rb CHANGED
@@ -6,6 +6,21 @@ require 'digest/md5'
6
6
  require 'nokogiri'
7
7
  require 'yaml'
8
8
 
9
+ module Addressable
10
+ class URI
11
+ def domain
12
+ begin
13
+ dp = Domainatrix.parse(self)
14
+ rescue
15
+ return nil
16
+ end
17
+
18
+ dom = dp.public_suffix
19
+ dom = dp.domain.downcase + "." + dom unless dp.domain.empty?
20
+ end
21
+ end
22
+ end
23
+
9
24
  module PostRank
10
25
  module URI
11
26
 
@@ -87,7 +102,7 @@ module PostRank
87
102
  urls = []
88
103
  Nokogiri.HTML(text).search('a').each do |a|
89
104
  begin
90
- url = clean(a.attr('href'), false)
105
+ url = clean(a.attr('href'), :raw => true)
91
106
  if url.host.empty?
92
107
  next if host.nil?
93
108
  url.host = host
@@ -113,13 +128,13 @@ module PostRank
113
128
  end
114
129
  end
115
130
 
116
- def clean(uri, string = true)
131
+ def clean(uri, opts = {})
117
132
  uri = normalize(c18n(unescape(uri)))
118
- string ? uri.to_s : uri
133
+ opts[:raw] ? uri : uri.to_s
119
134
  end
120
135
 
121
- def hash(uri)
122
- Digest::MD5.hexdigest(clean(uri))
136
+ def hash(uri, opts = {})
137
+ Digest::MD5.hexdigest(opts[:skip_clean] ? uri : clean(uri))
123
138
  end
124
139
 
125
140
  def normalize(uri)
@@ -158,7 +173,7 @@ module PostRank
158
173
  embedded = uri.query_values['u']
159
174
  end
160
175
 
161
- uri = clean(embedded, false) if embedded
176
+ uri = clean(embedded, :raw => true) if embedded
162
177
  uri
163
178
  end
164
179
 
@@ -1,5 +1,5 @@
1
1
  module PostRank
2
2
  module URI
3
- VERSION = "1.0.8"
3
+ VERSION = "1.0.9"
4
4
  end
5
5
  end
@@ -177,8 +177,8 @@ describe PostRank::URI do
177
177
  end
178
178
 
179
179
  context "hash" do
180
- def h(uri)
181
- PostRank::URI.hash(uri)
180
+ def h(uri, opts = {})
181
+ PostRank::URI.hash(uri, opts)
182
182
  end
183
183
 
184
184
  it "should compute MD5 hash of the normalized URI" do
@@ -189,6 +189,13 @@ describe PostRank::URI do
189
189
  h('everburning.com/feed/post/1').should == hash
190
190
  h('everburning.com/feed/post/1/').should == hash
191
191
  end
192
+
193
+ it "should not clean the URI if requested" do
194
+ hash = '55fae8910d312b7878a3201ed653b881'
195
+
196
+ h('http://everburning.com/feed/post/1', :skip_clean => true).should == hash
197
+ h('everburning.com/feed/post/1', :skip_clean => true).should_not == hash
198
+ end
192
199
  end
193
200
 
194
201
  context "extract" do
@@ -265,6 +272,31 @@ describe PostRank::URI do
265
272
  i.last.should == 'link to stuff'
266
273
  end
267
274
  end
268
- end
269
275
 
276
+ context 'domain extraction' do
277
+ url_list = {"http://alex.pages.example.com" => "example.com",
278
+ "alex.pages.example.com" => "example.com",
279
+ "http://example.com/2011/04/01/blah" => "example.com",
280
+ "http://example.com" => "example.com",
281
+ "example.com" => "example.com",
282
+ "ExampLe.com" => "example.com",
283
+ "ExampLe.com:3000" => "example.com",
284
+ "http://alex.pages.example.COM" => "example.com",
285
+ "http://www.example.ag.it/2011/04/01/blah" => "example.ag.it",
286
+ "ftp://www.example.com/2011/04/01/blah" => nil,
287
+ "http://com" => nil,
288
+ "http://alex.pages.examplecom" => nil,
289
+ "example" => nil,
290
+ "http://127.0.0.1" => nil,
291
+ "localhost" => nil
292
+ }
293
+
294
+ url_list.each_pair do |url, expected_result|
295
+ it "should extract #{expected_result.inspect} from #{url}" do
296
+ u = PostRank::URI.clean(url, :raw => true)
297
+ u.domain.should == expected_result
298
+ end
299
+ end
300
+ end
301
+ end
270
302
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: postrank-uri
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.0.8
5
+ version: 1.0.9
6
6
  platform: ruby
7
7
  authors:
8
8
  - Ilya Grigorik
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-03-31 00:00:00 -04:00
13
+ date: 2011-04-08 00:00:00 -04:00
14
14
  default_executable:
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency