postrank-uri 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/lib/postrank-uri/version.rb +1 -1
- data/lib/postrank-uri.rb +28 -7
- data/postrank-uri.gemspec +3 -2
- data/spec/postrank-uri_spec.rb +36 -1
- metadata +24 -14
data/.gitignore
ADDED
data/lib/postrank-uri/version.rb
CHANGED
data/lib/postrank-uri.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'addressable/uri'
|
4
4
|
require 'domainatrix'
|
5
|
+
require 'nokogiri'
|
5
6
|
require 'yaml'
|
6
7
|
|
7
8
|
module PostRank
|
@@ -64,7 +65,9 @@ module PostRank
|
|
64
65
|
URIREGEX[:unescape] = /((?:%[0-9a-fA-F]{2})+)/x
|
65
66
|
URIREGEX.each_pair{|k,v| v.freeze }
|
66
67
|
|
67
|
-
|
68
|
+
module_function
|
69
|
+
|
70
|
+
def extract(text)
|
68
71
|
return [] if !text
|
69
72
|
urls = []
|
70
73
|
text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
|
@@ -79,23 +82,41 @@ module PostRank
|
|
79
82
|
urls.compact
|
80
83
|
end
|
81
84
|
|
82
|
-
def self.escape(uri)
|
85
|
+
def extract_href(text, host = nil)
|
86
|
+
urls = {}
|
87
|
+
Nokogiri.HTML(text).search('a').each do |a|
|
88
|
+
begin
|
89
|
+
url = normalize(c18n(unescape(a.attr('href'))))
|
90
|
+
if url.host.empty?
|
91
|
+
next if host.nil?
|
92
|
+
url.host = host
|
93
|
+
end
|
94
|
+
|
95
|
+
urls[url.to_s] = a.text
|
96
|
+
rescue
|
97
|
+
next
|
98
|
+
end
|
99
|
+
end
|
100
|
+
urls
|
101
|
+
end
|
102
|
+
|
103
|
+
def escape(uri)
|
83
104
|
uri.gsub(URIREGEX[:escape]) do
|
84
105
|
'%' + $1.unpack('H2' * $1.size).join('%').upcase
|
85
106
|
end.gsub(' ','%20')
|
86
107
|
end
|
87
108
|
|
88
|
-
def self.unescape(uri)
|
109
|
+
def unescape(uri)
|
89
110
|
uri.tr('+', ' ').gsub(URIREGEX[:unescape]) do
|
90
111
|
[$1.delete('%')].pack('H*')
|
91
112
|
end
|
92
113
|
end
|
93
114
|
|
94
|
-
def self.clean(uri)
|
115
|
+
def clean(uri)
|
95
116
|
normalize(c18n(unescape(uri))).to_s
|
96
117
|
end
|
97
118
|
|
98
|
-
def self.normalize(uri)
|
119
|
+
def normalize(uri)
|
99
120
|
u = parse(uri)
|
100
121
|
u.path = u.path.squeeze('/')
|
101
122
|
u.query = nil if u.query && u.query.empty?
|
@@ -103,7 +124,7 @@ module PostRank
|
|
103
124
|
u
|
104
125
|
end
|
105
126
|
|
106
|
-
def self.c18n(uri)
|
127
|
+
def c18n(uri)
|
107
128
|
u = parse(uri)
|
108
129
|
|
109
130
|
if q = u.query_values(:notation => :flat_array)
|
@@ -115,7 +136,7 @@ module PostRank
|
|
115
136
|
u
|
116
137
|
end
|
117
138
|
|
118
|
-
def self.parse(uri)
|
139
|
+
def parse(uri)
|
119
140
|
return uri if uri.is_a? Addressable::URI
|
120
141
|
|
121
142
|
uri = uri.index(URIREGEX[:protocol]) == 0 ? uri : "http://#{uri}"
|
data/postrank-uri.gemspec
CHANGED
@@ -8,14 +8,15 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
9
|
s.authors = ["Ilya Grigorik"]
|
10
10
|
s.email = ["ilya@igvita.com"]
|
11
|
-
s.homepage = "http://
|
11
|
+
s.homepage = "http://github.com/postrank-labs/postrank-uri"
|
12
12
|
s.summary = "URI normalization, c18n, escaping, and extraction"
|
13
13
|
s.description = s.summary
|
14
14
|
|
15
15
|
s.rubyforge_project = "postrank-uri"
|
16
16
|
|
17
|
-
s.add_dependency "addressable"
|
17
|
+
s.add_dependency "addressable", ">= 2.2.3"
|
18
18
|
s.add_dependency "domainatrix"
|
19
|
+
s.add_dependency "nokogiri"
|
19
20
|
s.add_development_dependency "rspec"
|
20
21
|
|
21
22
|
s.files = `git ls-files`.split("\n")
|
data/spec/postrank-uri_spec.rb
CHANGED
@@ -181,6 +181,41 @@ describe PostRank::URI do
|
|
181
181
|
e("http://www.youtube.com/watch?v=w_j4Lda25jA とんかつ定食").should == ["http://www.youtube.com/watch?v=w_j4Lda25jA"]
|
182
182
|
end
|
183
183
|
end
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
context "href extract" do
|
188
|
+
it "should extract links from html text" do
|
189
|
+
l = PostRank::URI.extract_href("<a href='google.com'>link to google</a> with text <a href='b.com'>stuff</a>")
|
190
|
+
l.keys.size.should == 2
|
191
|
+
|
192
|
+
l.keys.should include('http://google.com/')
|
193
|
+
l.keys.should include('http://b.com/')
|
194
|
+
|
195
|
+
l['http://google.com/'].should == 'link to google'
|
196
|
+
l['http://b.com/'].should == 'stuff'
|
197
|
+
end
|
198
|
+
|
199
|
+
it "should handle empty hrefs" do
|
200
|
+
lambda do
|
201
|
+
l = PostRank::URI.extract_href("<a>link to google</a> with text <a href=''>stuff</a>")
|
202
|
+
l.should be_empty
|
203
|
+
end.should_not raise_error
|
204
|
+
end
|
205
|
+
|
206
|
+
context "relative paths" do
|
207
|
+
it "should reject relative paths" do
|
208
|
+
l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>")
|
209
|
+
l.should be_empty
|
210
|
+
end
|
211
|
+
|
212
|
+
it "should resolve relative paths if host is provided" do
|
213
|
+
l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>", "igvita.com")
|
214
|
+
|
215
|
+
l.size.should == 1
|
216
|
+
l['http://igvita.com/stuff'].should == 'link to stuff'
|
217
|
+
end
|
218
|
+
end
|
184
219
|
end
|
185
220
|
|
186
|
-
end
|
221
|
+
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 1
|
7
7
|
- 0
|
8
|
-
- 0
|
9
|
-
version: 1.0.0
|
8
|
+
- 1
|
9
|
+
version: 1.0.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Ilya Grigorik
|
@@ -14,27 +14,27 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-21 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: addressable
|
22
22
|
prerelease: false
|
23
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
-
none: false
|
25
24
|
requirements:
|
26
25
|
- - ">="
|
27
26
|
- !ruby/object:Gem::Version
|
28
27
|
segments:
|
29
|
-
-
|
30
|
-
|
28
|
+
- 2
|
29
|
+
- 2
|
30
|
+
- 3
|
31
|
+
version: 2.2.3
|
31
32
|
type: :runtime
|
32
33
|
version_requirements: *id001
|
33
34
|
- !ruby/object:Gem::Dependency
|
34
35
|
name: domainatrix
|
35
36
|
prerelease: false
|
36
37
|
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
-
none: false
|
38
38
|
requirements:
|
39
39
|
- - ">="
|
40
40
|
- !ruby/object:Gem::Version
|
@@ -44,18 +44,29 @@ dependencies:
|
|
44
44
|
type: :runtime
|
45
45
|
version_requirements: *id002
|
46
46
|
- !ruby/object:Gem::Dependency
|
47
|
-
name: rspec
|
47
|
+
name: nokogiri
|
48
48
|
prerelease: false
|
49
49
|
requirement: &id003 !ruby/object:Gem::Requirement
|
50
|
-
none: false
|
51
50
|
requirements:
|
52
51
|
- - ">="
|
53
52
|
- !ruby/object:Gem::Version
|
54
53
|
segments:
|
55
54
|
- 0
|
56
55
|
version: "0"
|
57
|
-
type: :development
|
56
|
+
type: :runtime
|
58
57
|
version_requirements: *id003
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: rspec
|
60
|
+
prerelease: false
|
61
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
type: :development
|
69
|
+
version_requirements: *id004
|
59
70
|
description: URI normalization, c18n, escaping, and extraction
|
60
71
|
email:
|
61
72
|
- ilya@igvita.com
|
@@ -66,6 +77,7 @@ extensions: []
|
|
66
77
|
extra_rdoc_files: []
|
67
78
|
|
68
79
|
files:
|
80
|
+
- .gitignore
|
69
81
|
- Gemfile
|
70
82
|
- README.md
|
71
83
|
- Rakefile
|
@@ -77,7 +89,7 @@ files:
|
|
77
89
|
- spec/helper.rb
|
78
90
|
- spec/postrank-uri_spec.rb
|
79
91
|
has_rdoc: true
|
80
|
-
homepage: http://
|
92
|
+
homepage: http://github.com/postrank-labs/postrank-uri
|
81
93
|
licenses: []
|
82
94
|
|
83
95
|
post_install_message:
|
@@ -86,7 +98,6 @@ rdoc_options: []
|
|
86
98
|
require_paths:
|
87
99
|
- lib
|
88
100
|
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
101
|
requirements:
|
91
102
|
- - ">="
|
92
103
|
- !ruby/object:Gem::Version
|
@@ -94,7 +105,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
94
105
|
- 0
|
95
106
|
version: "0"
|
96
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
108
|
requirements:
|
99
109
|
- - ">="
|
100
110
|
- !ruby/object:Gem::Version
|
@@ -104,7 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
114
|
requirements: []
|
105
115
|
|
106
116
|
rubyforge_project: postrank-uri
|
107
|
-
rubygems_version: 1.3.
|
117
|
+
rubygems_version: 1.3.6
|
108
118
|
signing_key:
|
109
119
|
specification_version: 3
|
110
120
|
summary: URI normalization, c18n, escaping, and extraction
|