postrank-uri 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/lib/postrank-uri/version.rb +1 -1
- data/lib/postrank-uri.rb +28 -7
- data/postrank-uri.gemspec +3 -2
- data/spec/postrank-uri_spec.rb +36 -1
- metadata +24 -14
data/.gitignore
ADDED
data/lib/postrank-uri/version.rb
CHANGED
data/lib/postrank-uri.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'addressable/uri'
|
4
4
|
require 'domainatrix'
|
5
|
+
require 'nokogiri'
|
5
6
|
require 'yaml'
|
6
7
|
|
7
8
|
module PostRank
|
@@ -64,7 +65,9 @@ module PostRank
|
|
64
65
|
URIREGEX[:unescape] = /((?:%[0-9a-fA-F]{2})+)/x
|
65
66
|
URIREGEX.each_pair{|k,v| v.freeze }
|
66
67
|
|
67
|
-
|
68
|
+
module_function
|
69
|
+
|
70
|
+
def extract(text)
|
68
71
|
return [] if !text
|
69
72
|
urls = []
|
70
73
|
text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
|
@@ -79,23 +82,41 @@ module PostRank
|
|
79
82
|
urls.compact
|
80
83
|
end
|
81
84
|
|
82
|
-
def self.escape(uri)
|
85
|
+
def extract_href(text, host = nil)
|
86
|
+
urls = {}
|
87
|
+
Nokogiri.HTML(text).search('a').each do |a|
|
88
|
+
begin
|
89
|
+
url = normalize(c18n(unescape(a.attr('href'))))
|
90
|
+
if url.host.empty?
|
91
|
+
next if host.nil?
|
92
|
+
url.host = host
|
93
|
+
end
|
94
|
+
|
95
|
+
urls[url.to_s] = a.text
|
96
|
+
rescue
|
97
|
+
next
|
98
|
+
end
|
99
|
+
end
|
100
|
+
urls
|
101
|
+
end
|
102
|
+
|
103
|
+
def escape(uri)
|
83
104
|
uri.gsub(URIREGEX[:escape]) do
|
84
105
|
'%' + $1.unpack('H2' * $1.size).join('%').upcase
|
85
106
|
end.gsub(' ','%20')
|
86
107
|
end
|
87
108
|
|
88
|
-
def self.unescape(uri)
|
109
|
+
def unescape(uri)
|
89
110
|
uri.tr('+', ' ').gsub(URIREGEX[:unescape]) do
|
90
111
|
[$1.delete('%')].pack('H*')
|
91
112
|
end
|
92
113
|
end
|
93
114
|
|
94
|
-
def self.clean(uri)
|
115
|
+
def clean(uri)
|
95
116
|
normalize(c18n(unescape(uri))).to_s
|
96
117
|
end
|
97
118
|
|
98
|
-
def self.normalize(uri)
|
119
|
+
def normalize(uri)
|
99
120
|
u = parse(uri)
|
100
121
|
u.path = u.path.squeeze('/')
|
101
122
|
u.query = nil if u.query && u.query.empty?
|
@@ -103,7 +124,7 @@ module PostRank
|
|
103
124
|
u
|
104
125
|
end
|
105
126
|
|
106
|
-
def self.c18n(uri)
|
127
|
+
def c18n(uri)
|
107
128
|
u = parse(uri)
|
108
129
|
|
109
130
|
if q = u.query_values(:notation => :flat_array)
|
@@ -115,7 +136,7 @@ module PostRank
|
|
115
136
|
u
|
116
137
|
end
|
117
138
|
|
118
|
-
def self.parse(uri)
|
139
|
+
def parse(uri)
|
119
140
|
return uri if uri.is_a? Addressable::URI
|
120
141
|
|
121
142
|
uri = uri.index(URIREGEX[:protocol]) == 0 ? uri : "http://#{uri}"
|
data/postrank-uri.gemspec
CHANGED
@@ -8,14 +8,15 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
9
|
s.authors = ["Ilya Grigorik"]
|
10
10
|
s.email = ["ilya@igvita.com"]
|
11
|
-
s.homepage = "http://
|
11
|
+
s.homepage = "http://github.com/postrank-labs/postrank-uri"
|
12
12
|
s.summary = "URI normalization, c18n, escaping, and extraction"
|
13
13
|
s.description = s.summary
|
14
14
|
|
15
15
|
s.rubyforge_project = "postrank-uri"
|
16
16
|
|
17
|
-
s.add_dependency "addressable"
|
17
|
+
s.add_dependency "addressable", ">= 2.2.3"
|
18
18
|
s.add_dependency "domainatrix"
|
19
|
+
s.add_dependency "nokogiri"
|
19
20
|
s.add_development_dependency "rspec"
|
20
21
|
|
21
22
|
s.files = `git ls-files`.split("\n")
|
data/spec/postrank-uri_spec.rb
CHANGED
@@ -181,6 +181,41 @@ describe PostRank::URI do
|
|
181
181
|
e("http://www.youtube.com/watch?v=w_j4Lda25jA とんかつ定食").should == ["http://www.youtube.com/watch?v=w_j4Lda25jA"]
|
182
182
|
end
|
183
183
|
end
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
context "href extract" do
|
188
|
+
it "should extract links from html text" do
|
189
|
+
l = PostRank::URI.extract_href("<a href='google.com'>link to google</a> with text <a href='b.com'>stuff</a>")
|
190
|
+
l.keys.size.should == 2
|
191
|
+
|
192
|
+
l.keys.should include('http://google.com/')
|
193
|
+
l.keys.should include('http://b.com/')
|
194
|
+
|
195
|
+
l['http://google.com/'].should == 'link to google'
|
196
|
+
l['http://b.com/'].should == 'stuff'
|
197
|
+
end
|
198
|
+
|
199
|
+
it "should handle empty hrefs" do
|
200
|
+
lambda do
|
201
|
+
l = PostRank::URI.extract_href("<a>link to google</a> with text <a href=''>stuff</a>")
|
202
|
+
l.should be_empty
|
203
|
+
end.should_not raise_error
|
204
|
+
end
|
205
|
+
|
206
|
+
context "relative paths" do
|
207
|
+
it "should reject relative paths" do
|
208
|
+
l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>")
|
209
|
+
l.should be_empty
|
210
|
+
end
|
211
|
+
|
212
|
+
it "should resolve relative paths if host is provided" do
|
213
|
+
l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>", "igvita.com")
|
214
|
+
|
215
|
+
l.size.should == 1
|
216
|
+
l['http://igvita.com/stuff'].should == 'link to stuff'
|
217
|
+
end
|
218
|
+
end
|
184
219
|
end
|
185
220
|
|
186
|
-
end
|
221
|
+
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 1
|
7
7
|
- 0
|
8
|
-
- 0
|
9
|
-
version: 1.0.0
|
8
|
+
- 1
|
9
|
+
version: 1.0.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Ilya Grigorik
|
@@ -14,27 +14,27 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-21 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: addressable
|
22
22
|
prerelease: false
|
23
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
-
none: false
|
25
24
|
requirements:
|
26
25
|
- - ">="
|
27
26
|
- !ruby/object:Gem::Version
|
28
27
|
segments:
|
29
|
-
- 0
|
30
|
-
|
28
|
+
- 2
|
29
|
+
- 2
|
30
|
+
- 3
|
31
|
+
version: 2.2.3
|
31
32
|
type: :runtime
|
32
33
|
version_requirements: *id001
|
33
34
|
- !ruby/object:Gem::Dependency
|
34
35
|
name: domainatrix
|
35
36
|
prerelease: false
|
36
37
|
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
-
none: false
|
38
38
|
requirements:
|
39
39
|
- - ">="
|
40
40
|
- !ruby/object:Gem::Version
|
@@ -44,18 +44,29 @@ dependencies:
|
|
44
44
|
type: :runtime
|
45
45
|
version_requirements: *id002
|
46
46
|
- !ruby/object:Gem::Dependency
|
47
|
-
name: rspec
|
47
|
+
name: nokogiri
|
48
48
|
prerelease: false
|
49
49
|
requirement: &id003 !ruby/object:Gem::Requirement
|
50
|
-
none: false
|
51
50
|
requirements:
|
52
51
|
- - ">="
|
53
52
|
- !ruby/object:Gem::Version
|
54
53
|
segments:
|
55
54
|
- 0
|
56
55
|
version: "0"
|
57
|
-
type: :development
|
56
|
+
type: :runtime
|
58
57
|
version_requirements: *id003
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: rspec
|
60
|
+
prerelease: false
|
61
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
type: :development
|
69
|
+
version_requirements: *id004
|
59
70
|
description: URI normalization, c18n, escaping, and extraction
|
60
71
|
email:
|
61
72
|
- ilya@igvita.com
|
@@ -66,6 +77,7 @@ extensions: []
|
|
66
77
|
extra_rdoc_files: []
|
67
78
|
|
68
79
|
files:
|
80
|
+
- .gitignore
|
69
81
|
- Gemfile
|
70
82
|
- README.md
|
71
83
|
- Rakefile
|
@@ -77,7 +89,7 @@ files:
|
|
77
89
|
- spec/helper.rb
|
78
90
|
- spec/postrank-uri_spec.rb
|
79
91
|
has_rdoc: true
|
80
|
-
homepage: http://
|
92
|
+
homepage: http://github.com/postrank-labs/postrank-uri
|
81
93
|
licenses: []
|
82
94
|
|
83
95
|
post_install_message:
|
@@ -86,7 +98,6 @@ rdoc_options: []
|
|
86
98
|
require_paths:
|
87
99
|
- lib
|
88
100
|
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
101
|
requirements:
|
91
102
|
- - ">="
|
92
103
|
- !ruby/object:Gem::Version
|
@@ -94,7 +105,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
94
105
|
- 0
|
95
106
|
version: "0"
|
96
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
108
|
requirements:
|
99
109
|
- - ">="
|
100
110
|
- !ruby/object:Gem::Version
|
@@ -104,7 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
114
|
requirements: []
|
105
115
|
|
106
116
|
rubyforge_project: postrank-uri
|
107
|
-
rubygems_version: 1.3.
|
117
|
+
rubygems_version: 1.3.6
|
108
118
|
signing_key:
|
109
119
|
specification_version: 3
|
110
120
|
summary: URI normalization, c18n, escaping, and extraction
|