postrank-uri 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ pkg
2
+ Gemfile.lock
@@ -1,5 +1,5 @@
1
1
  module PostRank
2
2
  module URI
3
- VERSION = "1.0.0"
3
+ VERSION = "1.0.1"
4
4
  end
5
5
  end
data/lib/postrank-uri.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'addressable/uri'
4
4
  require 'domainatrix'
5
+ require 'nokogiri'
5
6
  require 'yaml'
6
7
 
7
8
  module PostRank
@@ -64,7 +65,9 @@ module PostRank
64
65
  URIREGEX[:unescape] = /((?:%[0-9a-fA-F]{2})+)/x
65
66
  URIREGEX.each_pair{|k,v| v.freeze }
66
67
 
67
- def self.extract(text)
68
+ module_function
69
+
70
+ def extract(text)
68
71
  return [] if !text
69
72
  urls = []
70
73
  text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
@@ -79,23 +82,41 @@ module PostRank
79
82
  urls.compact
80
83
  end
81
84
 
82
- def self.escape(uri)
85
# Collects the links found in the <a> elements of an HTML fragment.
#
# Each href is run through unescape, c18n and normalize; the resulting URL
# string becomes a hash key whose value is the anchor's text. When the same
# URL occurs more than once, the text of the last anchor wins.
#
# text - HTML markup to scan (handed to Nokogiri.HTML).
# host - optional host assigned to links whose parsed host is empty
#        (for example href='/stuff'); when nil, such links are skipped.
#
# Returns a Hash mapping URL String => anchor text String.
def extract_href(text, host = nil)
  urls = {}
  Nokogiri.HTML(text).search('a').each do |a|
    href = a.attr('href')
    # An anchor without an href carries no link. Skip it explicitly instead
    # of relying on unescape(nil) raising and being swallowed by the rescue.
    next if href.nil?

    begin
      url = normalize(c18n(unescape(href)))
      if url.host.empty?
        next if host.nil?
        url.host = host
      end

      urls[url.to_s] = a.text
    rescue StandardError
      # Best effort: an href that cannot be processed is dropped so that one
      # bad link does not abort extraction of the remaining ones.
      next
    end
  end
  urls
end
102
+
103
# Percent-encodes a URI string.
#
# Every run captured by URIREGEX[:escape] (pattern defined elsewhere in this
# file) is replaced with the upper-case %XX form of each of its bytes; any
# spaces left afterwards are turned into %20.
#
# uri - the String to escape.
#
# Returns the escaped String.
def escape(uri)
  encoded = uri.gsub(URIREGEX[:escape]) do
    matched = Regexp.last_match(1)
    # Count bytes, not characters: 'H2' consumes one byte, so using the
    # character count would drop the trailing bytes of multibyte characters.
    hex_pairs = matched.unpack('H2' * matched.bytesize)
    '%' + hex_pairs.join('%').upcase
  end
  encoded.gsub(' ', '%20')
end
87
108
 
88
- def self.unescape(uri)
109
# Decodes an escaped URI string.
#
# Each '+' is first turned into a space; then every run of %XX escapes
# matched by URIREGEX[:unescape] is replaced with the bytes it encodes.
#
# uri - the String to decode.
#
# Returns the decoded String.
def unescape(uri)
  with_spaces = uri.tr('+', ' ')
  with_spaces.gsub(URIREGEX[:unescape]) do
    hex_digits = Regexp.last_match(1).delete('%')
    [hex_digits].pack('H*')
  end
end
93
114
 
94
- def self.clean(uri)
115
# Produces the cleaned-up string form of a URI by applying unescape, c18n
# and normalize, in that order, and converting the result with to_s.
#
# uri - the URI to clean.
#
# Returns the cleaned URI as a String.
def clean(uri)
  decoded = unescape(uri)
  canonical = c18n(decoded)
  normalize(canonical).to_s
end
97
118
 
98
- def self.normalize(uri)
119
+ def normalize(uri)
99
120
  u = parse(uri)
100
121
  u.path = u.path.squeeze('/')
101
122
  u.query = nil if u.query && u.query.empty?
@@ -103,7 +124,7 @@ module PostRank
103
124
  u
104
125
  end
105
126
 
106
- def self.c18n(uri)
127
+ def c18n(uri)
107
128
  u = parse(uri)
108
129
 
109
130
  if q = u.query_values(:notation => :flat_array)
@@ -115,7 +136,7 @@ module PostRank
115
136
  u
116
137
  end
117
138
 
118
- def self.parse(uri)
139
+ def parse(uri)
119
140
  return uri if uri.is_a? Addressable::URI
120
141
 
121
142
  uri = uri.index(URIREGEX[:protocol]) == 0 ? uri : "http://#{uri}"
data/postrank-uri.gemspec CHANGED
@@ -8,14 +8,15 @@ Gem::Specification.new do |s|
8
8
  s.platform = Gem::Platform::RUBY
9
9
  s.authors = ["Ilya Grigorik"]
10
10
  s.email = ["ilya@igvita.com"]
11
- s.homepage = "http://rubygems.org/gems/postrank-uri"
11
+ s.homepage = "http://github.com/postrank-labs/postrank-uri"
12
12
  s.summary = "URI normalization, c18n, escaping, and extraction"
13
13
  s.description = s.summary
14
14
 
15
15
  s.rubyforge_project = "postrank-uri"
16
16
 
17
- s.add_dependency "addressable"
17
+ s.add_dependency "addressable", ">= 2.2.3"
18
18
  s.add_dependency "domainatrix"
19
+ s.add_dependency "nokogiri"
19
20
  s.add_development_dependency "rspec"
20
21
 
21
22
  s.files = `git ls-files`.split("\n")
@@ -181,6 +181,41 @@ describe PostRank::URI do
181
181
  e("http://www.youtube.com/watch?v=w_j4Lda25jA  とんかつ定食").should == ["http://www.youtube.com/watch?v=w_j4Lda25jA"]
182
182
  end
183
183
  end
184
+
185
+ end
186
+
187
# Specs for PostRank::URI.extract_href: link/text extraction from HTML,
# anchors with missing or empty hrefs, and host-relative links.
context "href extract" do
  it "should extract links from html text" do
    html = "<a href='google.com'>link to google</a> with text <a href='b.com'>stuff</a>"
    links = PostRank::URI.extract_href(html)

    links.keys.size.should == 2
    links.keys.should include('http://google.com/')
    links.keys.should include('http://b.com/')

    links['http://google.com/'].should == 'link to google'
    links['http://b.com/'].should == 'stuff'
  end

  it "should handle empty hrefs" do
    html = "<a>link to google</a> with text <a href=''>stuff</a>"

    lambda {
      links = PostRank::URI.extract_href(html)
      links.should be_empty
    }.should_not raise_error
  end

  context "relative paths" do
    it "should reject relative paths" do
      html = "<a href='/stuff'>link to stuff</a>"
      PostRank::URI.extract_href(html).should be_empty
    end

    it "should resolve relative paths if host is provided" do
      html = "<a href='/stuff'>link to stuff</a>"
      links = PostRank::URI.extract_href(html, "igvita.com")

      links.size.should == 1
      links['http://igvita.com/stuff'].should == 'link to stuff'
    end
  end
end
185
220
 
186
- end
221
+ end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 1
7
7
  - 0
8
- - 0
9
- version: 1.0.0
8
+ - 1
9
+ version: 1.0.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Ilya Grigorik
@@ -14,27 +14,27 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-01-20 00:00:00 -05:00
17
+ date: 2011-01-21 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: addressable
22
22
  prerelease: false
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
24
  requirements:
26
25
  - - ">="
27
26
  - !ruby/object:Gem::Version
28
27
  segments:
29
- - 0
30
- version: "0"
28
+ - 2
29
+ - 2
30
+ - 3
31
+ version: 2.2.3
31
32
  type: :runtime
32
33
  version_requirements: *id001
33
34
  - !ruby/object:Gem::Dependency
34
35
  name: domainatrix
35
36
  prerelease: false
36
37
  requirement: &id002 !ruby/object:Gem::Requirement
37
- none: false
38
38
  requirements:
39
39
  - - ">="
40
40
  - !ruby/object:Gem::Version
@@ -44,18 +44,29 @@ dependencies:
44
44
  type: :runtime
45
45
  version_requirements: *id002
46
46
  - !ruby/object:Gem::Dependency
47
- name: rspec
47
+ name: nokogiri
48
48
  prerelease: false
49
49
  requirement: &id003 !ruby/object:Gem::Requirement
50
- none: false
51
50
  requirements:
52
51
  - - ">="
53
52
  - !ruby/object:Gem::Version
54
53
  segments:
55
54
  - 0
56
55
  version: "0"
57
- type: :development
56
+ type: :runtime
58
57
  version_requirements: *id003
58
+ - !ruby/object:Gem::Dependency
59
+ name: rspec
60
+ prerelease: false
61
+ requirement: &id004 !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ type: :development
69
+ version_requirements: *id004
59
70
  description: URI normalization, c18n, escaping, and extraction
60
71
  email:
61
72
  - ilya@igvita.com
@@ -66,6 +77,7 @@ extensions: []
66
77
  extra_rdoc_files: []
67
78
 
68
79
  files:
80
+ - .gitignore
69
81
  - Gemfile
70
82
  - README.md
71
83
  - Rakefile
@@ -77,7 +89,7 @@ files:
77
89
  - spec/helper.rb
78
90
  - spec/postrank-uri_spec.rb
79
91
  has_rdoc: true
80
- homepage: http://rubygems.org/gems/postrank-uri
92
+ homepage: http://github.com/postrank-labs/postrank-uri
81
93
  licenses: []
82
94
 
83
95
  post_install_message:
@@ -86,7 +98,6 @@ rdoc_options: []
86
98
  require_paths:
87
99
  - lib
88
100
  required_ruby_version: !ruby/object:Gem::Requirement
89
- none: false
90
101
  requirements:
91
102
  - - ">="
92
103
  - !ruby/object:Gem::Version
@@ -94,7 +105,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
94
105
  - 0
95
106
  version: "0"
96
107
  required_rubygems_version: !ruby/object:Gem::Requirement
97
- none: false
98
108
  requirements:
99
109
  - - ">="
100
110
  - !ruby/object:Gem::Version
@@ -104,7 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
104
114
  requirements: []
105
115
 
106
116
  rubyforge_project: postrank-uri
107
- rubygems_version: 1.3.7
117
+ rubygems_version: 1.3.6
108
118
  signing_key:
109
119
  specification_version: 3
110
120
  summary: URI normalization, c18n, escaping, and extraction