postrank-uri 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ pkg
2
+ Gemfile.lock
@@ -1,5 +1,5 @@
1
1
  module PostRank
2
2
  module URI
3
- VERSION = "1.0.0"
3
+ VERSION = "1.0.1"
4
4
  end
5
5
  end
data/lib/postrank-uri.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'addressable/uri'
4
4
  require 'domainatrix'
5
+ require 'nokogiri'
5
6
  require 'yaml'
6
7
 
7
8
  module PostRank
@@ -64,7 +65,9 @@ module PostRank
64
65
  URIREGEX[:unescape] = /((?:%[0-9a-fA-F]{2})+)/x
65
66
  URIREGEX.each_pair{|k,v| v.freeze }
66
67
 
67
- def self.extract(text)
68
+ module_function
69
+
70
+ def extract(text)
68
71
  return [] if !text
69
72
  urls = []
70
73
  text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
@@ -79,23 +82,41 @@ module PostRank
79
82
  urls.compact
80
83
  end
81
84
 
82
- def self.escape(uri)
85
+ def extract_href(text, host = nil)
86
+ urls = {}
87
+ Nokogiri.HTML(text).search('a').each do |a|
88
+ begin
89
+ url = normalize(c18n(unescape(a.attr('href'))))
90
+ if url.host.empty?
91
+ next if host.nil?
92
+ url.host = host
93
+ end
94
+
95
+ urls[url.to_s] = a.text
96
+ rescue
97
+ next
98
+ end
99
+ end
100
+ urls
101
+ end
102
+
103
+ def escape(uri)
83
104
  uri.gsub(URIREGEX[:escape]) do
84
105
  '%' + $1.unpack('H2' * $1.size).join('%').upcase
85
106
  end.gsub(' ','%20')
86
107
  end
87
108
 
88
- def self.unescape(uri)
109
+ def unescape(uri)
89
110
  uri.tr('+', ' ').gsub(URIREGEX[:unescape]) do
90
111
  [$1.delete('%')].pack('H*')
91
112
  end
92
113
  end
93
114
 
94
- def self.clean(uri)
115
+ def clean(uri)
95
116
  normalize(c18n(unescape(uri))).to_s
96
117
  end
97
118
 
98
- def self.normalize(uri)
119
+ def normalize(uri)
99
120
  u = parse(uri)
100
121
  u.path = u.path.squeeze('/')
101
122
  u.query = nil if u.query && u.query.empty?
@@ -103,7 +124,7 @@ module PostRank
103
124
  u
104
125
  end
105
126
 
106
- def self.c18n(uri)
127
+ def c18n(uri)
107
128
  u = parse(uri)
108
129
 
109
130
  if q = u.query_values(:notation => :flat_array)
@@ -115,7 +136,7 @@ module PostRank
115
136
  u
116
137
  end
117
138
 
118
- def self.parse(uri)
139
+ def parse(uri)
119
140
  return uri if uri.is_a? Addressable::URI
120
141
 
121
142
  uri = uri.index(URIREGEX[:protocol]) == 0 ? uri : "http://#{uri}"
data/postrank-uri.gemspec CHANGED
@@ -8,14 +8,15 @@ Gem::Specification.new do |s|
8
8
  s.platform = Gem::Platform::RUBY
9
9
  s.authors = ["Ilya Grigorik"]
10
10
  s.email = ["ilya@igvita.com"]
11
- s.homepage = "http://rubygems.org/gems/postrank-uri"
11
+ s.homepage = "http://github.com/postrank-labs/postrank-uri"
12
12
  s.summary = "URI normalization, c18n, escaping, and extraction"
13
13
  s.description = s.summary
14
14
 
15
15
  s.rubyforge_project = "postrank-uri"
16
16
 
17
- s.add_dependency "addressable"
17
+ s.add_dependency "addressable", ">= 2.2.3"
18
18
  s.add_dependency "domainatrix"
19
+ s.add_dependency "nokogiri"
19
20
  s.add_development_dependency "rspec"
20
21
 
21
22
  s.files = `git ls-files`.split("\n")
data/spec/postrank-uri_spec.rb CHANGED
@@ -181,6 +181,41 @@ describe PostRank::URI do
181
181
  e("http://www.youtube.com/watch?v=w_j4Lda25jA  とんかつ定食").should == ["http://www.youtube.com/watch?v=w_j4Lda25jA"]
182
182
  end
183
183
  end
184
+
185
+ end
186
+
187
+ context "href extract" do
188
+ it "should extract links from html text" do
189
+ l = PostRank::URI.extract_href("<a href='google.com'>link to google</a> with text <a href='b.com'>stuff</a>")
190
+ l.keys.size.should == 2
191
+
192
+ l.keys.should include('http://google.com/')
193
+ l.keys.should include('http://b.com/')
194
+
195
+ l['http://google.com/'].should == 'link to google'
196
+ l['http://b.com/'].should == 'stuff'
197
+ end
198
+
199
+ it "should handle empty hrefs" do
200
+ lambda do
201
+ l = PostRank::URI.extract_href("<a>link to google</a> with text <a href=''>stuff</a>")
202
+ l.should be_empty
203
+ end.should_not raise_error
204
+ end
205
+
206
+ context "relative paths" do
207
+ it "should reject relative paths" do
208
+ l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>")
209
+ l.should be_empty
210
+ end
211
+
212
+ it "should resolve relative paths if host is provided" do
213
+ l = PostRank::URI.extract_href("<a href='/stuff'>link to stuff</a>", "igvita.com")
214
+
215
+ l.size.should == 1
216
+ l['http://igvita.com/stuff'].should == 'link to stuff'
217
+ end
218
+ end
184
219
  end
185
220
 
186
- end
221
+ end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 1
7
7
  - 0
8
- - 0
9
- version: 1.0.0
8
+ - 1
9
+ version: 1.0.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Ilya Grigorik
@@ -14,27 +14,27 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-01-20 00:00:00 -05:00
17
+ date: 2011-01-21 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: addressable
22
22
  prerelease: false
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
24
  requirements:
26
25
  - - ">="
27
26
  - !ruby/object:Gem::Version
28
27
  segments:
29
- - 0
30
- version: "0"
28
+ - 2
29
+ - 2
30
+ - 3
31
+ version: 2.2.3
31
32
  type: :runtime
32
33
  version_requirements: *id001
33
34
  - !ruby/object:Gem::Dependency
34
35
  name: domainatrix
35
36
  prerelease: false
36
37
  requirement: &id002 !ruby/object:Gem::Requirement
37
- none: false
38
38
  requirements:
39
39
  - - ">="
40
40
  - !ruby/object:Gem::Version
@@ -44,18 +44,29 @@ dependencies:
44
44
  type: :runtime
45
45
  version_requirements: *id002
46
46
  - !ruby/object:Gem::Dependency
47
- name: rspec
47
+ name: nokogiri
48
48
  prerelease: false
49
49
  requirement: &id003 !ruby/object:Gem::Requirement
50
- none: false
51
50
  requirements:
52
51
  - - ">="
53
52
  - !ruby/object:Gem::Version
54
53
  segments:
55
54
  - 0
56
55
  version: "0"
57
- type: :development
56
+ type: :runtime
58
57
  version_requirements: *id003
58
+ - !ruby/object:Gem::Dependency
59
+ name: rspec
60
+ prerelease: false
61
+ requirement: &id004 !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ type: :development
69
+ version_requirements: *id004
59
70
  description: URI normalization, c18n, escaping, and extraction
60
71
  email:
61
72
  - ilya@igvita.com
@@ -66,6 +77,7 @@ extensions: []
66
77
  extra_rdoc_files: []
67
78
 
68
79
  files:
80
+ - .gitignore
69
81
  - Gemfile
70
82
  - README.md
71
83
  - Rakefile
@@ -77,7 +89,7 @@ files:
77
89
  - spec/helper.rb
78
90
  - spec/postrank-uri_spec.rb
79
91
  has_rdoc: true
80
- homepage: http://rubygems.org/gems/postrank-uri
92
+ homepage: http://github.com/postrank-labs/postrank-uri
81
93
  licenses: []
82
94
 
83
95
  post_install_message:
@@ -86,7 +98,6 @@ rdoc_options: []
86
98
  require_paths:
87
99
  - lib
88
100
  required_ruby_version: !ruby/object:Gem::Requirement
89
- none: false
90
101
  requirements:
91
102
  - - ">="
92
103
  - !ruby/object:Gem::Version
@@ -94,7 +105,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
94
105
  - 0
95
106
  version: "0"
96
107
  required_rubygems_version: !ruby/object:Gem::Requirement
97
- none: false
98
108
  requirements:
99
109
  - - ">="
100
110
  - !ruby/object:Gem::Version
@@ -104,7 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
104
114
  requirements: []
105
115
 
106
116
  rubyforge_project: postrank-uri
107
- rubygems_version: 1.3.7
117
+ rubygems_version: 1.3.6
108
118
  signing_key:
109
119
  specification_version: 3
110
120
  summary: URI normalization, c18n, escaping, and extraction