urlybird 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
module UrlyBird
  # Version of the urlybird gem. Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = "0.0.1".freeze
end
data/lib/urlybird.rb ADDED
@@ -0,0 +1,98 @@
1
+ require 'uri'
2
+ require 'addressable/uri'
3
+
4
# UrlyBird finds URIs inside a piece of text (plain URIs as well as URIs in
# the href attribute of anchor tags), hands each one to a block as an
# Addressable::URI object, and writes the (possibly modified) URI back into
# the returned text.
module UrlyBird

  # Scan +content+ for URIs and return a copy of it with every valid URI
  # replaced by whatever the block turned it into.
  #
  # content - String to search.
  # opts    - Hash of options (merged over default_opts):
  #           :anchors_only - when true, only URIs inside <a href=...> are
  #                           matched (default: false).
  #           :scheme       - scheme list passed to URI.regexp, e.g.
  #                           ['http', 'mailto']; nil matches any scheme.
  #           :extname      - pattern the URL's extension (without the dot)
  #                           must match; URLs without an extension pass.
  #           :host         - pattern the URL's host must match; URLs without
  #                           a host pass.
  # block   - called with an Addressable::URI for each valid match; mutate
  #           the object to change the URL in the output.
  #
  # Without a block, or for matches that fail validation or cannot be parsed
  # by Addressable, the matched text is left untouched.
  #
  # Returns a new String.
  def self.seek(content, opts = {}, &block)
    opts = default_opts.merge(opts)

    content.gsub(regexp(opts)) do |matched|
      # Read the anchor capture first: the String#match call below replaces
      # the last-match data of this scope.
      anchor_url = Regexp.last_match(1)

      # An anchor match always begins with "<a" (optionally "< a"); \A pins
      # the check to the start of the matched text rather than any line start.
      is_anchor = !!matched.match(/\A<\s*a/)

      # For anchors the URL is the captured href value, otherwise the whole
      # match is the URL.
      url_raw = is_anchor ? anchor_url : matched

      begin
        # Anchor hrefs carry HTML-escaped ampersands; un-escape them before
        # parsing.
        url = Addressable::URI.parse(
          is_anchor ? url_raw.gsub('&amp;', '&') : url_raw)

        if block_given? && valid?(url, opts)
          block.call(url)

          # Turn the URL back into a string; clone it because of what seems
          # like internal string caching in Addressable.
          url = url.to_s.clone

          # FIXME: Temporary fix for dollar signs ($) in URLs, which in most
          #        use cases are placeholders and must stay unencoded.
          #        Ideally UrlyBird would offer an option to unencode
          #        specific characters, or leave it to the application.
          url.gsub!('%24', '$')

          # Re-escape ampersands (&) for anchor tag URLs.
          url.gsub!(/&(?!amp;)/, '&amp;') if is_anchor

          # Inject the new URL into the anchor text, or return it as is.
          # The block form of gsub inserts the URL literally; the
          # two-argument form would interpret backslash sequences such as
          # "\0" in the URL as back-references.
          is_anchor ? matched.gsub(url_raw) { url } : url
        else
          matched
        end
      rescue Addressable::URI::InvalidURIError
        # Text that looks like a URI but cannot be parsed is left as found.
        matched
      end
    end
  end

  # Default options for seek.
  def self.default_opts
    { :anchors_only => false }
  end

  # Check +url+ against the :extname and :host filters in +opts+.
  #
  # A filter is only applied when the URL actually has the corresponding
  # part: a URL with an empty extname is never rejected by :extname, and a
  # URL with an empty host is never rejected by :host.
  #
  # Returns true when the URL passes every applicable filter.
  def self.valid?(url, opts)
    # validate extname pattern if provided
    return false if opts[:extname] && !url.extname.empty? &&
      url.extname.delete('.').match(opts[:extname]).nil?

    # validate host pattern if provided
    return false if opts[:host] && !url.host.to_s.empty? &&
      url.host.match(opts[:host]).nil?

    true
  end

  # Regexp matching a bare URI, wrapped in one capture group. Limited to
  # opts[:scheme] when given.
  def self.uri_regexp(opts = {})
    /(#{URI.regexp(opts[:scheme])})/
  end

  # Regexp matching the start of an anchor tag up to and including its href
  # URI; the first capture group is the URI itself.
  def self.anchor_uri_regexp(opts = {})
    /<\s*a\s+[^>]*href\s*=\s*[\"']?(#{uri_regexp(opts)})[\"' >]/
  end

  # Regexp used by seek: anchors only when opts[:anchors_only] is set,
  # otherwise anchors or bare URIs (anchors are tried first so that the first
  # capture group identifies an anchored URI).
  def self.regexp(opts = {})
    url_match = uri_regexp(opts)
    anchors   = anchor_uri_regexp(opts)
    any       = /#{anchors}|#{url_match}/
    opts[:anchors_only] ? anchors : any
  end

  # A bare `private` does not apply to methods defined with `def self.`, so
  # the helpers are hidden explicitly.
  private_class_method :default_opts, :valid?, :uri_regexp,
                       :anchor_uri_regexp, :regexp

end
@@ -0,0 +1,18 @@
1
+ $:.unshift(File.expand_path('../lib',__FILE__))
2
+
3
+ require 'rspec'
4
+
5
+ require 'simplecov'
6
+ SimpleCov.start do
7
+ add_filter 'spec'
8
+ add_filter 'vendor'
9
+ end
10
+
11
+ require 'urlybird'
12
+
13
+ # String helper for large text-inserts
14
class String
  # Strip the leading indentation from every line of a multi-line string.
  #
  # The width to strip is the number of leading spaces on the first line
  # that starts with a space. Only spaces are removed, and at most that many
  # per line, so lines indented less than the reference line keep all of
  # their content.
  #
  # Returns a new String; a string with no indented line is returned as an
  # unchanged copy.
  def undent
    indent = slice(/^ +/)
    return dup unless indent

    gsub(/^ {1,#{indent.length}}/, '')
  end
end
@@ -0,0 +1,324 @@
1
+ require 'spec_helper'
2
+
3
# Behavioural specs for UrlyBird.seek: which URLs get matched under the
# various options, and how URLs modified inside the block are written back.
describe UrlyBird do

  let(:klass) { UrlyBird }

  describe '#seek' do
    let(:content) do
      <<-EOS.undent
        <a href="http://urlybird.com/search?q=wormy&amp;ie=UTF-8">Wormy</a>
        http://urlybird.com/search?q=urly&ie=latin1
      EOS
    end

    context 'when no block is passed' do
      subject { UrlyBird.seek(content) }

      it 'content is returned without modifications' do
        subject.should == content
      end
    end # no block

    context 'when no transforms are specified' do
      subject do
        UrlyBird.seek(content) { |url| }
      end

      it 'content is returned without modifications' do
        subject.should == content
      end
    end # no transforms

    describe 'URL Matching' do
      context 'when given syntactically correct but technically invalid ' +
              'URLs' do
        let(:content) { 'Wormy: http://www.not-so-urlybird.com/' }

        it 'the invalid URL match is silently skipped' do
          matched = []
          klass.seek(content) { |url| matched << url.to_s }
          matched.should == ['http://www.not-so-urlybird.com/']
        end
      end # given syntactically correct but technically invalid URLs

      context 'when some input URLs separates query params with "&amp;"' do
        let(:content) do
          <<-EOS.undent
            <a href="http://urlybird.com/search?q=wormy&amp;ie=UTF-8">Do It</a>
            http://urlybird.com/search?q=wormy&ie=UTF-8
          EOS
        end

        it 'query values are parsed correctly' do
          klass.seek(content) do |url|
            url.query_values.should == {'q' => 'wormy', 'ie' => 'UTF-8'}
          end
        end
      end # query params separated by "&amp;"

      context 'when input has oddly formatted anchor tags' do
        let(:content) do
          "< a href=\"http://urlybird.com/search?q=wormy&amp;ie=UTF-8\">\n" +
          " Do It\n" +
          "</a>\n" +
          "<\n" +
          " a\n" +
          " class=\"foo\"\n" +
          " href=\"http://urlybird.com/search?q=wormy&amp;ie=UTF-8\">\n" +
          " Do It\n" +
          "</a>\n" +
          "http://urlybird.com/search?q=wormy&ie=UTF-8"
        end

        it 'query values are parsed correctly' do
          matched = []
          klass.seek(content) { |url| matched << url.to_s }
          matched.should == [
            'http://urlybird.com/search?q=wormy&ie=UTF-8',
            'http://urlybird.com/search?q=wormy&ie=UTF-8',
            'http://urlybird.com/search?q=wormy&ie=UTF-8',
          ]
        end
      end # oddly formatted anchor tags

      context 'when input anchors have non-href attributes with URL-like ' +
              'values' do
        let(:content) do
          <<-EOS.undent
            <a style="padding: 4px; color: black;" href="http://urlybird.com/">
            Do It
            </a>
          EOS
        end

        it 'only URL-like values within the href attribute is matched' do
          matched = []
          klass.seek(content) { |url| matched << url.to_s }
          matched.should == ['http://urlybird.com/']
        end
      end # non-href attributes with URL-like values

      describe 'anchors_only option' do
        it 'defaults to false' do
          # Plain equality instead of be_false, which RSpec 3 removed; the
          # gemspec allows any rspec >= 2.8.0.
          klass.send(:default_opts)[:anchors_only].should == false
        end

        context 'when set to true' do
          it 'only anchored URLs are matched' do
            matched = []
            klass.seek(content, :anchors_only => true) do |url|
              matched << url.to_s
            end
            matched.should == ['http://urlybird.com/search?q=wormy&ie=UTF-8']
          end
        end # match anchored only

        context 'when set to false' do
          it 'both anchored and plain URLs are matched' do
            matched = []
            klass.seek(content, :anchors_only => false) { |url| matched << url.to_s }
            matched.should == [
              'http://urlybird.com/search?q=wormy&ie=UTF-8',
              'http://urlybird.com/search?q=urly&ie=latin1'
            ]
          end
        end # match anchored and plain
      end # anchors_only option

      describe 'scheme option' do
        let(:urls) do
          [ 'http://www.urlybird.com/', 'https://www.urlybird.com/',
            'ftp://www.urlybird.com/', 'ssh://www.urlybird.com/',
            'mailto:foo@urlybird.com' ]
        end

        let(:content) { urls.join("\n") }

        context 'when not given' do
          it 'any scheme is matched' do
            matched = []
            klass.seek(content) { |url| matched << url.to_s }
            matched.should == urls
          end
        end # when not given

        context 'when given' do
          it 'only URLs of specified schemes are matched' do
            matched = []
            klass.seek(content, :scheme => ['http', 'mailto']) do |url|
              matched << url.to_s
            end
            matched.should == urls.select { |item| item.match(/^(http\:|mailto\:)/) }
          end
        end # when given
      end # scheme option

      describe 'extname option' do
        let(:urls) do
          [ 'http://urlybird.com/foo.php', 'http://adobe.com/foo.cfm',
            'http://lolcats.com/lol.jpg', 'http://lolcats.com/lol.png' ]
        end

        let(:content) { urls.join("\n") }

        context 'when not given' do
          it 'no extension-based filtering is performed' do
            matched = []
            klass.seek(content) { |url| matched << url.to_s }
            matched.should == urls
          end
        end # when not given

        context 'when given' do
          it 'only URLs with matching extensions are matched' do
            matched = []
            klass.seek(content, :extname => /^(?!jpg|png|gif)/) do |url|
              matched << url.to_s
            end
            matched.should == urls.reject { |item| item.match(/\.(jpg|png)$/) }
          end
        end # when given
      end # extname option

      describe 'host option' do
        let(:urls) do
          [ 'http://www.urlybird.com/', 'https://images.urlybird.com/',
            'http://www.not-so-urlybird.com/foo', 'http://wormy.co.uk/' ]
        end

        let(:content) { urls.join("\n") }

        context 'when not given' do
          it 'any host is matched' do
            matched = []
            klass.seek(content) { |url| matched << url.to_s }
            matched.should == urls
          end
        end # when not given

        context 'when given' do
          it 'only URLs of specified hosts are matched' do
            matched = []
            klass.seek(content, :host => /wormy\.com/) do |url|
              matched << url.to_s
            end
            matched.should == urls.select { |item| item.match(/wormy\.com/) }
          end

          context 'when input contains URLs of various schemes/types' do
            let(:urls) do
              [ 'http://www.urlybird.com/', 'http://www.wormy.com/',
                'https://www.urlybird.com/', 'https://www.wormy.com/',
                'ftp://www.urlybird.com/', 'ftp://www.wormy.com/',
                'ssh://www.urlybird.com/', 'ssh://www.wormy.com/',
                'mailto:foo@urlybird.com', 'mailto:foo@wormy.com' ]
            end

            let(:content) { urls.join("\n") }

            it 'URLs matching specified host are matched' do
              matched = []
              klass.seek(content, :host => /urlybird\.com/) do |url|
                matched << url.to_s
              end
              matched.should == urls.reject { |item| item.match(/wormy\.com\/$/) }
            end
          end # when input contains URLs of various schemes/types
        end # when given
      end # host option
    end # URL matching

    describe 'URL Manipulation' do
      context 'when manipulating URLs within anchor tags' do
        let(:content) do
          <<-EOS.undent
            <a href="http://urlybird.com/search?q=wormy&amp;ie=UTF-8">Do It</a>
            http://urlybird.com/search?q=wormy&ie=UTF-8
          EOS
        end

        it 'should escape ampersands (&) to "&amp;"' do
          result = klass.seek(content) { |url| }
          lines = result.split("\n")
          lines[0].should include('?q=wormy&amp;ie=UTF-8')
          lines[1].should include('?q=wormy&ie=UTF-8')
        end
      end # within anchor tags

      context 'when injecting query params' do
        context 'into URLs without any params' do
          let(:content) do
            <<-EOS.undent
              http://www.urlybird.com/
              http://www.urlybird.com/
            EOS
          end

          it 'the query params are added' do
            result = klass.seek(content) do |url|
              url.query_values = (url.query_values || {}).merge(:foo => 'bar')
            end
            # keep only the part after the "?" of every output line
            query_strings = result.split("\n").map { |line| line.split('?').last }
            query_strings.each do |string|
              string.should == 'foo=bar'
            end
          end
        end # without any params

        context 'into URLs with existing params' do
          let(:content) do
            <<-EOS.undent
              http://www.urlybird.com/search?q=wormy&ie=UTF-8
              http://www.urlybird.com/search?q=urly&ie=latin1
            EOS
          end

          it 'the query params are added' do
            result = klass.seek(content) do |url|
              url.query_values = (url.query_values || {}).merge(:foo => 'bar')
            end
            query_strings = result.split("\n").map { |line| line.split('?').last }
            query_strings[0].should include('q=wormy', 'ie=UTF-8', 'foo=bar')
            query_strings[1].should include('q=urly', 'ie=latin1', 'foo=bar')
          end
        end # with existing params

        context 'with dollar signs in their values' do
          let(:content) do
            <<-EOS.undent
              http://www.urlybird.com/search?q=wormy&amp;woo=$$boo$$
              http://www.urlybird.com/search?q=urly&woo=$$boo$$
            EOS
          end

          it 'the dollar signs are left unencoded' do
            result = klass.seek(content) do |url|
              new_query = (url.query_values || {}).merge(:foo => '$$bar$$')
              url.query_values = new_query
            end
            query_strings = result.split("\n").map { |line| line.split('?').last }
            query_strings[0].should include('q=wormy', 'woo=$$boo$$',
                                            'foo=$$bar$$')
            query_strings[1].should include('q=urly', 'woo=$$boo$$',
                                            'foo=$$bar$$')
          end
        end # with dollar signs
      end # injecting query params
    end # URL Manipulation
  end # seek

end # UrlyBird
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: urlybird
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Kriselda Rabino
9
+ - Jim Myhrberg
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-05-17 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ requirement: !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: 2.8.0
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ none: false
27
+ requirements:
28
+ - - ! '>='
29
+ - !ruby/object:Gem::Version
30
+ version: 2.8.0
31
+ - !ruby/object:Gem::Dependency
32
+ name: simplecov
33
+ requirement: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ type: :development
40
+ prerelease: false
41
+ version_requirements: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: addressable
49
+ requirement: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: 2.2.7
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: 2.2.7
63
+ description: Send UrlyBird off into the intricate canopies of your URI-inhabited content,
64
+ and watch him bring you back a beakful of Addressable::URI objects to do with what
65
+ you will.
66
+ email:
67
+ - kriselda.rabino@gmail.com
68
+ - contact@jimeh.me
69
+ executables: []
70
+ extensions: []
71
+ extra_rdoc_files: []
72
+ files:
73
+ - lib/urlybird/version.rb
74
+ - lib/urlybird.rb
75
+ - spec/spec_helper.rb
76
+ - spec/urlybird_spec.rb
77
+ homepage: http://rubygems.org/gems/urlybird
78
+ licenses: []
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ none: false
91
+ requirements:
92
+ - - ! '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubyforge_project:
97
+ rubygems_version: 1.8.19
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: UrlyBird fetches all your URIs in one fell swoop
101
+ test_files:
102
+ - spec/spec_helper.rb
103
+ - spec/urlybird_spec.rb