urlybird 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
# Namespace of the UrlyBird gem. This file only carries the version number.
module UrlyBird
  # Version of the gem. Frozen so the shared constant cannot be mutated in
  # place by code that reads it.
  VERSION = "0.0.1".freeze
end
data/lib/urlybird.rb ADDED
@@ -0,0 +1,98 @@
1
+ require 'uri'
2
+ require 'addressable/uri'
3
+
4
# UrlyBird scans a string for URIs (bare URLs as well as the href values of
# anchor tags), hands each one to a block as an Addressable::URI and writes the
# URI, including any change the block made to it, back into the string.
module UrlyBird

  # Scans content for URIs and yields every match that passes the filters.
  #
  # content - String to scan.
  # opts    - Hash of options, merged over default_opts:
  #           :anchors_only - when true only URLs inside <a ... href="..."> are
  #                           matched (default: false).
  #           :scheme       - handed to URI.regexp to restrict the schemes that
  #                           are matched (e.g. ['http', 'mailto']); nil
  #                           matches every scheme.
  #           :extname      - pattern the URL's extension (without the dot)
  #                           has to match; URLs without extension always pass.
  #           :host         - pattern the URL's host has to match; URLs without
  #                           host always pass.
  # block   - called with the Addressable::URI of each accepted match. The
  #           block may modify the URI in place; its return value is ignored.
  #
  # Returns a new String. Matches that are filtered out, that cannot be parsed
  # by Addressable, or that are found while no block was given are left
  # exactly as they were.
  def self.seek(content, opts = {}, &block)
    opts = default_opts.merge(opts)

    content.gsub(regexp(opts)) do |matched|
      # Group 1 is the href value when the anchor alternative matched and nil
      # for a bare URL. Read it first: the regexp match below replaces the
      # last-match data.
      href = Regexp.last_match(1)

      # an anchor match always starts with the opening "<" of the tag
      is_anchor = !(matched =~ /\A<\s*a/).nil?

      # for a bare URL the whole match is the raw URL
      url_raw = is_anchor ? href : matched

      begin
        # inside an anchor tag "&" is written as "&amp;"; undo that before
        # parsing so the query string is split correctly
        url = Addressable::URI.parse(
          is_anchor ? url_raw.gsub('&amp;', '&') : url_raw)

        if block_given? && valid?(url, opts)
          block.call(url)

          # turn URL back into a string and clone the string due to what seems
          # like internal string caching in Addressable
          url = url.to_s.clone

          # FIXME: Temporary fix to dealing with dollar signs ($) in URLs
          #        which in most use cases are required as placeholders
          #        and need to remain unencoded
          #
          #        Ideally UrlyBird should provide some form of option to
          #        unencode specific characters, or simply forcing developers
          #        to deal with these kinds of special cases in their apps.
          url.gsub!('%24', '$')

          # escape ampersands (&) in anchor tag URLs
          url.gsub!(/&(?!amp;)/, '&amp;') if is_anchor

          # For an anchor, put the new URL where the old one was. The block
          # form of gsub inserts the URL literally; passing it as the
          # replacement argument would expand "\0", "\1", "\\" inside it.
          is_anchor ? matched.gsub(url_raw) { url } : url
        else
          matched
        end
      rescue Addressable::URI::InvalidURIError
        matched
      end
    end
  end

  # Options used by seek for every key the caller does not pass.
  def self.default_opts
    { :anchors_only => false }
  end

  # Checks a parsed URL against the :extname and :host filters in opts.
  #
  # url  - object responding to extname and host (an Addressable::URI).
  # opts - options Hash as built by seek.
  #
  # Returns false when a given filter does not match a non-empty
  # extension/host, true otherwise.
  def self.valid?(url, opts)
    if opts[:extname]
      extension = url.extname
      return false unless extension.empty? ||
        extension.delete('.').match(opts[:extname])
    end

    if opts[:host]
      host = url.host.to_s
      return false unless host.empty? || host.match(opts[:host])
    end

    true
  end

  # Regexp matching a bare URI, restricted to opts[:scheme] when given.
  def self.uri_regexp(opts = {})
    /(#{URI.regexp(opts[:scheme])})/
  end

  # Regexp matching an opening anchor tag up to and including the character
  # following its href URI; the first capture group is the URI.
  def self.anchor_uri_regexp(opts = {})
    /<\s*a\s+[^>]*href\s*=\s*[\"']?(#{uri_regexp(opts)})[\"' >]/
  end

  # Regexp used by seek: anchors only when opts[:anchors_only] is set,
  # otherwise anchors or bare URIs (the anchor alternative is tried first).
  def self.regexp(opts = {})
    anchors = anchor_uri_regexp(opts)
    return anchors if opts[:anchors_only]

    /#{anchors}|#{uri_regexp(opts)}/
  end

  # A bare "private" does not apply to methods defined with "def self.", so
  # the helpers are hidden explicitly.
  private_class_method :default_opts, :valid?, :uri_regexp,
                       :anchor_uri_regexp, :regexp

end
@@ -0,0 +1,18 @@
1
# Put the gem's lib/ directory on the load path. This file lives in spec/, and
# expand_path treats __FILE__ itself as the starting directory, so two ".."
# hops are needed to reach the gem root ('../lib' would resolve to spec/lib).
$:.unshift(File.expand_path('../../lib', __FILE__))

require 'rspec'

# Coverage is started before the library is required below.
require 'simplecov'
SimpleCov.start do
  add_filter 'spec'
  add_filter 'vendor'
end

require 'urlybird'
13
# String helper for large text-inserts
class String
  # Removes heredoc indentation: the width of the first space-indented line
  # is taken as the indentation, and up to that many leading spaces are
  # stripped from every line.
  #
  # Lines indented less than that only lose the spaces they have, and a string
  # without any indented line is returned unchanged (as a copy).
  #
  # Returns a new String.
  def undent
    width = slice(/^ +/).to_s.length
    gsub(/^ {0,#{width}}/, '')
  end
end
@@ -0,0 +1,324 @@
1
+ require 'spec_helper'
2
+
3
# Specs for UrlyBird.seek: which URLs are yielded under each option, and how
# URLs changed inside the block are written back into the content.
describe UrlyBird do

  let(:klass) { UrlyBird }

  # Runs seek over text and returns the URLs that were yielded to the block,
  # as strings, in the order they were yielded.
  def yielded_urls(text, opts = {})
    seen = []
    klass.seek(text, opts) { |url| seen << url.to_s }
    seen
  end

  # Returns, for every line of text, the part after the last "?".
  def query_string_per_line(text)
    text.split("\n").map { |line| line.split('?').last }
  end

  describe '#seek' do
    let(:content) do
      <<-EOS.undent
        <a href="http://urlybird.com/search?q=wormy&amp;ie=UTF-8">Wormy</a>
        http://urlybird.com/search?q=urly&ie=latin1
      EOS
    end

    context 'when no block is passed' do
      subject { klass.seek(content) }

      it 'content is returned without modifications' do
        subject.should == content
      end
    end # no block

    context 'when no transforms are specified' do
      subject do
        klass.seek(content) { |url| }
      end

      it 'content is returned without modifications' do
        subject.should == content
      end
    end # no transforms

    describe 'URL Matching' do
      context 'when given syntactically correct but technically invalid ' +
        'URLs' do
        let(:content) { 'Wormy: http://www.not-so-urlybird.com/' }

        it 'the invalid URL match is silently skipped' do
          yielded_urls(content).should == ['http://www.not-so-urlybird.com/']
        end
      end # given syntactically correct but technically invalid URLs

      context 'when some input URLs separates query params with "&amp;"' do
        let(:content) do
          <<-EOS.undent
            <a href="http://urlybird.com/search?q=wormy&amp;ie=UTF-8">Do It</a>
            http://urlybird.com/search?q=wormy&ie=UTF-8
          EOS
        end

        it 'query values are parsed correctly' do
          klass.seek(content) do |url|
            url.query_values.should == {'q' => 'wormy', 'ie' => 'UTF-8'}
          end
        end
      end # query params separated by "&amp;"

      context 'when input has oddly formatted anchor tags' do
        let(:content) do
          "< a href=\"http://urlybird.com/search?q=wormy&amp;ie=UTF-8\">\n" +
          " Do It\n" +
          "</a>\n" +
          "<\n" +
          " a\n" +
          " class=\"foo\"\n" +
          " href=\"http://urlybird.com/search?q=wormy&amp;ie=UTF-8\">\n" +
          " Do It\n" +
          "</a>\n" +
          "http://urlybird.com/search?q=wormy&ie=UTF-8"
        end

        it 'query values are parsed correctly' do
          yielded_urls(content).should == [
            'http://urlybird.com/search?q=wormy&ie=UTF-8',
            'http://urlybird.com/search?q=wormy&ie=UTF-8',
            'http://urlybird.com/search?q=wormy&ie=UTF-8',
          ]
        end
      end # oddly formatted anchor tags

      context 'when input anchors have non-href attributes with URL-like ' +
        'values' do
        let(:content) do
          <<-EOS.undent
            <a style="padding: 4px; color: black;" href="http://urlybird.com/">
            Do It
            </a>
          EOS
        end

        it 'only URL-like values within the href attribute is matched' do
          yielded_urls(content).should == ['http://urlybird.com/']
        end
      end # non-href attributes with URL-like values

      describe 'anchors_only option' do
        it 'defaults to false' do
          # default_opts is an internal helper, hence the send
          klass.send(:default_opts)[:anchors_only].should be_false
        end

        context 'when set to true' do
          it 'only anchored URLs are matched' do
            yielded_urls(content, :anchors_only => true).should ==
              ['http://urlybird.com/search?q=wormy&ie=UTF-8']
          end
        end # match anchored only

        context 'when set to false' do
          it 'both anchored and plain URLs are matched' do
            yielded_urls(content, :anchors_only => false).should == [
              'http://urlybird.com/search?q=wormy&ie=UTF-8',
              'http://urlybird.com/search?q=urly&ie=latin1'
            ]
          end
        end # match anchored and plain
      end # anchors_only option

      describe 'scheme option' do
        let(:urls) do
          [ 'http://www.urlybird.com/', 'https://www.urlybird.com/',
            'ftp://www.urlybird.com/', 'ssh://www.urlybird.com/',
            'mailto:foo@urlybird.com' ]
        end

        let(:content) { urls.join("\n") }

        context 'when not given' do
          it 'any scheme is matched' do
            yielded_urls(content).should == urls
          end
        end # when not given

        context 'when given' do
          it 'only URLs of specified schemes are matched' do
            expected = urls.select do |item|
              item.match(/^(http\:|mailto\:)/)
            end
            yielded_urls(content, :scheme => ['http', 'mailto']).should ==
              expected
          end
        end # when given
      end # scheme option

      describe 'extname option' do
        let(:urls) do
          [ 'http://urlybird.com/foo.php', 'http://adobe.com/foo.cfm',
            'http://lolcats.com/lol.jpg', 'http://lolcats.com/lol.png' ]
        end

        let(:content) { urls.join("\n") }

        context 'when not given' do
          it 'no extension-based filtering is performed' do
            yielded_urls(content).should == urls
          end
        end # when not given

        context 'when given' do
          it 'only URLs with matching extensions are matched' do
            expected = urls.reject do |item|
              item.match(/\.(jpg|png)$/)
            end
            yielded_urls(content, :extname => /^(?!jpg|png|gif)/).should ==
              expected
          end
        end # when given
      end # extname option

      describe 'host option' do
        let(:urls) do
          [ 'http://www.urlybird.com/', 'https://images.urlybird.com/',
            'http://www.not-so-urlybird.com/foo', 'http://wormy.co.uk/' ]
        end

        let(:content) { urls.join("\n") }

        context 'when not given' do
          it 'any host is matched' do
            yielded_urls(content).should == urls
          end
        end # when not given

        context 'when given' do
          it 'only URLs of specified hosts are matched' do
            expected = urls.select do |item|
              item.match(/wormy\.com/)
            end
            yielded_urls(content, :host => /wormy\.com/).should == expected
          end

          context 'when input contains URLs of various schemes/types' do
            let(:urls) do
              [ 'http://www.urlybird.com/', 'http://www.wormy.com/',
                'https://www.urlybird.com/', 'https://www.wormy.com/',
                'ftp://www.urlybird.com/', 'ftp://www.wormy.com/',
                'ssh://www.urlybird.com/', 'ssh://www.wormy.com/',
                'mailto:foo@urlybird.com', 'mailto:foo@wormy.com' ]
            end

            let(:content) { urls.join("\n") }

            it 'URLs matching specified host are matched' do
              # the mailto: URLs stay in the expected list: the reject
              # pattern only drops URLs ending in "wormy.com/"
              expected = urls.reject do |item|
                item.match(/wormy\.com\/$/)
              end
              yielded_urls(content, :host => /urlybird\.com/).should ==
                expected
            end
          end # when input contains URLs of various schemes/types
        end # when given
      end # host option
    end # URL Matching

    describe 'URL Manipulation' do
      context 'when manipulating URLs within anchor tags' do
        let(:content) do
          <<-EOS.undent
            <a href="http://urlybird.com/search?q=wormy&amp;ie=UTF-8">Do It</a>
            http://urlybird.com/search?q=wormy&ie=UTF-8
          EOS
        end

        it 'should escape ampersands (&) to "&amp;"' do
          result = klass.seek(content) { |url| }
          lines = result.split("\n")
          lines[0].should include('?q=wormy&amp;ie=UTF-8')
          lines[1].should include('?q=wormy&ie=UTF-8')
        end
      end # within anchor tags

      context 'when injecting query params' do
        context 'into URLs without any params' do
          let(:content) do
            <<-EOS.undent
              http://www.urlybird.com/
              http://www.urlybird.com/
            EOS
          end

          it 'the query params are added' do
            result = klass.seek(content) do |url|
              url.query_values = (url.query_values || {}).merge(:foo => 'bar')
            end
            query_string_per_line(result).each do |string|
              string.should == 'foo=bar'
            end
          end
        end # without any params

        context 'into URLs with existing params' do
          let(:content) do
            <<-EOS.undent
              http://www.urlybird.com/search?q=wormy&ie=UTF-8
              http://www.urlybird.com/search?q=urly&ie=latin1
            EOS
          end

          it 'the query params are added' do
            result = klass.seek(content) do |url|
              url.query_values = (url.query_values || {}).merge(:foo => 'bar')
            end
            queries = query_string_per_line(result)
            queries[0].should include('q=wormy', 'ie=UTF-8', 'foo=bar')
            queries[1].should include('q=urly', 'ie=latin1', 'foo=bar')
          end
        end # with existing params

        context 'with dollar signs in their values' do
          let(:content) do
            <<-EOS.undent
              http://www.urlybird.com/search?q=wormy&amp;woo=$$boo$$
              http://www.urlybird.com/search?q=urly&woo=$$boo$$
            EOS
          end

          it 'the dollar signs are left unencoded' do
            result = klass.seek(content) do |url|
              new_query = (url.query_values || {}).merge(:foo => '$$bar$$')
              url.query_values = new_query
            end
            queries = query_string_per_line(result)
            queries[0].should include('q=wormy', 'woo=$$boo$$',
                                      'foo=$$bar$$')
            queries[1].should include('q=urly', 'woo=$$boo$$',
                                      'foo=$$bar$$')
          end
        end # with dollar signs
      end # injecting query params
    end # URL Manipulation
  end # seek

end # UrlyBird
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: urlybird
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Kriselda Rabino
9
+ - Jim Myhrberg
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-05-17 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ requirement: !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: 2.8.0
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ none: false
27
+ requirements:
28
+ - - ! '>='
29
+ - !ruby/object:Gem::Version
30
+ version: 2.8.0
31
+ - !ruby/object:Gem::Dependency
32
+ name: simplecov
33
+ requirement: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ type: :development
40
+ prerelease: false
41
+ version_requirements: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: addressable
49
+ requirement: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: 2.2.7
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: 2.2.7
63
+ description: Send UrlyBird off into the intricate canopies of your URI-inhabited content,
64
+ and watch him bring you back a beakful of Addressable::URI objects to do with what
65
+ you will.
66
+ email:
67
+ - kriselda.rabino@gmail.com
68
+ - contact@jimeh.me
69
+ executables: []
70
+ extensions: []
71
+ extra_rdoc_files: []
72
+ files:
73
+ - lib/urlybird/version.rb
74
+ - lib/urlybird.rb
75
+ - spec/spec_helper.rb
76
+ - spec/urlybird_spec.rb
77
+ homepage: http://rubygems.org/gems/urlybird
78
+ licenses: []
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ none: false
91
+ requirements:
92
+ - - ! '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubyforge_project:
97
+ rubygems_version: 1.8.19
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: UrlyBird fetches all your URIs in one fell swoop
101
+ test_files:
102
+ - spec/spec_helper.rb
103
+ - spec/urlybird_spec.rb