content_urls 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,64 +1,64 @@
1
- require 'uri'
2
-
3
- class ContentUrls
4
-
5
- # +JavaScriptParser+ finds and rewrites URLs in JavaScript content.
6
- #
7
- # === Implementation note:
8
- # This methods in this class identify URLs by locating strings which match +URI+'s regexp.
9
- class JavaScriptParser
10
-
11
- # Returns the URLs found in the JavaScript content.
12
- #
13
- # @param [String] content the JavaScript content.
14
- # @return [Array] the unique URLs found in the content.
15
- #
16
- # @example Parse JavaScript code for URLs
17
- # javascript = 'var link="http://example.com/"'
18
- # ContentUrls::JavaScriptParser.urls(javascript).each do |url|
19
- # puts "Found URL: #{url}"
20
- # end
21
- # # => "Found URL: http://example.com/"
22
- def self.urls(content)
23
- urls = []
24
- URI.extract(content).each { |u| urls << u }
25
- urls.uniq!
26
- urls
27
- end
28
-
29
- # Rewrites each URL in the JavaScript content by calling the supplied block with each URL.
30
- #
31
- # @param [String] content the JavaScript content.
32
- #
33
- # @example Rewrite URLs in JavaScript code
34
- # javascript = 'var link="http://example.com/"'
35
- # javascript = ContentUrls::JavaScriptParser.rewrite_each_url(javascript) {|url| url.upcase}
36
- # puts "Rewritten: #{javascript}"
37
- # # => "Rewritten: var link="HTTP://EXAMPLE.COM/""
38
- #
39
- def self.rewrite_each_url(content, &block)
40
- done = false
41
- remaining = content
42
- rewritten = ''
43
- while ! remaining.empty?
44
- if match = URI.regexp.match(remaining)
45
- url = match.to_s
46
- rewritten += match.pre_match
47
- replacement = url.nil? ? nil : (yield url)
48
- if replacement.nil? or replacement == url # no change in URL
49
- rewritten += url[0]
50
- remaining = url[1..-1] + match.post_match
51
- else
52
- rewritten += replacement
53
- remaining = match.post_match
54
- end
55
- else
56
- rewritten += remaining
57
- remaining = ''
58
- end
59
- end
60
- return rewritten
61
- end
62
-
63
- end
64
- end
1
+ require 'uri'
2
+
3
class ContentUrls

  # +JavaScriptParser+ finds and rewrites URLs in JavaScript content.
  #
  # === Implementation note:
  # The methods in this class identify URLs by locating strings which match +URI+'s regexp.
  class JavaScriptParser

    # Returns the URLs found in the JavaScript content.
    #
    # @param [String] content the JavaScript content.
    # @return [Array] the unique URLs found in the content, in order of first appearance.
    #
    # @example Parse JavaScript code for URLs
    #   javascript = 'var link="http://example.com/"'
    #   ContentUrls::JavaScriptParser.urls(javascript).each do |url|
    #     puts "Found URL: #{url}"
    #   end
    #   # => "Found URL: http://example.com/"
    def self.urls(content)
      URI.extract(content).uniq
    end

    # Rewrites each URL in the JavaScript content by calling the supplied block with each URL.
    #
    # Every match of +URI+'s regexp is yielded exactly once, as a whole. The block's
    # return value replaces the URL; returning +nil+ (or the same URL) leaves it untouched.
    # Non-String return values (e.g. a +URI+ object) are converted with +to_s+.
    #
    # @param [String] content the JavaScript content.
    # @yield [url] each URL found in the content.
    # @yieldparam [String] url the matched URL.
    # @yieldreturn [#to_s, nil] the replacement URL, or +nil+ to keep the original.
    # @return [String] a new string with the rewritten URLs; +content+ is not modified.
    #
    # @example Rewrite URLs in JavaScript code
    #   javascript = 'var link="http://example.com/"'
    #   javascript = ContentUrls::JavaScriptParser.rewrite_each_url(javascript) {|url| url.upcase}
    #   puts "Rewritten: #{javascript}"
    #   # => "Rewritten: var link="HTTP://EXAMPLE.COM/""
    #
    def self.rewrite_each_url(content, &block)
      # gsub consumes each match in full before searching again, so an unchanged
      # URL is never re-scanned character by character and re-yielded as fragments.
      content.gsub(URI.regexp) do |url|
        replacement = yield url
        replacement.nil? ? url : replacement.to_s
      end
    end

  end
end
@@ -1,3 +1,3 @@
1
- class ContentUrls
2
- VERSION = "0.1.0"
3
- end
1
class ContentUrls
  # Version string of the content_urls library. Frozen so the shared constant
  # cannot be mutated in place by callers.
  VERSION = "0.1.1".freeze
end
@@ -1,29 +1,29 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
-
3
- describe ContentUrls.to_absolute(nil, 'http://www.sample.com/') do
4
- it "returns nil when url is nil" do
5
- ContentUrls.to_absolute(nil, 'http://www.sample.com/').should eq nil
6
- end
7
- end
8
-
9
- describe ContentUrls.to_absolute('index.html', 'http://www.sample.com/') do
10
- it "merges url to base_url" do
11
- ContentUrls.to_absolute('index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/one/two/three/index.html'
12
- ContentUrls.to_absolute('/index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/index.html'
13
- ContentUrls.to_absolute('/four/index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/four/index.html'
14
- ContentUrls.to_absolute('../index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/one/two/index.html'
15
- ContentUrls.to_absolute('../four/index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/one/two/four/index.html'
16
- end
17
- end
18
-
19
- describe ContentUrls.get_parser('bogus/bogus') do
20
- it "returns nil when content type is unknown" do
21
- ContentUrls.get_parser('bogus/bogus').should eq nil
22
- end
23
- end
24
-
25
- describe ContentUrls.register_parser('some_parser_class', %r{^(content/test)\b}) do
26
- it "returns the class for the content type" do
27
- ContentUrls.get_parser('content/test').should eq 'some_parser_class'
28
- end
29
- end
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe ContentUrls.to_absolute(nil, 'http://www.sample.com/') do
4
+ it "returns nil when url is nil" do
5
+ ContentUrls.to_absolute(nil, 'http://www.sample.com/').should eq nil
6
+ end
7
+ end
8
+
9
+ describe ContentUrls.to_absolute('index.html', 'http://www.sample.com/') do
10
+ it "merges url to base_url" do
11
+ ContentUrls.to_absolute('index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/one/two/three/index.html'
12
+ ContentUrls.to_absolute('/index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/index.html'
13
+ ContentUrls.to_absolute('/four/index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/four/index.html'
14
+ ContentUrls.to_absolute('../index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/one/two/index.html'
15
+ ContentUrls.to_absolute('../four/index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/one/two/four/index.html'
16
+ end
17
+ end
18
+
19
+ describe ContentUrls.get_parser('bogus/bogus') do
20
+ it "returns nil when content type is unknown" do
21
+ ContentUrls.get_parser('bogus/bogus').should eq nil
22
+ end
23
+ end
24
+
25
+ describe ContentUrls.register_parser('some_parser_class', %r{^(content/test)\b}) do
26
+ it "returns the class for the content type" do
27
+ ContentUrls.get_parser('content/test').should eq 'some_parser_class'
28
+ end
29
+ end
@@ -1,34 +1,34 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
-
3
- describe ContentUrls::CssParser do
4
- it "should return no URLs given no content" do
5
- ContentUrls::CssParser.urls('').should eq []
6
- end
7
- it "should return no URLs given garbage content" do
8
- ContentUrls::CssParser.urls('j;alksdjfkladsjflkajdfaksdjfsdj kladjsf lkfjalkdfj lkajdf9458094djjf').should eq []
9
- end
10
- end
11
-
12
- describe ContentUrls::CssParser do
13
- it "should return the URLs in the content" do
14
- ContentUrls::CssParser.urls("body {background-image:url('image.png');}").first.should eq 'image.png'
15
- end
16
- end
17
-
18
- describe ContentUrls::CssParser do
19
- it "should execute the sample code for rewrite_each_url method" do
20
- output = ''
21
- css = 'body { background: url(/images/rainbows.jpg) }'
22
- css = ContentUrls::CssParser.rewrite_each_url(css) {|url| url.sub(/rainbows.jpg/, 'unicorns.jpg')}
23
- output += "Rewritten: #{css}" + "\n"
24
- output.should eq %Q{Rewritten: body { background: url(/images/unicorns.jpg) }\n}
25
- end
26
- it "should execute sample code for urls method" do
27
- output = ''
28
- css = 'body { background: url(/images/rainbows.jpg) }'
29
- ContentUrls::CssParser.urls(css).each do |url|
30
- output += "Found URL: #{url}" + "\n"
31
- end
32
- output.should eq %Q{Found URL: /images/rainbows.jpg\n}
33
- end
34
- end
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe ContentUrls::CssParser do
4
+ it "should return no URLs given no content" do
5
+ ContentUrls::CssParser.urls('').should eq []
6
+ end
7
+ it "should return no URLs given garbage content" do
8
+ ContentUrls::CssParser.urls('j;alksdjfkladsjflkajdfaksdjfsdj kladjsf lkfjalkdfj lkajdf9458094djjf').should eq []
9
+ end
10
+ end
11
+
12
+ describe ContentUrls::CssParser do
13
+ it "should return the URLs in the content" do
14
+ ContentUrls::CssParser.urls("body {background-image:url('image.png');}").first.should eq 'image.png'
15
+ end
16
+ end
17
+
18
+ describe ContentUrls::CssParser do
19
+ it "should execute the sample code for rewrite_each_url method" do
20
+ output = ''
21
+ css = 'body { background: url(/images/rainbows.jpg) }'
22
+ css = ContentUrls::CssParser.rewrite_each_url(css) {|url| url.sub(/rainbows.jpg/, 'unicorns.jpg')}
23
+ output += "Rewritten: #{css}" + "\n"
24
+ output.should eq %Q{Rewritten: body { background: url(/images/unicorns.jpg) }\n}
25
+ end
26
+ it "should execute sample code for urls method" do
27
+ output = ''
28
+ css = 'body { background: url(/images/rainbows.jpg) }'
29
+ ContentUrls::CssParser.urls(css).each do |url|
30
+ output += "Found URL: #{url}" + "\n"
31
+ end
32
+ output.should eq %Q{Found URL: /images/rainbows.jpg\n}
33
+ end
34
+ end
@@ -1,318 +1,358 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
-
3
- describe ContentUrls::HtmlParser do
4
- it "should return no URLs when given no content" do
5
- ContentUrls::HtmlParser.urls('').should eq []
6
- end
7
- it "should return no URLs when given garbage content" do
8
- ContentUrls::HtmlParser.urls('j;alksdjfkladsjflkajdfaksdjfsdj kladjsf lkfjalkdfj lkajdf9458094djjf').should eq []
9
- end
10
- end
11
-
12
- describe ContentUrls::HtmlParser do
13
- it "should return the URLs in the content" do
14
- ContentUrls::HtmlParser.urls("<a href='index.html").first.should eq 'index.html'
15
- end
16
- end
17
-
18
- describe ContentUrls::HtmlParser do
19
- it "should parse HTML Sample 1 and return all a links" do
20
-
21
- html_sample_1 =<<SAMPLE_1
22
- <html>
23
- <head>
24
- <title>HTML Sample 1</title>
25
- </head>
26
- <body>
27
- <h1>HTML Sample 1</h1>
28
- <a href="a-href-link-1.html"></a>
29
- <a href="http://www.example.com/1/2/3/a-href-link-2.html"></a>
30
- <a href="/folder/a-href-link-3.html?a=1"></a>
31
- </body>
32
- </html>
33
- SAMPLE_1
34
-
35
- urls = ContentUrls::HtmlParser.urls(html_sample_1)
36
- urls.include?('a-href-link-1.html').should eq true
37
- urls.include?('http://www.example.com/1/2/3/a-href-link-2.html').should eq true
38
- urls.include?('/folder/a-href-link-3.html?a=1').should eq true
39
- end
40
- end
41
-
42
- describe ContentUrls::HtmlParser do
43
- it "should parse HTML Sample 1 and rewrite all a links" do
44
-
45
- html_sample_1 =<<SAMPLE_1
46
- <html>
47
- <head>
48
- <title>HTML Sample 1</title>
49
- </head>
50
- <body>
51
- <h1>HTML Sample 1</h1>
52
- <a href="a-href-link-1.html"></a>
53
- <a href="http://www.example.com/1/2/3/a-href-link-2.html"></a>
54
- <a href="/folder/a-href-link-3.html?a=1"></a>
55
- </body>
56
- </html>
57
- SAMPLE_1
58
-
59
- content = ContentUrls::HtmlParser.rewrite_each_url(html_sample_1) do |url|
60
- url = URI.parse url
61
- url.path = url.path.sub(/\.html\b/, '.php')
62
- url
63
- end
64
- urls = ContentUrls::HtmlParser.urls(content)
65
- urls.include?('a-href-link-1.php').should eq true
66
- urls.include?('http://www.example.com/1/2/3/a-href-link-2.php').should eq true
67
- urls.include?('/folder/a-href-link-3.php?a=1').should eq true
68
- end
69
- end
70
-
71
- describe ContentUrls::HtmlParser do
72
- it "should parse HTML Sample 2 and return all 'area href' URLs" do
73
-
74
- html_sample_2 =<<SAMPLE_2
75
- <html>
76
- <head>
77
- <title>HTML Sample 2</title>
78
- </head>
79
- <body>
80
- <h1>HTML Sample 2</h1>
81
- <img src="sample.gif" width="200" height="200" alt="Click somewhere" usemap="#sample-map">
82
- <map name="sample-map">
83
- <area shape="rect" coords="0,0,100,100" href="area-href-link-1.html" alt="link 1">
84
- <area shape="circle" coords="150,150,2" href="http://www.example.com/1/2/3/area-href-link-2.html" alt="link 2">
85
- <area shape="circle" coords="100,180,1" href="/folder/area-href-link-3.html?a=1" alt="link 3">
86
- </map>
87
- </body>
88
- </html>
89
- SAMPLE_2
90
-
91
- urls = ContentUrls::HtmlParser.urls(html_sample_2)
92
- urls.include?('area-href-link-1.html').should eq true
93
- urls.include?('http://www.example.com/1/2/3/area-href-link-2.html').should eq true
94
- urls.include?('/folder/area-href-link-3.html?a=1').should eq true
95
- end
96
- end
97
-
98
- describe ContentUrls::HtmlParser do
99
- it "should parse HTML Sample 3 and return 'body background' URL" do
100
-
101
- html_sample_3 =<<SAMPLE_3
102
- <html>
103
- <head>
104
- <title>HTML Sample 3</title>
105
- </head>
106
- <body background="/images/background.png">
107
- <h1>HTML Sample 3</h1>
108
- </body>
109
- </html>
110
- SAMPLE_3
111
-
112
- urls = ContentUrls::HtmlParser.urls(html_sample_3)
113
- urls.first.should eq '/images/background.png'
114
- end
115
- end
116
-
117
- describe ContentUrls::HtmlParser do
118
- it "should parse HTML Sample 4 and return 'embed src' URL" do
119
-
120
- html_sample_4 =<<SAMPLE_4
121
- <html>
122
- <head>
123
- <title>HTML Sample 4</title>
124
- </head>
125
- <body>
126
- <h1>HTML Sample 4</h1>
127
- <embed src="sample.swf" />
128
- </body>
129
- </html>
130
- SAMPLE_4
131
-
132
- urls = ContentUrls::HtmlParser.urls(html_sample_4)
133
- urls.first.should eq 'sample.swf'
134
- end
135
- end
136
-
137
- describe ContentUrls::HtmlParser do
138
- it "should parse HTML Sample 5 and return 'img src' URL" do
139
-
140
- html_sample_5 =<<SAMPLE_5
141
- <html>
142
- <head>
143
- <title>HTML Sample 5</title>
144
- </head>
145
- <body>
146
- <h1>HTML Sample 5</h1>
147
- <img src="sample.gif">
148
- </body>
149
- </html>
150
- SAMPLE_5
151
-
152
- urls = ContentUrls::HtmlParser.urls(html_sample_5)
153
- urls.first.should eq 'sample.gif'
154
- end
155
- end
156
-
157
- describe ContentUrls::HtmlParser do
158
- it "should parse HTML Sample 6 and return 'link href' URL" do
159
-
160
- html_sample_6 =<<SAMPLE_6
161
- <html>
162
- <head>
163
- <title>HTML Sample 6</title>
164
- <link href="/index.php" REL="index">
165
- </head>
166
- <body>
167
- <h1>HTML Sample 6</h1>
168
- </body>
169
- </html>
170
- SAMPLE_6
171
-
172
- urls = ContentUrls::HtmlParser.urls(html_sample_6)
173
- urls.first.should eq '/index.php'
174
- end
175
- end
176
-
177
- describe ContentUrls::HtmlParser do
178
- it "should parse HTML Sample 7 and return 'object data' URL" do
179
-
180
- html_sample_7 =<<SAMPLE_7
181
- <html>
182
- <head>
183
- <title>HTML Sample 7</title>
184
- </head>
185
- <body>
186
- <h1>HTML Sample 7</h1>
187
- <object width="400" height="400" data="/stuff/example.swf"></object>
188
- </body>
189
- </html>
190
- SAMPLE_7
191
-
192
- urls = ContentUrls::HtmlParser.urls(html_sample_7)
193
- urls.first.should eq '/stuff/example.swf'
194
- end
195
- end
196
-
197
- describe ContentUrls::HtmlParser do
198
- it "should parse HTML Sample 8 and return 'script src' URL" do
199
-
200
- html_sample_8 =<<SAMPLE_8
201
- <html>
202
- <head>
203
- <title>HTML Sample 8</title>
204
- </head>
205
- <body>
206
- <h1>HTML Sample 8</h1>
207
- <script language="javascript" src="../scripts/go.js"></script>
208
- </body>
209
- </html>
210
- SAMPLE_8
211
-
212
- urls = ContentUrls::HtmlParser.urls(html_sample_8)
213
- urls.first.should eq '../scripts/go.js'
214
- end
215
- end
216
-
217
- describe ContentUrls::HtmlParser do
218
- it "should parse HTML Sample 9 and return 'meta content' URL" do
219
-
220
- html_sample_9 =<<SAMPLE_9
221
- <html>
222
- <head>
223
- <title>HTML Sample 9</title>
224
- <meta http-equiv="refresh" content="5;URL='http://example.com/'">
225
- </head>
226
- <body>
227
- <h1>HTML Sample 9</h1>
228
- </body>
229
- </html>
230
- SAMPLE_9
231
-
232
- urls = ContentUrls::HtmlParser.urls(html_sample_9)
233
- urls.first.should eq 'http://example.com/'
234
- end
235
- end
236
-
237
- describe ContentUrls::HtmlParser do
238
- it "should parse HTML Sample 10 and return URLs found within 'style' attributes" do
239
-
240
- html_sample_10 =<<SAMPLE_10
241
- <html>
242
- <head>
243
- <title>HTML Sample 10</title>
244
- </head>
245
- <body style="background-image:url('background.jpg');">
246
- <h1>HTML Sample 10</h1>
247
- </body>
248
- </html>
249
- SAMPLE_10
250
-
251
- urls = ContentUrls::HtmlParser.urls(html_sample_10)
252
- urls.first.should eq 'background.jpg'
253
- end
254
- end
255
-
256
- describe ContentUrls::HtmlParser do
257
- it "should parse HTML Sample 11 and return URLs found within 'style' tags" do
258
-
259
- html_sample_11 =<<SAMPLE_11
260
- <html>
261
- <head>
262
- <title>HTML Sample 11</title>
263
- <style type="text/css">
264
- body {background-image:url('/image/background.jpg');}
265
- </style>
266
- </head>
267
- <body>
268
- <h1>HTML Sample 11</h1>
269
- </body>
270
- </html>
271
- SAMPLE_11
272
-
273
- urls = ContentUrls::HtmlParser.urls(html_sample_11)
274
- urls.first.should eq '/image/background.jpg'
275
- end
276
- end
277
-
278
- describe ContentUrls::HtmlParser do
279
- it "should parse HTML Sample 12 and return URLs found within 'script' tags" do
280
-
281
- html_sample_12 =<<SAMPLE_12
282
- <html>
283
- <head>
284
- <title>HTML Sample 12</title>
285
- <script type="text/javascript">
286
- var link="http://www.sample.com/index.html"
287
- // ...
288
- </script>
289
- </head>
290
- <body>
291
- <h1>HTML Sample 12</h1>
292
- </body>
293
- </html>
294
- SAMPLE_12
295
-
296
- urls = ContentUrls::HtmlParser.urls(html_sample_12)
297
- urls.first.should eq 'http://www.sample.com/index.html'
298
- end
299
- end
300
-
301
- describe ContentUrls::HtmlParser do
302
- it "should execute the sample code for rewrite_each_url method" do
303
- #output = ''
304
- html = '<html><a href="index.htm">Click me</a></html>'
305
- html = ContentUrls::HtmlParser.rewrite_each_url(html) {|url| 'index.php'}
306
- #output += "Rewritten: #{html}" + "\n"
307
- #output.should eq %Q{Rewritten: <html><a href="index.php">Click me</a></html>\n}
308
- ContentUrls::HtmlParser.urls(html).first.should eq 'index.php' # Nokogiri rewrites HTML, instead check rewritten URL
309
- end
310
- it "should execute sample code for urls method" do
311
- output = ''
312
- html = '<html><a href="index.htm">Click me</a></html>'
313
- ContentUrls::HtmlParser.urls(html).each do |url|
314
- output += "Found URL: #{url}" + "\n"
315
- end
316
- output.should eq %Q{Found URL: index.htm\n}
317
- end
318
- end
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe ContentUrls::HtmlParser do
4
+ it "should return no URLs when given no content" do
5
+ ContentUrls::HtmlParser.urls('').should eq []
6
+ end
7
+ it "should return no URLs when given garbage content" do
8
+ ContentUrls::HtmlParser.urls('j;alksdjfkladsjflkajdfaksdjfsdj kladjsf lkfjalkdfj lkajdf9458094djjf').should eq []
9
+ end
10
+ end
11
+
12
+ describe ContentUrls::HtmlParser do
13
+ it "should return the URLs in the content" do
14
+ ContentUrls::HtmlParser.urls("<a href='index.html").first.should eq 'index.html'
15
+ end
16
+ end
17
+
18
+ describe ContentUrls::HtmlParser do
19
+ it "should parse HTML Sample 1 and return all a links" do
20
+
21
+ html_sample_1 =<<SAMPLE_1
22
+ <html>
23
+ <head>
24
+ <title>HTML Sample 1</title>
25
+ </head>
26
+ <body>
27
+ <h1>HTML Sample 1</h1>
28
+ <a href="a-href-link-1.html"></a>
29
+ <a href="http://www.example.com/1/2/3/a-href-link-2.html"></a>
30
+ <a href="/folder/a-href-link-3.html?a=1"></a>
31
+ </body>
32
+ </html>
33
+ SAMPLE_1
34
+
35
+ urls = ContentUrls::HtmlParser.urls(html_sample_1)
36
+ urls.include?('a-href-link-1.html').should eq true
37
+ urls.include?('http://www.example.com/1/2/3/a-href-link-2.html').should eq true
38
+ urls.include?('/folder/a-href-link-3.html?a=1').should eq true
39
+ end
40
+ end
41
+
42
+ describe ContentUrls::HtmlParser do
43
+ it "should parse HTML Sample 1 and rewrite all a links" do
44
+
45
+ html_sample_1 =<<SAMPLE_1
46
+ <html>
47
+ <head>
48
+ <title>HTML Sample 1</title>
49
+ </head>
50
+ <body>
51
+ <h1>HTML Sample 1</h1>
52
+ <a href="a-href-link-1.html"></a>
53
+ <a href="http://www.example.com/1/2/3/a-href-link-2.html"></a>
54
+ <a href="/folder/a-href-link-3.html?a=1"></a>
55
+ </body>
56
+ </html>
57
+ SAMPLE_1
58
+
59
+ content = ContentUrls::HtmlParser.rewrite_each_url(html_sample_1) do |url|
60
+ url = URI.parse url
61
+ url.path = url.path.sub(/\.html\b/, '.php')
62
+ url
63
+ end
64
+ urls = ContentUrls::HtmlParser.urls(content)
65
+ urls.include?('a-href-link-1.php').should eq true
66
+ urls.include?('http://www.example.com/1/2/3/a-href-link-2.php').should eq true
67
+ urls.include?('/folder/a-href-link-3.php?a=1').should eq true
68
+ end
69
+ end
70
+
71
+ describe ContentUrls::HtmlParser do
72
+ it "should parse HTML Sample 2 and return all 'area href' URLs" do
73
+
74
+ html_sample_2 =<<SAMPLE_2
75
+ <html>
76
+ <head>
77
+ <title>HTML Sample 2</title>
78
+ </head>
79
+ <body>
80
+ <h1>HTML Sample 2</h1>
81
+ <img src="sample.gif" width="200" height="200" alt="Click somewhere" usemap="#sample-map">
82
+ <map name="sample-map">
83
+ <area shape="rect" coords="0,0,100,100" href="area-href-link-1.html" alt="link 1">
84
+ <area shape="circle" coords="150,150,2" href="http://www.example.com/1/2/3/area-href-link-2.html" alt="link 2">
85
+ <area shape="circle" coords="100,180,1" href="/folder/area-href-link-3.html?a=1" alt="link 3">
86
+ </map>
87
+ </body>
88
+ </html>
89
+ SAMPLE_2
90
+
91
+ urls = ContentUrls::HtmlParser.urls(html_sample_2)
92
+ urls.include?('area-href-link-1.html').should eq true
93
+ urls.include?('http://www.example.com/1/2/3/area-href-link-2.html').should eq true
94
+ urls.include?('/folder/area-href-link-3.html?a=1').should eq true
95
+ end
96
+ end
97
+
98
+ describe ContentUrls::HtmlParser do
99
+ it "should parse HTML Sample 3 and return 'body background' URL" do
100
+
101
+ html_sample_3 =<<SAMPLE_3
102
+ <html>
103
+ <head>
104
+ <title>HTML Sample 3</title>
105
+ </head>
106
+ <body background="/images/background.png">
107
+ <h1>HTML Sample 3</h1>
108
+ </body>
109
+ </html>
110
+ SAMPLE_3
111
+
112
+ urls = ContentUrls::HtmlParser.urls(html_sample_3)
113
+ urls.first.should eq '/images/background.png'
114
+ end
115
+ end
116
+
117
+ describe ContentUrls::HtmlParser do
118
+ it "should parse HTML Sample 4 and return 'embed src' URL" do
119
+
120
+ html_sample_4 =<<SAMPLE_4
121
+ <html>
122
+ <head>
123
+ <title>HTML Sample 4</title>
124
+ </head>
125
+ <body>
126
+ <h1>HTML Sample 4</h1>
127
+ <embed src="sample.swf" />
128
+ </body>
129
+ </html>
130
+ SAMPLE_4
131
+
132
+ urls = ContentUrls::HtmlParser.urls(html_sample_4)
133
+ urls.first.should eq 'sample.swf'
134
+ end
135
+ end
136
+
137
+ describe ContentUrls::HtmlParser do
138
+ it "should parse HTML Sample 5 and return 'img src' URL" do
139
+
140
+ html_sample_5 =<<SAMPLE_5
141
+ <html>
142
+ <head>
143
+ <title>HTML Sample 5</title>
144
+ </head>
145
+ <body>
146
+ <h1>HTML Sample 5</h1>
147
+ <img src="sample.gif">
148
+ </body>
149
+ </html>
150
+ SAMPLE_5
151
+
152
+ urls = ContentUrls::HtmlParser.urls(html_sample_5)
153
+ urls.first.should eq 'sample.gif'
154
+ end
155
+ end
156
+
157
+ describe ContentUrls::HtmlParser do
158
+ it "should parse HTML Sample 6 and return 'link href' URL" do
159
+
160
+ html_sample_6 =<<SAMPLE_6
161
+ <html>
162
+ <head>
163
+ <title>HTML Sample 6</title>
164
+ <link href="/index.php" REL="index">
165
+ </head>
166
+ <body>
167
+ <h1>HTML Sample 6</h1>
168
+ </body>
169
+ </html>
170
+ SAMPLE_6
171
+
172
+ urls = ContentUrls::HtmlParser.urls(html_sample_6)
173
+ urls.first.should eq '/index.php'
174
+ end
175
+ end
176
+
177
+ describe ContentUrls::HtmlParser do
178
+ it "should parse HTML Sample 7 and return 'object data' URL" do
179
+
180
+ html_sample_7 =<<SAMPLE_7
181
+ <html>
182
+ <head>
183
+ <title>HTML Sample 7</title>
184
+ </head>
185
+ <body>
186
+ <h1>HTML Sample 7</h1>
187
+ <object width="400" height="400" data="/stuff/example.swf"></object>
188
+ </body>
189
+ </html>
190
+ SAMPLE_7
191
+
192
+ urls = ContentUrls::HtmlParser.urls(html_sample_7)
193
+ urls.first.should eq '/stuff/example.swf'
194
+ end
195
+ end
196
+
197
+ describe ContentUrls::HtmlParser do
198
+ it "should parse HTML Sample 8 and return 'script src' URL" do
199
+
200
+ html_sample_8 =<<SAMPLE_8
201
+ <html>
202
+ <head>
203
+ <title>HTML Sample 8</title>
204
+ </head>
205
+ <body>
206
+ <h1>HTML Sample 8</h1>
207
+ <script language="javascript" src="../scripts/go.js"></script>
208
+ </body>
209
+ </html>
210
+ SAMPLE_8
211
+
212
+ urls = ContentUrls::HtmlParser.urls(html_sample_8)
213
+ urls.first.should eq '../scripts/go.js'
214
+ end
215
+ end
216
+
217
+ describe ContentUrls::HtmlParser do
218
+ it "should parse HTML Sample 9 and return 'meta content' URL" do
219
+
220
+ html_sample_9 =<<SAMPLE_9
221
+ <html>
222
+ <head>
223
+ <title>HTML Sample 9</title>
224
+ <meta http-equiv="refresh" content="5;URL='http://example.com/'">
225
+ </head>
226
+ <body>
227
+ <h1>HTML Sample 9</h1>
228
+ </body>
229
+ </html>
230
+ SAMPLE_9
231
+
232
+ urls = ContentUrls::HtmlParser.urls(html_sample_9)
233
+ urls.first.should eq 'http://example.com/'
234
+ end
235
+ end
236
+
237
+ describe ContentUrls::HtmlParser do
238
+ it "should parse HTML Sample 10 and return URLs found within 'style' attributes" do
239
+
240
+ html_sample_10 =<<SAMPLE_10
241
+ <html>
242
+ <head>
243
+ <title>HTML Sample 10</title>
244
+ </head>
245
+ <body style="background-image:url('background.jpg');">
246
+ <h1>HTML Sample 10</h1>
247
+ </body>
248
+ </html>
249
+ SAMPLE_10
250
+
251
+ urls = ContentUrls::HtmlParser.urls(html_sample_10)
252
+ urls.first.should eq 'background.jpg'
253
+ end
254
+ end
255
+
256
+ describe ContentUrls::HtmlParser do
257
+ it "should parse HTML Sample 11 and return URLs found within 'style' tags" do
258
+
259
+ html_sample_11 =<<SAMPLE_11
260
+ <html>
261
+ <head>
262
+ <title>HTML Sample 11</title>
263
+ <style type="text/css">
264
+ body {background-image:url('/image/background.jpg');}
265
+ </style>
266
+ </head>
267
+ <body>
268
+ <h1>HTML Sample 11</h1>
269
+ </body>
270
+ </html>
271
+ SAMPLE_11
272
+
273
+ urls = ContentUrls::HtmlParser.urls(html_sample_11)
274
+ urls.first.should eq '/image/background.jpg'
275
+ end
276
+ end
277
+
278
+ describe ContentUrls::HtmlParser do
279
+ it "should parse HTML Sample 12 and return URLs found within 'script' tags" do
280
+
281
+ html_sample_12 =<<SAMPLE_12
282
+ <html>
283
+ <head>
284
+ <title>HTML Sample 12</title>
285
+ <script type="text/javascript">
286
+ var link="http://www.sample.com/index.html"
287
+ // ...
288
+ </script>
289
+ </head>
290
+ <body>
291
+ <h1>HTML Sample 12</h1>
292
+ </body>
293
+ </html>
294
+ SAMPLE_12
295
+
296
+ urls = ContentUrls::HtmlParser.urls(html_sample_12)
297
+ urls.first.should eq 'http://www.sample.com/index.html'
298
+ end
299
+ end
300
+
301
describe ContentUrls::HtmlParser do
  # Checks that the +src+ attribute of a +frame+ element is reported by HtmlParser.urls.
  it "should parse HTML Sample 13 and return 'frame src' URL" do

    # Non-squiggly heredoc: the terminator must stay at column 0.
    html_sample_13 =<<SAMPLE_13
<html>
<head>
<title>HTML Sample 13</title>
</head>
<body>
<h1>HTML Sample 13</h1>
<frame src='/info.html'>
</body>
</html>
SAMPLE_13

    urls = ContentUrls::HtmlParser.urls(html_sample_13)
    urls.first.should eq '/info.html'
  end
end
320
+
321
describe ContentUrls::HtmlParser do
  # Checks that HtmlParser.base returns the +href+ of the document's +base+ element.
  it "should parse the HTML and return the 'base' URL" do

    # Non-squiggly heredoc: the terminator must stay at column 0.
    html_base_sample =<<BASE_SAMPLE
<html>
<head>
<base href='/en/'>
<title>HTML base Sample</title>
</head>
<body>
<h1>HTML base Sample</h1>
</body>
</html>
BASE_SAMPLE

    url = ContentUrls::HtmlParser.base(html_base_sample)
    url.should eq '/en/'
  end
end
340
+
341
+ describe ContentUrls::HtmlParser do
342
+ it "should execute the sample code for rewrite_each_url method" do
343
+ #output = ''
344
+ html = '<html><a href="index.htm">Click me</a></html>'
345
+ html = ContentUrls::HtmlParser.rewrite_each_url(html) {|url| 'index.php'}
346
+ #output += "Rewritten: #{html}" + "\n"
347
+ #output.should eq %Q{Rewritten: <html><a href="index.php">Click me</a></html>\n}
348
+ ContentUrls::HtmlParser.urls(html).first.should eq 'index.php' # Nokogiri rewrites HTML, instead check rewritten URL
349
+ end
350
+ it "should execute sample code for urls method" do
351
+ output = ''
352
+ html = '<html><a href="index.htm">Click me</a></html>'
353
+ ContentUrls::HtmlParser.urls(html).each do |url|
354
+ output += "Found URL: #{url}" + "\n"
355
+ end
356
+ output.should eq %Q{Found URL: index.htm\n}
357
+ end
358
+ end