metainspector 1.10.0 → 1.10.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -34,7 +34,7 @@ You can set a different timeout with a second parameter, like this:
34
34
 
35
35
  page = MetaInspector.new('markupvalidator.com', :timeout => 5) # this would wait just 5 seconds to timeout
36
36
 
37
- Metainspector will try to parse all URLs by default. If you want to parse only those URLs that have text/html as content-type you can specify it like this:
37
+ MetaInspector will try to parse all URLs by default. If you want to parse only those URLs that have text/html as content-type you can specify it like this:
38
38
 
39
39
  page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
40
40
 
@@ -16,6 +16,7 @@ module MetaInspector
16
16
  # => timeout: defaults to 20 seconds
17
17
  # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
18
18
  def initialize(url, options = {})
19
+ url = encode_url(url)
19
20
  @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
20
21
  @scheme = URI.parse(@url).scheme
21
22
  @host = URI.parse(@url).host
@@ -53,21 +54,11 @@ module MetaInspector
53
54
  @data.external_links ||= links.select {|link| URI.parse(link).host != @host }
54
55
  end
55
56
 
56
- def absolute_links
57
- warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
58
- links
59
- end
60
-
61
57
  # Images found on the page, as absolute URLs
62
58
  def images
63
59
  @data.images ||= parsed_images.map{ |i| absolutify_url(i) }
64
60
  end
65
61
 
66
- def absolute_images
67
- warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
68
- images
69
- end
70
-
71
62
  # Returns the parsed document meta rss links
72
63
  def feed
73
64
  @data.feed ||= parsed_document.xpath("//link").select{ |link|
@@ -112,7 +103,7 @@ module MetaInspector
112
103
 
113
104
  # Returns the original, unparsed document
114
105
  def document
115
- @document ||= Timeout::timeout(@timeout) {
106
+ @document ||= Timeout::timeout(@timeout) {
116
107
  req = open(@url)
117
108
  @content_type = @data.content_type = req.content_type
118
109
 
@@ -186,13 +177,18 @@ module MetaInspector
186
177
  @errors << error
187
178
  end
188
179
 
180
+ # Encode url to deal with international characters
181
+ def encode_url(url)
182
+ URI.encode(url).to_s.gsub("%23", "#")
183
+ end
184
+
189
185
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
190
186
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
191
187
  def absolutify_url(url)
192
188
  if url =~ /^\w*\:/i
193
- url
189
+ encode_url(url)
194
190
  else
195
- URI.parse(@root_url).merge(URI.encode(url)).to_s.gsub("%23", "#")
191
+ URI.parse(@root_url).merge(encode_url(url)).to_s
196
192
  end
197
193
  end
198
194
 
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.10.0"
4
+ VERSION = "1.10.1"
5
5
  end
@@ -17,10 +17,18 @@ Cache-control: private
17
17
  <title>International chars</title>
18
18
  </head>
19
19
  <body>
20
+ <h1>Internal links:</h1>
20
21
  <a href="/españa.asp">España</a>
21
22
  <a href="/romanée">Romanée</a>
22
23
  <a href="/faqs#camión">FAQs camión</a>
23
24
  <a href="/search?q=camión">Search camión</a>
24
25
  <a href="/search?q=españa#top">Search España at top</a>
26
+
27
+ <h1>External links:</h1>
28
+ <a href="http://example.com/españa.asp">España</a>
29
+ <a href="http://example.com/romanée">Romanée</a>
30
+ <a href="http://example.com/faqs#camión">FAQs camión</a>
31
+ <a href="http://example.com/search?q=camión">Search camión</a>
32
+ <a href="http://example.com/search?q=españa#top">Search España at top</a>
25
33
  </body>
26
34
  </html>
@@ -29,13 +29,15 @@ describe MetaInspector do
29
29
 
30
30
  describe 'Initialization' do
31
31
  it 'should accept an URL with a scheme' do
32
- @m = MetaInspector.new('http://pagerankalert.com')
33
- @m.url.should == 'http://pagerankalert.com'
32
+ MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
34
33
  end
35
34
 
36
35
  it "should use http:// as a default scheme" do
37
- @m = MetaInspector.new('pagerankalert.com')
38
- @m.url.should == 'http://pagerankalert.com'
36
+ MetaInspector.new('pagerankalert.com').url.should == 'http://pagerankalert.com'
37
+ end
38
+
39
+ it "should accept an URL with international characters" do
40
+ MetaInspector.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
39
41
  end
40
42
 
41
43
  it "should store the scheme" do
@@ -51,9 +53,10 @@ describe MetaInspector do
51
53
  end
52
54
 
53
55
  it "should store the root url" do
54
- MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
55
- MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
56
- MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
56
+ MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
57
+ MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
58
+ MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
59
+ MetaInspector.new('http://international.com/olé').root_url.should == 'http://international.com/'
57
60
  end
58
61
  end
59
62
 
@@ -171,19 +174,44 @@ describe MetaInspector do
171
174
  "http://alazan.com/faqs.asp" ]
172
175
  end
173
176
 
174
- it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
175
- m = MetaInspector.new('http://international.com')
176
- m.links.should == [ "http://international.com/espa%C3%B1a.asp",
177
- "http://international.com/roman%C3%A9e",
178
- "http://international.com/faqs#cami%C3%B3n",
179
- "http://international.com/search?q=cami%C3%B3n",
180
- "http://international.com/search?q=espa%C3%B1a#top"]
181
- end
182
-
183
177
  it "should return empty array if no links found" do
184
178
  m = MetaInspector.new('http://example.com/empty')
185
179
  m.links.should == []
186
180
  end
181
+
182
+ describe "links with international characters" do
183
+ it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
184
+ m = MetaInspector.new('http://international.com')
185
+ m.links.should == [ "http://international.com/espa%C3%B1a.asp",
186
+ "http://international.com/roman%C3%A9e",
187
+ "http://international.com/faqs#cami%C3%B3n",
188
+ "http://international.com/search?q=cami%C3%B3n",
189
+ "http://international.com/search?q=espa%C3%B1a#top",
190
+ "http://example.com/espa%C3%B1a.asp",
191
+ "http://example.com/roman%C3%A9e",
192
+ "http://example.com/faqs#cami%C3%B3n",
193
+ "http://example.com/search?q=cami%C3%B3n",
194
+ "http://example.com/search?q=espa%C3%B1a#top"]
195
+ end
196
+
197
+ it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
198
+ m = MetaInspector.new('http://international.com')
199
+ m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
200
+ "http://international.com/roman%C3%A9e",
201
+ "http://international.com/faqs#cami%C3%B3n",
202
+ "http://international.com/search?q=cami%C3%B3n",
203
+ "http://international.com/search?q=espa%C3%B1a#top"]
204
+ end
205
+
206
+ it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
207
+ m = MetaInspector.new('http://international.com')
208
+ m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
209
+ "http://example.com/roman%C3%A9e",
210
+ "http://example.com/faqs#cami%C3%B3n",
211
+ "http://example.com/search?q=cami%C3%B3n",
212
+ "http://example.com/search?q=espa%C3%B1a#top"]
213
+ end
214
+ end
187
215
  end
188
216
 
189
217
  describe 'Non-HTTP links' do
@@ -342,7 +370,7 @@ describe MetaInspector do
342
370
 
343
371
  it "should handle errors when content is image/jpeg and html_content_type_only is true" do
344
372
  image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
345
-
373
+
346
374
  expect {
347
375
  title = image_url.title
348
376
  }.to change { image_url.errors.size }
@@ -352,7 +380,7 @@ describe MetaInspector do
352
380
 
353
381
  it "should handle errors when content is not text/html and html_content_type_only is true" do
354
382
  tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
355
-
383
+
356
384
  expect {
357
385
  title = tar_url.title
358
386
  }.to change { tar_url.errors.size }
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 63
4
+ hash: 61
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 10
9
- - 0
10
- version: 1.10.0
9
+ - 1
10
+ version: 1.10.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-11-15 00:00:00 Z
18
+ date: 2012-11-16 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement