metainspector 1.10.0 → 1.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -34,7 +34,7 @@ You can set a different timeout with a second parameter, like this:
34
34
 
35
35
  page = MetaInspector.new('markupvalidator.com', :timeout => 5) # this would wait just 5 seconds to timeout
36
36
 
37
- Metainspector will try to parse all URLs by default. If you want to parse only those URLs that have text/html as content-type you can specify it like this:
37
+ MetaInspector will try to parse all URLs by default. If you want to parse only those URLs that have text/html as content-type you can specify it like this:
38
38
 
39
39
  page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
40
40
 
@@ -16,6 +16,7 @@ module MetaInspector
16
16
  # => timeout: defaults to 20 seconds
17
17
  # => html_content_only: whether an exception should be raised when the request content-type is not text/html. Defaults to false
18
18
  def initialize(url, options = {})
19
+ url = encode_url(url)
19
20
  @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
20
21
  @scheme = URI.parse(@url).scheme
21
22
  @host = URI.parse(@url).host
@@ -53,21 +54,11 @@ module MetaInspector
53
54
  @data.external_links ||= links.select {|link| URI.parse(link).host != @host }
54
55
  end
55
56
 
56
- def absolute_links
57
- warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
58
- links
59
- end
60
-
61
57
  # Images found on the page, as absolute URLs
62
58
  def images
63
59
  @data.images ||= parsed_images.map{ |i| absolutify_url(i) }
64
60
  end
65
61
 
66
- def absolute_images
67
- warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
68
- images
69
- end
70
-
71
62
  # Returns the parsed document meta rss links
72
63
  def feed
73
64
  @data.feed ||= parsed_document.xpath("//link").select{ |link|
@@ -112,7 +103,7 @@ module MetaInspector
112
103
 
113
104
  # Returns the original, unparsed document
114
105
  def document
115
- @document ||= Timeout::timeout(@timeout) {
106
+ @document ||= Timeout::timeout(@timeout) {
116
107
  req = open(@url)
117
108
  @content_type = @data.content_type = req.content_type
118
109
 
@@ -186,13 +177,18 @@ module MetaInspector
186
177
  @errors << error
187
178
  end
188
179
 
180
+ # Encode url to deal with international characters
181
+ def encode_url(url)
182
+ URI.encode(url).to_s.gsub("%23", "#")
183
+ end
184
+
189
185
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
190
186
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
191
187
  def absolutify_url(url)
192
188
  if url =~ /^\w*\:/i
193
- url
189
+ encode_url(url)
194
190
  else
195
- URI.parse(@root_url).merge(URI.encode(url)).to_s.gsub("%23", "#")
191
+ URI.parse(@root_url).merge(encode_url(url)).to_s
196
192
  end
197
193
  end
198
194
 
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.10.0"
4
+ VERSION = "1.10.1"
5
5
  end
@@ -17,10 +17,18 @@ Cache-control: private
17
17
  <title>International chars</title>
18
18
  </head>
19
19
  <body>
20
+ <h1>Internal links:</h1>
20
21
  <a href="/españa.asp">España</a>
21
22
  <a href="/romanée">Romanée</a>
22
23
  <a href="/faqs#camión">FAQs camión</a>
23
24
  <a href="/search?q=camión">Search camión</a>
24
25
  <a href="/search?q=españa#top">Search España at top</a>
26
+
27
+ <h1>External links:</h1>
28
+ <a href="http://example.com/españa.asp">España</a>
29
+ <a href="http://example.com/romanée">Romanée</a>
30
+ <a href="http://example.com/faqs#camión">FAQs camión</a>
31
+ <a href="http://example.com/search?q=camión">Search camión</a>
32
+ <a href="http://example.com/search?q=españa#top">Search España at top</a>
25
33
  </body>
26
34
  </html>
@@ -29,13 +29,15 @@ describe MetaInspector do
29
29
 
30
30
  describe 'Initialization' do
31
31
  it 'should accept an URL with a scheme' do
32
- @m = MetaInspector.new('http://pagerankalert.com')
33
- @m.url.should == 'http://pagerankalert.com'
32
+ MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
34
33
  end
35
34
 
36
35
  it "should use http:// as a default scheme" do
37
- @m = MetaInspector.new('pagerankalert.com')
38
- @m.url.should == 'http://pagerankalert.com'
36
+ MetaInspector.new('pagerankalert.com').url.should == 'http://pagerankalert.com'
37
+ end
38
+
39
+ it "should accept an URL with international characters" do
40
+ MetaInspector.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
39
41
  end
40
42
 
41
43
  it "should store the scheme" do
@@ -51,9 +53,10 @@ describe MetaInspector do
51
53
  end
52
54
 
53
55
  it "should store the root url" do
54
- MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
55
- MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
56
- MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
56
+ MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
57
+ MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
58
+ MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
59
+ MetaInspector.new('http://international.com/olé').root_url.should == 'http://international.com/'
57
60
  end
58
61
  end
59
62
 
@@ -171,19 +174,44 @@ describe MetaInspector do
171
174
  "http://alazan.com/faqs.asp" ]
172
175
  end
173
176
 
174
- it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
175
- m = MetaInspector.new('http://international.com')
176
- m.links.should == [ "http://international.com/espa%C3%B1a.asp",
177
- "http://international.com/roman%C3%A9e",
178
- "http://international.com/faqs#cami%C3%B3n",
179
- "http://international.com/search?q=cami%C3%B3n",
180
- "http://international.com/search?q=espa%C3%B1a#top"]
181
- end
182
-
183
177
  it "should return empty array if no links found" do
184
178
  m = MetaInspector.new('http://example.com/empty')
185
179
  m.links.should == []
186
180
  end
181
+
182
+ describe "links with international characters" do
183
+ it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
184
+ m = MetaInspector.new('http://international.com')
185
+ m.links.should == [ "http://international.com/espa%C3%B1a.asp",
186
+ "http://international.com/roman%C3%A9e",
187
+ "http://international.com/faqs#cami%C3%B3n",
188
+ "http://international.com/search?q=cami%C3%B3n",
189
+ "http://international.com/search?q=espa%C3%B1a#top",
190
+ "http://example.com/espa%C3%B1a.asp",
191
+ "http://example.com/roman%C3%A9e",
192
+ "http://example.com/faqs#cami%C3%B3n",
193
+ "http://example.com/search?q=cami%C3%B3n",
194
+ "http://example.com/search?q=espa%C3%B1a#top"]
195
+ end
196
+
197
+ it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
198
+ m = MetaInspector.new('http://international.com')
199
+ m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
200
+ "http://international.com/roman%C3%A9e",
201
+ "http://international.com/faqs#cami%C3%B3n",
202
+ "http://international.com/search?q=cami%C3%B3n",
203
+ "http://international.com/search?q=espa%C3%B1a#top"]
204
+ end
205
+
206
+ it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
207
+ m = MetaInspector.new('http://international.com')
208
+ m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
209
+ "http://example.com/roman%C3%A9e",
210
+ "http://example.com/faqs#cami%C3%B3n",
211
+ "http://example.com/search?q=cami%C3%B3n",
212
+ "http://example.com/search?q=espa%C3%B1a#top"]
213
+ end
214
+ end
187
215
  end
188
216
 
189
217
  describe 'Non-HTTP links' do
@@ -342,7 +370,7 @@ describe MetaInspector do
342
370
 
343
371
  it "should handle errors when content is image/jpeg and html_content_type_only is true" do
344
372
  image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
345
-
373
+
346
374
  expect {
347
375
  title = image_url.title
348
376
  }.to change { image_url.errors.size }
@@ -352,7 +380,7 @@ describe MetaInspector do
352
380
 
353
381
  it "should handle errors when content is not text/html and html_content_type_only is true" do
354
382
  tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
355
-
383
+
356
384
  expect {
357
385
  title = tar_url.title
358
386
  }.to change { tar_url.errors.size }
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 63
4
+ hash: 61
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 10
9
- - 0
10
- version: 1.10.0
9
+ - 1
10
+ version: 1.10.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-11-15 00:00:00 Z
18
+ date: 2012-11-16 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement