metainspector 1.10.0 → 1.10.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +1 -1
- data/lib/meta_inspector/scraper.rb +9 -13
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/international.response +8 -0
- data/spec/metainspector_spec.rb +46 -18
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -34,7 +34,7 @@ You can set a different timeout with a second parameter, like this:
|
|
34
34
|
|
35
35
|
page = MetaInspector.new('markupvalidator.com', :timeout => 5) # this would wait just 5 seconds to timeout
|
36
36
|
|
37
|
-
|
37
|
+
MetaInspector will try to parse all URLs by default. If you want to parse only those URLs that have text/html as content-type you can specify it like this:
|
38
38
|
|
39
39
|
page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
|
40
40
|
|
data/lib/meta_inspector/scraper.rb
CHANGED
@@ -16,6 +16,7 @@ module MetaInspector
|
|
16
16
|
# => timeout: defaults to 20 seconds
|
17
17
|
# => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
|
18
18
|
def initialize(url, options = {})
|
19
|
+
url = encode_url(url)
|
19
20
|
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
20
21
|
@scheme = URI.parse(@url).scheme
|
21
22
|
@host = URI.parse(@url).host
|
@@ -53,21 +54,11 @@ module MetaInspector
|
|
53
54
|
@data.external_links ||= links.select {|link| URI.parse(link).host != @host }
|
54
55
|
end
|
55
56
|
|
56
|
-
def absolute_links
|
57
|
-
warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
|
58
|
-
links
|
59
|
-
end
|
60
|
-
|
61
57
|
# Images found on the page, as absolute URLs
|
62
58
|
def images
|
63
59
|
@data.images ||= parsed_images.map{ |i| absolutify_url(i) }
|
64
60
|
end
|
65
61
|
|
66
|
-
def absolute_images
|
67
|
-
warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
|
68
|
-
images
|
69
|
-
end
|
70
|
-
|
71
62
|
# Returns the parsed document meta rss links
|
72
63
|
def feed
|
73
64
|
@data.feed ||= parsed_document.xpath("//link").select{ |link|
|
@@ -112,7 +103,7 @@ module MetaInspector
|
|
112
103
|
|
113
104
|
# Returns the original, unparsed document
|
114
105
|
def document
|
115
|
-
@document ||= Timeout::timeout(@timeout) {
|
106
|
+
@document ||= Timeout::timeout(@timeout) {
|
116
107
|
req = open(@url)
|
117
108
|
@content_type = @data.content_type = req.content_type
|
118
109
|
|
@@ -186,13 +177,18 @@ module MetaInspector
|
|
186
177
|
@errors << error
|
187
178
|
end
|
188
179
|
|
180
|
+
# Encode url to deal with international characters
|
181
|
+
def encode_url(url)
|
182
|
+
URI.encode(url).to_s.gsub("%23", "#")
|
183
|
+
end
|
184
|
+
|
189
185
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
190
186
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
191
187
|
def absolutify_url(url)
|
192
188
|
if url =~ /^\w*\:/i
|
193
|
-
url
|
189
|
+
encode_url(url)
|
194
190
|
else
|
195
|
-
URI.parse(@root_url).merge(
|
191
|
+
URI.parse(@root_url).merge(encode_url(url)).to_s
|
196
192
|
end
|
197
193
|
end
|
198
194
|
|
data/spec/fixtures/international.response
CHANGED
@@ -17,10 +17,18 @@ Cache-control: private
|
|
17
17
|
<title>International chars</title>
|
18
18
|
</head>
|
19
19
|
<body>
|
20
|
+
<h1>Internal links:</h1>
|
20
21
|
<a href="/españa.asp">España</a>
|
21
22
|
<a href="/romanée">Romanée</a>
|
22
23
|
<a href="/faqs#camión">FAQs camión</a>
|
23
24
|
<a href="/search?q=camión">Search camión</a>
|
24
25
|
<a href="/search?q=españa#top">Search España at top</a>
|
26
|
+
|
27
|
+
<h1>External links:</h1>
|
28
|
+
<a href="http://example.com/españa.asp">España</a>
|
29
|
+
<a href="http://example.com/romanée">Romanée</a>
|
30
|
+
<a href="http://example.com/faqs#camión">FAQs camión</a>
|
31
|
+
<a href="http://example.com/search?q=camión">Search camión</a>
|
32
|
+
<a href="http://example.com/search?q=españa#top">Search España at top</a>
|
25
33
|
</body>
|
26
34
|
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -29,13 +29,15 @@ describe MetaInspector do
|
|
29
29
|
|
30
30
|
describe 'Initialization' do
|
31
31
|
it 'should accept an URL with a scheme' do
|
32
|
-
|
33
|
-
@m.url.should == 'http://pagerankalert.com'
|
32
|
+
MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
|
34
33
|
end
|
35
34
|
|
36
35
|
it "should use http:// as a default scheme" do
|
37
|
-
|
38
|
-
|
36
|
+
MetaInspector.new('pagerankalert.com').url.should == 'http://pagerankalert.com'
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should accept an URL with international characters" do
|
40
|
+
MetaInspector.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
|
39
41
|
end
|
40
42
|
|
41
43
|
it "should store the scheme" do
|
@@ -51,9 +53,10 @@ describe MetaInspector do
|
|
51
53
|
end
|
52
54
|
|
53
55
|
it "should store the root url" do
|
54
|
-
MetaInspector.new('http://pagerankalert.com').root_url.should
|
55
|
-
MetaInspector.new('https://pagerankalert.com').root_url.should
|
56
|
-
MetaInspector.new('pagerankalert.com').root_url.should
|
56
|
+
MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
|
57
|
+
MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
|
58
|
+
MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
|
59
|
+
MetaInspector.new('http://international.com/olé').root_url.should == 'http://international.com/'
|
57
60
|
end
|
58
61
|
end
|
59
62
|
|
@@ -171,19 +174,44 @@ describe MetaInspector do
|
|
171
174
|
"http://alazan.com/faqs.asp" ]
|
172
175
|
end
|
173
176
|
|
174
|
-
it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
|
175
|
-
m = MetaInspector.new('http://international.com')
|
176
|
-
m.links.should == [ "http://international.com/espa%C3%B1a.asp",
|
177
|
-
"http://international.com/roman%C3%A9e",
|
178
|
-
"http://international.com/faqs#cami%C3%B3n",
|
179
|
-
"http://international.com/search?q=cami%C3%B3n",
|
180
|
-
"http://international.com/search?q=espa%C3%B1a#top"]
|
181
|
-
end
|
182
|
-
|
183
177
|
it "should return empty array if no links found" do
|
184
178
|
m = MetaInspector.new('http://example.com/empty')
|
185
179
|
m.links.should == []
|
186
180
|
end
|
181
|
+
|
182
|
+
describe "links with international characters" do
|
183
|
+
it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
|
184
|
+
m = MetaInspector.new('http://international.com')
|
185
|
+
m.links.should == [ "http://international.com/espa%C3%B1a.asp",
|
186
|
+
"http://international.com/roman%C3%A9e",
|
187
|
+
"http://international.com/faqs#cami%C3%B3n",
|
188
|
+
"http://international.com/search?q=cami%C3%B3n",
|
189
|
+
"http://international.com/search?q=espa%C3%B1a#top",
|
190
|
+
"http://example.com/espa%C3%B1a.asp",
|
191
|
+
"http://example.com/roman%C3%A9e",
|
192
|
+
"http://example.com/faqs#cami%C3%B3n",
|
193
|
+
"http://example.com/search?q=cami%C3%B3n",
|
194
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
195
|
+
end
|
196
|
+
|
197
|
+
it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
|
198
|
+
m = MetaInspector.new('http://international.com')
|
199
|
+
m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
|
200
|
+
"http://international.com/roman%C3%A9e",
|
201
|
+
"http://international.com/faqs#cami%C3%B3n",
|
202
|
+
"http://international.com/search?q=cami%C3%B3n",
|
203
|
+
"http://international.com/search?q=espa%C3%B1a#top"]
|
204
|
+
end
|
205
|
+
|
206
|
+
it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
|
207
|
+
m = MetaInspector.new('http://international.com')
|
208
|
+
m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
|
209
|
+
"http://example.com/roman%C3%A9e",
|
210
|
+
"http://example.com/faqs#cami%C3%B3n",
|
211
|
+
"http://example.com/search?q=cami%C3%B3n",
|
212
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
213
|
+
end
|
214
|
+
end
|
187
215
|
end
|
188
216
|
|
189
217
|
describe 'Non-HTTP links' do
|
@@ -342,7 +370,7 @@ describe MetaInspector do
|
|
342
370
|
|
343
371
|
it "should handle errors when content is image/jpeg and html_content_type_only is true" do
|
344
372
|
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
|
345
|
-
|
373
|
+
|
346
374
|
expect {
|
347
375
|
title = image_url.title
|
348
376
|
}.to change { image_url.errors.size }
|
@@ -352,7 +380,7 @@ describe MetaInspector do
|
|
352
380
|
|
353
381
|
it "should handle errors when content is not text/html and html_content_type_only is true" do
|
354
382
|
tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
355
|
-
|
383
|
+
|
356
384
|
expect {
|
357
385
|
title = tar_url.title
|
358
386
|
}.to change { tar_url.errors.size }
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 61
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 10
|
9
|
-
- 0
|
10
|
-
version: 1.10.0
|
9
|
+
- 1
|
10
|
+
version: 1.10.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-11-
|
18
|
+
date: 2012-11-16 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|