metainspector 1.10.0 → 1.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -1
- data/lib/meta_inspector/scraper.rb +9 -13
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/international.response +8 -0
- data/spec/metainspector_spec.rb +46 -18
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -34,7 +34,7 @@ You can set a different timeout with a second parameter, like this:
|
|
34
34
|
|
35
35
|
page = MetaInspector.new('markupvalidator.com', :timeout => 5) # this would wait just 5 seconds to timeout
|
36
36
|
|
37
|
-
|
37
|
+
MetaInspector will try to parse all URLs by default. If you want to parse only those URLs that have text/html as content-type you can specify it like this:
|
38
38
|
|
39
39
|
page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
|
40
40
|
|
@@ -16,6 +16,7 @@ module MetaInspector
|
|
16
16
|
# => timeout: defaults to 20 seconds
|
17
17
|
# => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
|
18
18
|
def initialize(url, options = {})
|
19
|
+
url = encode_url(url)
|
19
20
|
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
20
21
|
@scheme = URI.parse(@url).scheme
|
21
22
|
@host = URI.parse(@url).host
|
@@ -53,21 +54,11 @@ module MetaInspector
|
|
53
54
|
@data.external_links ||= links.select {|link| URI.parse(link).host != @host }
|
54
55
|
end
|
55
56
|
|
56
|
-
def absolute_links
|
57
|
-
warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
|
58
|
-
links
|
59
|
-
end
|
60
|
-
|
61
57
|
# Images found on the page, as absolute URLs
|
62
58
|
def images
|
63
59
|
@data.images ||= parsed_images.map{ |i| absolutify_url(i) }
|
64
60
|
end
|
65
61
|
|
66
|
-
def absolute_images
|
67
|
-
warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
|
68
|
-
images
|
69
|
-
end
|
70
|
-
|
71
62
|
# Returns the parsed document meta rss links
|
72
63
|
def feed
|
73
64
|
@data.feed ||= parsed_document.xpath("//link").select{ |link|
|
@@ -112,7 +103,7 @@ module MetaInspector
|
|
112
103
|
|
113
104
|
# Returns the original, unparsed document
|
114
105
|
def document
|
115
|
-
@document ||= Timeout::timeout(@timeout) {
|
106
|
+
@document ||= Timeout::timeout(@timeout) {
|
116
107
|
req = open(@url)
|
117
108
|
@content_type = @data.content_type = req.content_type
|
118
109
|
|
@@ -186,13 +177,18 @@ module MetaInspector
|
|
186
177
|
@errors << error
|
187
178
|
end
|
188
179
|
|
180
|
+
# Encode url to deal with international characters
|
181
|
+
def encode_url(url)
|
182
|
+
URI.encode(url).to_s.gsub("%23", "#")
|
183
|
+
end
|
184
|
+
|
189
185
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
190
186
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
191
187
|
def absolutify_url(url)
|
192
188
|
if url =~ /^\w*\:/i
|
193
|
-
url
|
189
|
+
encode_url(url)
|
194
190
|
else
|
195
|
-
URI.parse(@root_url).merge(
|
191
|
+
URI.parse(@root_url).merge(encode_url(url)).to_s
|
196
192
|
end
|
197
193
|
end
|
198
194
|
|
@@ -17,10 +17,18 @@ Cache-control: private
|
|
17
17
|
<title>International chars</title>
|
18
18
|
</head>
|
19
19
|
<body>
|
20
|
+
<h1>Internal links:</h1>
|
20
21
|
<a href="/españa.asp">España</a>
|
21
22
|
<a href="/romanée">Romanée</a>
|
22
23
|
<a href="/faqs#camión">FAQs camión</a>
|
23
24
|
<a href="/search?q=camión">Search camión</a>
|
24
25
|
<a href="/search?q=españa#top">Search España at top</a>
|
26
|
+
|
27
|
+
<h1>External links:</h1>
|
28
|
+
<a href="http://example.com/españa.asp">España</a>
|
29
|
+
<a href="http://example.com/romanée">Romanée</a>
|
30
|
+
<a href="http://example.com/faqs#camión">FAQs camión</a>
|
31
|
+
<a href="http://example.com/search?q=camión">Search camión</a>
|
32
|
+
<a href="http://example.com/search?q=españa#top">Search España at top</a>
|
25
33
|
</body>
|
26
34
|
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -29,13 +29,15 @@ describe MetaInspector do
|
|
29
29
|
|
30
30
|
describe 'Initialization' do
|
31
31
|
it 'should accept an URL with a scheme' do
|
32
|
-
|
33
|
-
@m.url.should == 'http://pagerankalert.com'
|
32
|
+
MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
|
34
33
|
end
|
35
34
|
|
36
35
|
it "should use http:// as a default scheme" do
|
37
|
-
|
38
|
-
|
36
|
+
MetaInspector.new('pagerankalert.com').url.should == 'http://pagerankalert.com'
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should accept an URL with international characters" do
|
40
|
+
MetaInspector.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
|
39
41
|
end
|
40
42
|
|
41
43
|
it "should store the scheme" do
|
@@ -51,9 +53,10 @@ describe MetaInspector do
|
|
51
53
|
end
|
52
54
|
|
53
55
|
it "should store the root url" do
|
54
|
-
MetaInspector.new('http://pagerankalert.com').root_url.should
|
55
|
-
MetaInspector.new('https://pagerankalert.com').root_url.should
|
56
|
-
MetaInspector.new('pagerankalert.com').root_url.should
|
56
|
+
MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
|
57
|
+
MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
|
58
|
+
MetaInspector.new('pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
|
59
|
+
MetaInspector.new('http://international.com/olé').root_url.should == 'http://international.com/'
|
57
60
|
end
|
58
61
|
end
|
59
62
|
|
@@ -171,19 +174,44 @@ describe MetaInspector do
|
|
171
174
|
"http://alazan.com/faqs.asp" ]
|
172
175
|
end
|
173
176
|
|
174
|
-
it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
|
175
|
-
m = MetaInspector.new('http://international.com')
|
176
|
-
m.links.should == [ "http://international.com/espa%C3%B1a.asp",
|
177
|
-
"http://international.com/roman%C3%A9e",
|
178
|
-
"http://international.com/faqs#cami%C3%B3n",
|
179
|
-
"http://international.com/search?q=cami%C3%B3n",
|
180
|
-
"http://international.com/search?q=espa%C3%B1a#top"]
|
181
|
-
end
|
182
|
-
|
183
177
|
it "should return empty array if no links found" do
|
184
178
|
m = MetaInspector.new('http://example.com/empty')
|
185
179
|
m.links.should == []
|
186
180
|
end
|
181
|
+
|
182
|
+
describe "links with international characters" do
|
183
|
+
it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
|
184
|
+
m = MetaInspector.new('http://international.com')
|
185
|
+
m.links.should == [ "http://international.com/espa%C3%B1a.asp",
|
186
|
+
"http://international.com/roman%C3%A9e",
|
187
|
+
"http://international.com/faqs#cami%C3%B3n",
|
188
|
+
"http://international.com/search?q=cami%C3%B3n",
|
189
|
+
"http://international.com/search?q=espa%C3%B1a#top",
|
190
|
+
"http://example.com/espa%C3%B1a.asp",
|
191
|
+
"http://example.com/roman%C3%A9e",
|
192
|
+
"http://example.com/faqs#cami%C3%B3n",
|
193
|
+
"http://example.com/search?q=cami%C3%B3n",
|
194
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
195
|
+
end
|
196
|
+
|
197
|
+
it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
|
198
|
+
m = MetaInspector.new('http://international.com')
|
199
|
+
m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
|
200
|
+
"http://international.com/roman%C3%A9e",
|
201
|
+
"http://international.com/faqs#cami%C3%B3n",
|
202
|
+
"http://international.com/search?q=cami%C3%B3n",
|
203
|
+
"http://international.com/search?q=espa%C3%B1a#top"]
|
204
|
+
end
|
205
|
+
|
206
|
+
it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
|
207
|
+
m = MetaInspector.new('http://international.com')
|
208
|
+
m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
|
209
|
+
"http://example.com/roman%C3%A9e",
|
210
|
+
"http://example.com/faqs#cami%C3%B3n",
|
211
|
+
"http://example.com/search?q=cami%C3%B3n",
|
212
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
213
|
+
end
|
214
|
+
end
|
187
215
|
end
|
188
216
|
|
189
217
|
describe 'Non-HTTP links' do
|
@@ -342,7 +370,7 @@ describe MetaInspector do
|
|
342
370
|
|
343
371
|
it "should handle errors when content is image/jpeg and html_content_type_only is true" do
|
344
372
|
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
|
345
|
-
|
373
|
+
|
346
374
|
expect {
|
347
375
|
title = image_url.title
|
348
376
|
}.to change { image_url.errors.size }
|
@@ -352,7 +380,7 @@ describe MetaInspector do
|
|
352
380
|
|
353
381
|
it "should handle errors when content is not text/html and html_content_type_only is true" do
|
354
382
|
tar_url = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
355
|
-
|
383
|
+
|
356
384
|
expect {
|
357
385
|
title = tar_url.title
|
358
386
|
}.to change { tar_url.errors.size }
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 61
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 10
|
9
|
-
- 0
|
10
|
-
version: 1.10.0
|
9
|
+
- 1
|
10
|
+
version: 1.10.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-11-
|
18
|
+
date: 2012-11-16 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|