metainspector 1.10.2 → 1.11.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +2 -2
- data/lib/meta_inspector/scraper.rb +98 -67
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/metainspector_spec.rb +17 -33
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -47,7 +47,7 @@ Then you can see the scraped data like this:
|
|
47
47
|
page.title # title of the page, as string
|
48
48
|
page.links # array of strings, with every link found on the page as an absolute URL
|
49
49
|
page.internal_links # array of strings, with every internal link found on the page as an absolute URL
|
50
|
-
page.
|
50
|
+
page.external_links # array of strings, with every external link found on the page as an absolute URL
|
51
51
|
page.meta_description # meta description, as string
|
52
52
|
page.description # returns the meta description, or the first long paragraph if no meta description is found
|
53
53
|
page.meta_keywords # meta keywords, as string
|
@@ -85,7 +85,7 @@ The full scraped document if accessible from:
|
|
85
85
|
|
86
86
|
You can check if the page has been successfully parsed with:
|
87
87
|
|
88
|
-
page.
|
88
|
+
page.ok? # Will return true if everything looks OK
|
89
89
|
|
90
90
|
In case there have been any errors, you can check them with:
|
91
91
|
|
@@ -8,90 +8,93 @@ require 'timeout'
|
|
8
8
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
9
9
|
module MetaInspector
|
10
10
|
class Scraper
|
11
|
-
attr_reader :url, :scheme, :host, :root_url, :errors, :content_type
|
11
|
+
attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
|
12
12
|
|
13
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
|
-
# If no scheme given, set it to http:// by default
|
15
14
|
# Options:
|
16
15
|
# => timeout: defaults to 20 seconds
|
17
16
|
# => html_content_only: whether an exception should be raised if the response content-type is not text/html. Defaults to false
|
18
17
|
def initialize(url, options = {})
|
19
|
-
url
|
20
|
-
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
18
|
+
@url = with_default_scheme(encode_url(url))
|
21
19
|
@scheme = URI.parse(@url).scheme
|
22
20
|
@host = URI.parse(@url).host
|
23
21
|
@root_url = "#{@scheme}://#{@host}/"
|
24
22
|
@timeout = options[:timeout] || 20
|
25
|
-
@data = Hashie::Rash.new
|
23
|
+
@data = Hashie::Rash.new
|
26
24
|
@errors = []
|
27
25
|
@html_content_only = options[:html_content_only] || false
|
28
26
|
end
|
29
27
|
|
30
28
|
# Returns the parsed document title, from the content of the <title> tag.
|
31
|
-
# This is not the same as the
|
29
|
+
# This is not the same as the meta_title tag
|
32
30
|
def title
|
33
|
-
@
|
31
|
+
@title ||= parsed_document.css('title').inner_html.gsub(/\t|\n|\r/, '') rescue nil
|
34
32
|
end
|
35
33
|
|
36
34
|
# A description getter that first checks for a meta description and if not present will
|
37
|
-
# guess by looking
|
35
|
+
# guess by looking at the first paragraph with more than 120 characters
|
38
36
|
def description
|
39
37
|
meta_description.nil? ? secondary_description : meta_description
|
40
38
|
end
|
41
39
|
|
42
40
|
# Links found on the page, as absolute URLs
|
43
41
|
def links
|
44
|
-
@
|
42
|
+
@links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
|
45
43
|
end
|
46
44
|
|
47
45
|
# Internal links found on the page, as absolute URLs
|
48
46
|
def internal_links
|
49
|
-
@
|
47
|
+
@internal_links ||= links.select {|link| URI.parse(link).host == host }
|
50
48
|
end
|
51
49
|
|
52
50
|
# External links found on the page, as absolute URLs
|
53
51
|
def external_links
|
54
|
-
@
|
52
|
+
@external_links ||= links.select {|link| URI.parse(link).host != host }
|
55
53
|
end
|
56
54
|
|
57
55
|
# Images found on the page, as absolute URLs
|
58
56
|
def images
|
59
|
-
@
|
57
|
+
@images ||= parsed_images.map{ |i| absolutify_url(i) }
|
58
|
+
end
|
59
|
+
|
60
|
+
# Returns the parsed image from Facebook's open graph property tags
|
61
|
+
# Most major websites now define this property, and it is usually very relevant
|
62
|
+
# See doc at http://developers.facebook.com/docs/opengraph/
|
63
|
+
def image
|
64
|
+
meta_og_image
|
60
65
|
end
|
61
66
|
|
62
67
|
# Returns the parsed document meta rss links
|
63
68
|
def feed
|
64
|
-
@
|
69
|
+
@feed ||= parsed_document.xpath("//link").select{ |link|
|
65
70
|
link.attributes["type"] && link.attributes["type"].value =~ /(atom|rss)/
|
66
71
|
}.map { |link|
|
67
72
|
absolutify_url(link.attributes["href"].value)
|
68
73
|
}.first rescue nil
|
69
74
|
end
|
70
75
|
|
71
|
-
# Returns the parsed image from Facebook's open graph property tags
|
72
|
-
# Most major websites now define this property, and it is usually very relevant
|
73
|
-
# See doc at http://developers.facebook.com/docs/opengraph/
|
74
|
-
def image
|
75
|
-
meta_og_image
|
76
|
-
end
|
77
|
-
|
78
76
|
# Returns the charset from the meta tags, looking for it in the following order:
|
79
77
|
# <meta charset='utf-8' />
|
80
78
|
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
81
79
|
def charset
|
82
|
-
@
|
80
|
+
@charset ||= (charset_from_meta_charset || charset_from_content_type)
|
83
81
|
end
|
84
82
|
|
85
83
|
# Returns all parsed data as a nested Hash
|
86
84
|
def to_hash
|
87
|
-
|
88
|
-
image;images;feed;links;charset;title;meta_keywords;internal_links;external_links
|
89
|
-
@data.to_hash
|
90
|
-
end
|
85
|
+
scrape_meta_data
|
91
86
|
|
92
|
-
|
93
|
-
|
94
|
-
|
87
|
+
{
|
88
|
+
'url' => url,
|
89
|
+
'title' => title,
|
90
|
+
'links' => links,
|
91
|
+
'internal_links' => internal_links,
|
92
|
+
'external_links' => external_links,
|
93
|
+
'images' => images,
|
94
|
+
'charset' => charset,
|
95
|
+
'feed' => feed,
|
96
|
+
'content_type' => content_type
|
97
|
+
}.merge @data.to_hash
|
95
98
|
end
|
96
99
|
|
97
100
|
# Returns the whole parsed document
|
@@ -103,24 +106,33 @@ module MetaInspector
|
|
103
106
|
|
104
107
|
# Returns the original, unparsed document
|
105
108
|
def document
|
106
|
-
@document ||=
|
107
|
-
|
108
|
-
|
109
|
+
@document ||= if html_content_only && content_type != "text/html"
|
110
|
+
raise "The url provided contains #{content_type} content instead of text/html content" and nil
|
111
|
+
else
|
112
|
+
request.read
|
113
|
+
end
|
114
|
+
rescue Exception => e
|
115
|
+
add_fatal_error "Scraping exception: #{e.message}"
|
116
|
+
end
|
109
117
|
|
110
|
-
|
111
|
-
|
112
|
-
|
118
|
+
# Returns the content_type of the fetched document
|
119
|
+
def content_type
|
120
|
+
@content_type ||= request.content_type
|
121
|
+
end
|
113
122
|
|
114
|
-
|
115
|
-
|
123
|
+
# Returns true if there are no errors
|
124
|
+
def ok?
|
125
|
+
errors.empty?
|
126
|
+
end
|
116
127
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
rescue Exception => e
|
122
|
-
add_fatal_error "Scraping exception: #{e.message}"
|
128
|
+
##### DEPRECATIONS ####
|
129
|
+
def parsed?
|
130
|
+
warn "the parsed? method has been deprecated, please use ok? instead"
|
131
|
+
!@parsed_document.nil?
|
123
132
|
end
|
133
|
+
##### DEPRECATIONS ####
|
134
|
+
|
135
|
+
private
|
124
136
|
|
125
137
|
# Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
|
126
138
|
# meta name: keywords, description, robots, generator
|
@@ -132,43 +144,57 @@ module MetaInspector
|
|
132
144
|
def method_missing(method_name)
|
133
145
|
if method_name.to_s =~ /^meta_(.*)/
|
134
146
|
key = $1
|
135
|
-
#special treatment for og:
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
unless @data.meta
|
140
|
-
@data.meta!.name!
|
141
|
-
@data.meta!.property!
|
142
|
-
parsed_document.xpath("//meta").each do |element|
|
143
|
-
if element.attributes["content"]
|
144
|
-
if element.attributes["name"]
|
145
|
-
@data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
|
146
|
-
end
|
147
|
-
|
148
|
-
if element.attributes["property"]
|
149
|
-
@data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
end
|
147
|
+
key = "og:#{$1}" if key =~ /^og_(.*)/ # special treatment for og:
|
148
|
+
|
149
|
+
scrape_meta_data
|
150
|
+
|
154
151
|
@data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
|
155
152
|
else
|
156
153
|
super
|
157
154
|
end
|
158
155
|
end
|
159
156
|
|
160
|
-
|
157
|
+
# Makes the request to the server
|
158
|
+
def request
|
159
|
+
Timeout::timeout(timeout) { @request ||= open(url) }
|
160
|
+
|
161
|
+
rescue TimeoutError
|
162
|
+
add_fatal_error 'Timeout!!!'
|
163
|
+
rescue SocketError
|
164
|
+
add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
|
165
|
+
rescue Exception => e
|
166
|
+
add_fatal_error "Scraping exception: #{e.message}"
|
167
|
+
end
|
168
|
+
|
169
|
+
# Scrapes all meta tags found
|
170
|
+
def scrape_meta_data
|
171
|
+
unless @data.meta
|
172
|
+
@data.meta!.name!
|
173
|
+
@data.meta!.property!
|
174
|
+
parsed_document.xpath("//meta").each do |element|
|
175
|
+
if element.attributes["content"]
|
176
|
+
if element.attributes["name"]
|
177
|
+
@data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
|
178
|
+
end
|
179
|
+
|
180
|
+
if element.attributes["property"]
|
181
|
+
@data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
161
187
|
|
162
188
|
def parsed_links
|
163
189
|
@parsed_links ||= parsed_document.search("//a") \
|
164
|
-
|
165
|
-
|
190
|
+
.map {|link| link.attributes["href"] \
|
191
|
+
.to_s.strip}.uniq rescue []
|
166
192
|
end
|
167
193
|
|
168
194
|
def parsed_images
|
169
195
|
@parsed_images ||= parsed_document.search('//img') \
|
170
|
-
|
171
|
-
|
196
|
+
.reject{|i| (i.attributes['src'].nil? || i.attributes['src'].value.empty?) } \
|
197
|
+
.map{ |i| i.attributes['src'].value }.uniq
|
172
198
|
end
|
173
199
|
|
174
200
|
# Stores the error for later inspection
|
@@ -182,13 +208,18 @@ module MetaInspector
|
|
182
208
|
URI.encode(url).to_s.gsub("%23", "#")
|
183
209
|
end
|
184
210
|
|
211
|
+
# Adds 'http' as default scheme, if there is none
|
212
|
+
def with_default_scheme(url)
|
213
|
+
URI.parse(url).scheme.nil? ? 'http://' + url : url
|
214
|
+
end
|
215
|
+
|
185
216
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
186
217
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
187
218
|
def absolutify_url(url)
|
188
219
|
if url =~ /^\w*\:/i
|
189
220
|
encode_url(url)
|
190
221
|
else
|
191
|
-
URI.parse(
|
222
|
+
URI.parse(root_url).merge(encode_url(url)).to_s
|
192
223
|
end
|
193
224
|
rescue URI::InvalidURIError => e
|
194
225
|
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
data/spec/metainspector_spec.rb
CHANGED
@@ -25,7 +25,7 @@ describe MetaInspector do
|
|
25
25
|
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
26
26
|
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
27
27
|
FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
|
28
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/
|
28
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
|
29
29
|
FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
|
30
30
|
|
31
31
|
describe 'Initialization' do
|
@@ -373,16 +373,14 @@ describe MetaInspector do
|
|
373
373
|
image_url = MetaInspector.new('http://pagerankalert.com/image.png')
|
374
374
|
desc = image_url.description
|
375
375
|
|
376
|
-
image_url.
|
377
|
-
image_url.parsed? == true
|
376
|
+
image_url.should be_ok
|
378
377
|
end
|
379
378
|
|
380
379
|
it "should parse images when parse_html_content_type_only is false" do
|
381
380
|
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
|
382
381
|
desc = image_url.description
|
383
382
|
|
384
|
-
image_url.
|
385
|
-
image_url.parsed? == true
|
383
|
+
image_url.should be_ok
|
386
384
|
end
|
387
385
|
|
388
386
|
it "should handle errors when content is image/jpeg and html_content_type_only is true" do
|
@@ -392,7 +390,7 @@ describe MetaInspector do
|
|
392
390
|
title = image_url.title
|
393
391
|
}.to change { image_url.errors.size }
|
394
392
|
|
395
|
-
image_url.errors.first.should == "Scraping exception: The url provided contains image/
|
393
|
+
image_url.errors.first.should == "Scraping exception: The url provided contains image/png content instead of text/html content"
|
396
394
|
end
|
397
395
|
|
398
396
|
it "should handle errors when content is not text/html and html_content_type_only is true" do
|
@@ -405,55 +403,41 @@ describe MetaInspector do
|
|
405
403
|
tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
|
406
404
|
end
|
407
405
|
|
408
|
-
describe "
|
409
|
-
it "should return true if we have
|
406
|
+
describe "ok?" do
|
407
|
+
it "should return true if we have no errors" do
|
410
408
|
good = MetaInspector.new('http://pagerankalert.com')
|
411
|
-
|
409
|
+
good.to_hash
|
412
410
|
|
413
|
-
good.
|
411
|
+
good.should be_ok
|
414
412
|
end
|
415
413
|
|
416
|
-
it "should return false if
|
417
|
-
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :
|
418
|
-
|
414
|
+
it "should return false if there are errors" do
|
415
|
+
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timeout => 0.00000000000001)
|
416
|
+
bad.title
|
419
417
|
|
420
|
-
bad.
|
418
|
+
bad.should_not be_ok
|
421
419
|
end
|
422
420
|
|
423
421
|
it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
|
424
422
|
tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
425
423
|
title = tar.title
|
426
424
|
|
427
|
-
tar.
|
425
|
+
tar.should_not be_ok
|
428
426
|
end
|
429
427
|
end
|
430
428
|
end
|
431
429
|
|
432
430
|
describe "content_type" do
|
433
|
-
it "should return the correct content type of the url
|
431
|
+
it "should return the correct content type of the url for non html pages" do
|
434
432
|
good = MetaInspector.new('http://pagerankalert.com/image.png')
|
435
|
-
title = good.title
|
436
433
|
|
437
|
-
good.
|
438
|
-
good.content_type == "image/jpeg"
|
434
|
+
good.content_type.should == "image/png"
|
439
435
|
end
|
440
436
|
|
441
|
-
it "should return the correct content type of the url
|
437
|
+
it "should return the correct content type of the url for html pages" do
|
442
438
|
good = MetaInspector.new('http://pagerankalert.com')
|
443
|
-
title = good.title
|
444
439
|
|
445
|
-
good.
|
446
|
-
good.content_type == "text/html"
|
440
|
+
good.content_type.should == "text/html"
|
447
441
|
end
|
448
|
-
|
449
|
-
it "should return the correct content type of the url if it is not parsed correctly" do
|
450
|
-
bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
|
451
|
-
title = bad.title
|
452
|
-
|
453
|
-
bad.parsed?.should == false
|
454
|
-
bad.content_type == "image/jpeg"
|
455
|
-
end
|
456
|
-
|
457
442
|
end
|
458
|
-
|
459
443
|
end
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 1.
|
8
|
+
- 11
|
9
|
+
- 0
|
10
|
+
version: 1.11.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-11-
|
18
|
+
date: 2012-11-26 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|