metainspector 1.10.2 → 1.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +2 -2
- data/lib/meta_inspector/scraper.rb +98 -67
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/metainspector_spec.rb +17 -33
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -47,7 +47,7 @@ Then you can see the scraped data like this:
|
|
47
47
|
page.title # title of the page, as string
|
48
48
|
page.links # array of strings, with every link found on the page as an absolute URL
|
49
49
|
page.internal_links # array of strings, with every internal link found on the page as an absolute URL
|
50
|
-
page.
|
50
|
+
page.external_links # array of strings, with every external link found on the page as an absolute URL
|
51
51
|
page.meta_description # meta description, as string
|
52
52
|
page.description # returns the meta description, or the first long paragraph if no meta description is found
|
53
53
|
page.meta_keywords # meta keywords, as string
|
@@ -85,7 +85,7 @@ The full scraped document if accessible from:
|
|
85
85
|
|
86
86
|
You can check if the page has been successfully parsed with:
|
87
87
|
|
88
|
-
page.
|
88
|
+
page.ok? # Will return true if everything looks OK
|
89
89
|
|
90
90
|
In case there have been any errors, you can check them with:
|
91
91
|
|
@@ -8,90 +8,93 @@ require 'timeout'
|
|
8
8
|
# MetaInspector provides an easy way to scrape web pages and get their elements
|
9
9
|
module MetaInspector
|
10
10
|
class Scraper
|
11
|
-
attr_reader :url, :scheme, :host, :root_url, :errors, :content_type
|
11
|
+
attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
|
12
12
|
|
13
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
|
-
# If no scheme given, set it to http:// by default
|
15
14
|
# Options:
|
16
15
|
# => timeout: defaults to 20 seconds
|
17
16
|
# => html_content_only: whether an exception should be raised if the request content-type is not text/html. Defaults to false
|
18
17
|
def initialize(url, options = {})
|
19
|
-
url
|
20
|
-
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
18
|
+
@url = with_default_scheme(encode_url(url))
|
21
19
|
@scheme = URI.parse(@url).scheme
|
22
20
|
@host = URI.parse(@url).host
|
23
21
|
@root_url = "#{@scheme}://#{@host}/"
|
24
22
|
@timeout = options[:timeout] || 20
|
25
|
-
@data = Hashie::Rash.new
|
23
|
+
@data = Hashie::Rash.new
|
26
24
|
@errors = []
|
27
25
|
@html_content_only = options[:html_content_only] || false
|
28
26
|
end
|
29
27
|
|
30
28
|
# Returns the parsed document title, from the content of the <title> tag.
|
31
|
-
# This is not the same as the
|
29
|
+
# This is not the same as the meta_title tag
|
32
30
|
def title
|
33
|
-
@
|
31
|
+
@title ||= parsed_document.css('title').inner_html.gsub(/\t|\n|\r/, '') rescue nil
|
34
32
|
end
|
35
33
|
|
36
34
|
# A description getter that first checks for a meta description and if not present will
|
37
|
-
# guess by looking
|
35
|
+
# guess by looking at the first paragraph with more than 120 characters
|
38
36
|
def description
|
39
37
|
meta_description.nil? ? secondary_description : meta_description
|
40
38
|
end
|
41
39
|
|
42
40
|
# Links found on the page, as absolute URLs
|
43
41
|
def links
|
44
|
-
@
|
42
|
+
@links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
|
45
43
|
end
|
46
44
|
|
47
45
|
# Internal links found on the page, as absolute URLs
|
48
46
|
def internal_links
|
49
|
-
@
|
47
|
+
@internal_links ||= links.select {|link| URI.parse(link).host == host }
|
50
48
|
end
|
51
49
|
|
52
50
|
# External links found on the page, as absolute URLs
|
53
51
|
def external_links
|
54
|
-
@
|
52
|
+
@external_links ||= links.select {|link| URI.parse(link).host != host }
|
55
53
|
end
|
56
54
|
|
57
55
|
# Images found on the page, as absolute URLs
|
58
56
|
def images
|
59
|
-
@
|
57
|
+
@images ||= parsed_images.map{ |i| absolutify_url(i) }
|
58
|
+
end
|
59
|
+
|
60
|
+
# Returns the parsed image from Facebook's open graph property tags
|
61
|
+
# Most major websites now define this property, and it is usually very relevant
|
62
|
+
# See doc at http://developers.facebook.com/docs/opengraph/
|
63
|
+
def image
|
64
|
+
meta_og_image
|
60
65
|
end
|
61
66
|
|
62
67
|
# Returns the parsed document meta rss links
|
63
68
|
def feed
|
64
|
-
@
|
69
|
+
@feed ||= parsed_document.xpath("//link").select{ |link|
|
65
70
|
link.attributes["type"] && link.attributes["type"].value =~ /(atom|rss)/
|
66
71
|
}.map { |link|
|
67
72
|
absolutify_url(link.attributes["href"].value)
|
68
73
|
}.first rescue nil
|
69
74
|
end
|
70
75
|
|
71
|
-
# Returns the parsed image from Facebook's open graph property tags
|
72
|
-
# Most all major websites now define this property and is usually very relevant
|
73
|
-
# See doc at http://developers.facebook.com/docs/opengraph/
|
74
|
-
def image
|
75
|
-
meta_og_image
|
76
|
-
end
|
77
|
-
|
78
76
|
# Returns the charset from the meta tags, looking for it in the following order:
|
79
77
|
# <meta charset='utf-8' />
|
80
78
|
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
81
79
|
def charset
|
82
|
-
@
|
80
|
+
@charset ||= (charset_from_meta_charset || charset_from_content_type)
|
83
81
|
end
|
84
82
|
|
85
83
|
# Returns all parsed data as a nested Hash
|
86
84
|
def to_hash
|
87
|
-
|
88
|
-
image;images;feed;links;charset;title;meta_keywords;internal_links;external_links
|
89
|
-
@data.to_hash
|
90
|
-
end
|
85
|
+
scrape_meta_data
|
91
86
|
|
92
|
-
|
93
|
-
|
94
|
-
|
87
|
+
{
|
88
|
+
'url' => url,
|
89
|
+
'title' => title,
|
90
|
+
'links' => links,
|
91
|
+
'internal_links' => internal_links,
|
92
|
+
'external_links' => external_links,
|
93
|
+
'images' => images,
|
94
|
+
'charset' => charset,
|
95
|
+
'feed' => feed,
|
96
|
+
'content_type' => content_type
|
97
|
+
}.merge @data.to_hash
|
95
98
|
end
|
96
99
|
|
97
100
|
# Returns the whole parsed document
|
@@ -103,24 +106,33 @@ module MetaInspector
|
|
103
106
|
|
104
107
|
# Returns the original, unparsed document
|
105
108
|
def document
|
106
|
-
@document ||=
|
107
|
-
|
108
|
-
|
109
|
+
@document ||= if html_content_only && content_type != "text/html"
|
110
|
+
raise "The url provided contains #{content_type} content instead of text/html content" and nil
|
111
|
+
else
|
112
|
+
request.read
|
113
|
+
end
|
114
|
+
rescue Exception => e
|
115
|
+
add_fatal_error "Scraping exception: #{e.message}"
|
116
|
+
end
|
109
117
|
|
110
|
-
|
111
|
-
|
112
|
-
|
118
|
+
# Returns the content_type of the fetched document
|
119
|
+
def content_type
|
120
|
+
@content_type ||= request.content_type
|
121
|
+
end
|
113
122
|
|
114
|
-
|
115
|
-
|
123
|
+
# Returns true if there are no errors
|
124
|
+
def ok?
|
125
|
+
errors.empty?
|
126
|
+
end
|
116
127
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
rescue Exception => e
|
122
|
-
add_fatal_error "Scraping exception: #{e.message}"
|
128
|
+
##### DEPRECATIONS ####
|
129
|
+
def parsed?
|
130
|
+
warn "the parsed? method has been deprecated, please use ok? instead"
|
131
|
+
!@parsed_document.nil?
|
123
132
|
end
|
133
|
+
##### DEPRECATIONS ####
|
134
|
+
|
135
|
+
private
|
124
136
|
|
125
137
|
# Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
|
126
138
|
# meta name: keywords, description, robots, generator
|
@@ -132,43 +144,57 @@ module MetaInspector
|
|
132
144
|
def method_missing(method_name)
|
133
145
|
if method_name.to_s =~ /^meta_(.*)/
|
134
146
|
key = $1
|
135
|
-
#special treatment for og:
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
unless @data.meta
|
140
|
-
@data.meta!.name!
|
141
|
-
@data.meta!.property!
|
142
|
-
parsed_document.xpath("//meta").each do |element|
|
143
|
-
if element.attributes["content"]
|
144
|
-
if element.attributes["name"]
|
145
|
-
@data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
|
146
|
-
end
|
147
|
-
|
148
|
-
if element.attributes["property"]
|
149
|
-
@data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
end
|
147
|
+
key = "og:#{$1}" if key =~ /^og_(.*)/ # special treatment for og:
|
148
|
+
|
149
|
+
scrape_meta_data
|
150
|
+
|
154
151
|
@data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
|
155
152
|
else
|
156
153
|
super
|
157
154
|
end
|
158
155
|
end
|
159
156
|
|
160
|
-
|
157
|
+
# Makes the request to the server
|
158
|
+
def request
|
159
|
+
Timeout::timeout(timeout) { @request ||= open(url) }
|
160
|
+
|
161
|
+
rescue TimeoutError
|
162
|
+
add_fatal_error 'Timeout!!!'
|
163
|
+
rescue SocketError
|
164
|
+
add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
|
165
|
+
rescue Exception => e
|
166
|
+
add_fatal_error "Scraping exception: #{e.message}"
|
167
|
+
end
|
168
|
+
|
169
|
+
# Scrapes all meta tags found
|
170
|
+
def scrape_meta_data
|
171
|
+
unless @data.meta
|
172
|
+
@data.meta!.name!
|
173
|
+
@data.meta!.property!
|
174
|
+
parsed_document.xpath("//meta").each do |element|
|
175
|
+
if element.attributes["content"]
|
176
|
+
if element.attributes["name"]
|
177
|
+
@data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
|
178
|
+
end
|
179
|
+
|
180
|
+
if element.attributes["property"]
|
181
|
+
@data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
161
187
|
|
162
188
|
def parsed_links
|
163
189
|
@parsed_links ||= parsed_document.search("//a") \
|
164
|
-
|
165
|
-
|
190
|
+
.map {|link| link.attributes["href"] \
|
191
|
+
.to_s.strip}.uniq rescue []
|
166
192
|
end
|
167
193
|
|
168
194
|
def parsed_images
|
169
195
|
@parsed_images ||= parsed_document.search('//img') \
|
170
|
-
|
171
|
-
|
196
|
+
.reject{|i| (i.attributes['src'].nil? || i.attributes['src'].value.empty?) } \
|
197
|
+
.map{ |i| i.attributes['src'].value }.uniq
|
172
198
|
end
|
173
199
|
|
174
200
|
# Stores the error for later inspection
|
@@ -182,13 +208,18 @@ module MetaInspector
|
|
182
208
|
URI.encode(url).to_s.gsub("%23", "#")
|
183
209
|
end
|
184
210
|
|
211
|
+
# Adds 'http' as default scheme, if there is none
|
212
|
+
def with_default_scheme(url)
|
213
|
+
URI.parse(url).scheme.nil? ? 'http://' + url : url
|
214
|
+
end
|
215
|
+
|
185
216
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
186
217
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
187
218
|
def absolutify_url(url)
|
188
219
|
if url =~ /^\w*\:/i
|
189
220
|
encode_url(url)
|
190
221
|
else
|
191
|
-
URI.parse(
|
222
|
+
URI.parse(root_url).merge(encode_url(url)).to_s
|
192
223
|
end
|
193
224
|
rescue URI::InvalidURIError => e
|
194
225
|
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
data/spec/metainspector_spec.rb
CHANGED
@@ -25,7 +25,7 @@ describe MetaInspector do
|
|
25
25
|
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
26
26
|
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
27
27
|
FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
|
28
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/
|
28
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
|
29
29
|
FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
|
30
30
|
|
31
31
|
describe 'Initialization' do
|
@@ -373,16 +373,14 @@ describe MetaInspector do
|
|
373
373
|
image_url = MetaInspector.new('http://pagerankalert.com/image.png')
|
374
374
|
desc = image_url.description
|
375
375
|
|
376
|
-
image_url.
|
377
|
-
image_url.parsed? == true
|
376
|
+
image_url.should be_ok
|
378
377
|
end
|
379
378
|
|
380
379
|
it "should parse images when parse_html_content_type_only is false" do
|
381
380
|
image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
|
382
381
|
desc = image_url.description
|
383
382
|
|
384
|
-
image_url.
|
385
|
-
image_url.parsed? == true
|
383
|
+
image_url.should be_ok
|
386
384
|
end
|
387
385
|
|
388
386
|
it "should handle errors when content is image/jpeg and html_content_type_only is true" do
|
@@ -392,7 +390,7 @@ describe MetaInspector do
|
|
392
390
|
title = image_url.title
|
393
391
|
}.to change { image_url.errors.size }
|
394
392
|
|
395
|
-
image_url.errors.first.should == "Scraping exception: The url provided contains image/
|
393
|
+
image_url.errors.first.should == "Scraping exception: The url provided contains image/png content instead of text/html content"
|
396
394
|
end
|
397
395
|
|
398
396
|
it "should handle errors when content is not text/html and html_content_type_only is true" do
|
@@ -405,55 +403,41 @@ describe MetaInspector do
|
|
405
403
|
tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
|
406
404
|
end
|
407
405
|
|
408
|
-
describe "
|
409
|
-
it "should return true if we have
|
406
|
+
describe "ok?" do
|
407
|
+
it "should return true if we have no errors" do
|
410
408
|
good = MetaInspector.new('http://pagerankalert.com')
|
411
|
-
|
409
|
+
good.to_hash
|
412
410
|
|
413
|
-
good.
|
411
|
+
good.should be_ok
|
414
412
|
end
|
415
413
|
|
416
|
-
it "should return false if
|
417
|
-
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :
|
418
|
-
|
414
|
+
it "should return false if there are errors" do
|
415
|
+
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timeout => 0.00000000000001)
|
416
|
+
bad.title
|
419
417
|
|
420
|
-
bad.
|
418
|
+
bad.should_not be_ok
|
421
419
|
end
|
422
420
|
|
423
421
|
it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
|
424
422
|
tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
|
425
423
|
title = tar.title
|
426
424
|
|
427
|
-
tar.
|
425
|
+
tar.should_not be_ok
|
428
426
|
end
|
429
427
|
end
|
430
428
|
end
|
431
429
|
|
432
430
|
describe "content_type" do
|
433
|
-
it "should return the correct content type of the url
|
431
|
+
it "should return the correct content type of the url for non html pages" do
|
434
432
|
good = MetaInspector.new('http://pagerankalert.com/image.png')
|
435
|
-
title = good.title
|
436
433
|
|
437
|
-
good.
|
438
|
-
good.content_type == "image/jpeg"
|
434
|
+
good.content_type.should == "image/png"
|
439
435
|
end
|
440
436
|
|
441
|
-
it "should return the correct content type of the url
|
437
|
+
it "should return the correct content type of the url for html pages" do
|
442
438
|
good = MetaInspector.new('http://pagerankalert.com')
|
443
|
-
title = good.title
|
444
439
|
|
445
|
-
good.
|
446
|
-
good.content_type == "text/html"
|
440
|
+
good.content_type.should == "text/html"
|
447
441
|
end
|
448
|
-
|
449
|
-
it "should return the correct content type of the url if it is not parsed correctly" do
|
450
|
-
bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
|
451
|
-
title = bad.title
|
452
|
-
|
453
|
-
bad.parsed?.should == false
|
454
|
-
bad.content_type == "image/jpeg"
|
455
|
-
end
|
456
|
-
|
457
442
|
end
|
458
|
-
|
459
443
|
end
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 1.
|
8
|
+
- 11
|
9
|
+
- 0
|
10
|
+
version: 1.11.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-11-
|
18
|
+
date: 2012-11-26 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|