metainspector 1.9.2 → 1.9.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +10 -4
- data/lib/meta_inspector/scraper.rb +17 -11
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -1
- data/spec/metainspector_spec.rb +46 -0
- metadata +7 -7
data/README.rdoc
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
= MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
|
2
|
-
|
3
1
|
MetaInspector is a gem for web scraping purposes. You give it an URL, and it lets you easily get its title, links, and meta tags.
|
4
2
|
|
5
3
|
= Installation
|
@@ -69,6 +67,16 @@ The full scraped document if accessible from:
|
|
69
67
|
|
70
68
|
page.document # Nokogiri doc that you can use it to get any element from the page
|
71
69
|
|
70
|
+
= Error handling
|
71
|
+
|
72
|
+
You can check if the page has been successfully parsed with:
|
73
|
+
|
74
|
+
page.parsed? # Will return true if everything looks OK
|
75
|
+
|
76
|
+
In case there have been any errors, you can check them with:
|
77
|
+
|
78
|
+
page.errors # Will return an array with the error messages
|
79
|
+
|
72
80
|
= Examples
|
73
81
|
|
74
82
|
You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
|
@@ -114,9 +122,7 @@ You're welcome to fork this project and send pull requests. I want to thank spec
|
|
114
122
|
|
115
123
|
* Get page.base_dir from the URL
|
116
124
|
* Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
|
117
|
-
* Be able to set a timeout in seconds
|
118
125
|
* If keywords seem to be separated by blank spaces, replace them with commas
|
119
|
-
* Mocks
|
120
126
|
* Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
|
121
127
|
* Autodiscover all available meta tags
|
122
128
|
|
@@ -9,14 +9,16 @@ require 'timeout'
|
|
9
9
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
10
10
|
module MetaInspector
|
11
11
|
class Scraper
|
12
|
-
attr_reader :url, :scheme
|
12
|
+
attr_reader :url, :scheme, :errors
|
13
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
14
|
# If no scheme given, set it to http:// by default
|
15
|
+
|
15
16
|
def initialize(url, timeout = 20)
|
16
17
|
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
17
18
|
@scheme = URI.parse(url).scheme || 'http'
|
18
19
|
@timeout = timeout
|
19
20
|
@data = Hashie::Rash.new('url' => @url)
|
21
|
+
@errors = []
|
20
22
|
end
|
21
23
|
|
22
24
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -83,13 +85,16 @@ module MetaInspector
|
|
83
85
|
@data.to_hash
|
84
86
|
end
|
85
87
|
|
88
|
+
# Returns true if parsing has been successful
|
89
|
+
def parsed?
|
90
|
+
!@parsed_document.nil?
|
91
|
+
end
|
92
|
+
|
86
93
|
# Returns the whole parsed document
|
87
94
|
def parsed_document
|
88
95
|
@parsed_document ||= Nokogiri::HTML(document)
|
89
|
-
|
90
96
|
rescue Exception => e
|
91
|
-
|
92
|
-
warn e.message
|
97
|
+
add_fatal_error "Parsing exception: #{e.message}"
|
93
98
|
end
|
94
99
|
|
95
100
|
# Returns the original, unparsed document
|
@@ -97,15 +102,11 @@ module MetaInspector
|
|
97
102
|
@document ||= Timeout::timeout(@timeout) { open(@url).read }
|
98
103
|
|
99
104
|
rescue SocketError
|
100
|
-
|
101
|
-
@scraped = false
|
105
|
+
add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
|
102
106
|
rescue TimeoutError
|
103
|
-
|
104
|
-
@scraped = false
|
107
|
+
add_fatal_error 'Timeout!!!'
|
105
108
|
rescue Exception => e
|
106
|
-
|
107
|
-
warn e.message
|
108
|
-
@scraped = false
|
109
|
+
add_fatal_error "Scraping exception: #{e.message}"
|
109
110
|
end
|
110
111
|
|
111
112
|
# Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
|
@@ -145,6 +146,11 @@ module MetaInspector
|
|
145
146
|
|
146
147
|
private
|
147
148
|
|
149
|
+
# Stores the error for later inspection
|
150
|
+
def add_fatal_error(error)
|
151
|
+
@errors << error
|
152
|
+
end
|
153
|
+
|
148
154
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
149
155
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
150
156
|
def absolutify_url(url)
|
data/meta_inspector.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |gem|
|
|
18
18
|
gem.add_dependency 'charguess', '1.3.20111021164500'
|
19
19
|
gem.add_dependency 'rash', '0.3.2'
|
20
20
|
|
21
|
-
gem.add_development_dependency 'rspec', '2.
|
21
|
+
gem.add_development_dependency 'rspec', '2.11.0'
|
22
22
|
gem.add_development_dependency 'fakeweb', '1.3.0'
|
23
23
|
gem.add_development_dependency 'awesome_print', '1.0.2'
|
24
24
|
gem.add_development_dependency 'rake', '0.9.2.2'
|
data/spec/metainspector_spec.rb
CHANGED
@@ -231,6 +231,52 @@ describe MetaInspector do
|
|
231
231
|
end
|
232
232
|
end
|
233
233
|
|
234
|
+
describe 'exception handling' do
|
235
|
+
before(:each) do
|
236
|
+
FakeWeb.allow_net_connect = true
|
237
|
+
end
|
238
|
+
|
239
|
+
after(:each) do
|
240
|
+
FakeWeb.allow_net_connect = false
|
241
|
+
end
|
242
|
+
|
243
|
+
it "should handle timeouts" do
|
244
|
+
impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
|
245
|
+
|
246
|
+
expect {
|
247
|
+
title = impatient.title
|
248
|
+
}.to change { impatient.errors.size }
|
249
|
+
|
250
|
+
impatient.errors.first.should == "Timeout!!!"
|
251
|
+
end
|
252
|
+
|
253
|
+
it "should handle socket errors" do
|
254
|
+
nowhere = MetaInspector.new('http://caca232dsdsaer3sdsd-asd343.org')
|
255
|
+
|
256
|
+
expect {
|
257
|
+
title = nowhere.title
|
258
|
+
}.to change { nowhere.errors.size }
|
259
|
+
|
260
|
+
nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
|
261
|
+
end
|
262
|
+
|
263
|
+
describe "parsed?" do
|
264
|
+
it "should return true if we have a parsed document" do
|
265
|
+
good = MetaInspector.new('http://w3clove.com')
|
266
|
+
title = good.title
|
267
|
+
|
268
|
+
good.parsed?.should == true
|
269
|
+
end
|
270
|
+
|
271
|
+
it "should return false if we don't have a parsed document" do
|
272
|
+
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
|
273
|
+
title = bad.title
|
274
|
+
|
275
|
+
bad.parsed?.should == false
|
276
|
+
end
|
277
|
+
end
|
278
|
+
end
|
279
|
+
|
234
280
|
describe "regression tests" do
|
235
281
|
describe "get image" do
|
236
282
|
it "should find image on youtube" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 53
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 9
|
9
|
-
-
|
10
|
-
version: 1.9.
|
9
|
+
- 3
|
10
|
+
version: 1.9.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-07-
|
18
|
+
date: 2012-07-22 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -72,12 +72,12 @@ dependencies:
|
|
72
72
|
requirements:
|
73
73
|
- - "="
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
hash:
|
75
|
+
hash: 35
|
76
76
|
segments:
|
77
77
|
- 2
|
78
|
-
-
|
78
|
+
- 11
|
79
79
|
- 0
|
80
|
-
version: 2.
|
80
|
+
version: 2.11.0
|
81
81
|
type: :development
|
82
82
|
version_requirements: *id004
|
83
83
|
- !ruby/object:Gem::Dependency
|