metainspector 1.9.2 → 1.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +10 -4
- data/lib/meta_inspector/scraper.rb +17 -11
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -1
- data/spec/metainspector_spec.rb +46 -0
- metadata +7 -7
data/README.rdoc
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
= MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
|
2
|
-
|
3
1
|
MetaInspector is a gem for web scraping purposes. You give it an URL, and it lets you easily get its title, links, and meta tags.
|
4
2
|
|
5
3
|
= Installation
|
@@ -69,6 +67,16 @@ The full scraped document if accessible from:
|
|
69
67
|
|
70
68
|
page.document # Nokogiri doc that you can use to get any element from the page
|
71
69
|
|
70
|
+
= Error handling
|
71
|
+
|
72
|
+
You can check if the page has been successfully parsed with:
|
73
|
+
|
74
|
+
page.parsed? # Will return true if everything looks OK
|
75
|
+
|
76
|
+
In case there have been any errors, you can check them with:
|
77
|
+
|
78
|
+
page.errors # Will return an array with the error messages
|
79
|
+
|
72
80
|
= Examples
|
73
81
|
|
74
82
|
You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
|
@@ -114,9 +122,7 @@ You're welcome to fork this project and send pull requests. I want to thank spec
|
|
114
122
|
|
115
123
|
* Get page.base_dir from the URL
|
116
124
|
* Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
|
117
|
-
* Be able to set a timeout in seconds
|
118
125
|
* If keywords seem to be separated by blank spaces, replace them with commas
|
119
|
-
* Mocks
|
120
126
|
* Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
|
121
127
|
* Autodiscover all available meta tags
|
122
128
|
|
data/lib/meta_inspector/scraper.rb
CHANGED
@@ -9,14 +9,16 @@ require 'timeout'
|
|
9
9
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
10
10
|
module MetaInspector
|
11
11
|
class Scraper
|
12
|
-
attr_reader :url, :scheme
|
12
|
+
attr_reader :url, :scheme, :errors
|
13
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
14
|
# If no scheme given, set it to http:// by default
|
15
|
+
|
15
16
|
def initialize(url, timeout = 20)
|
16
17
|
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
17
18
|
@scheme = URI.parse(url).scheme || 'http'
|
18
19
|
@timeout = timeout
|
19
20
|
@data = Hashie::Rash.new('url' => @url)
|
21
|
+
@errors = []
|
20
22
|
end
|
21
23
|
|
22
24
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -83,13 +85,16 @@ module MetaInspector
|
|
83
85
|
@data.to_hash
|
84
86
|
end
|
85
87
|
|
88
|
+
# Returns true if parsing has been successful
|
89
|
+
def parsed?
|
90
|
+
!@parsed_document.nil?
|
91
|
+
end
|
92
|
+
|
86
93
|
# Returns the whole parsed document
|
87
94
|
def parsed_document
|
88
95
|
@parsed_document ||= Nokogiri::HTML(document)
|
89
|
-
|
90
96
|
rescue Exception => e
|
91
|
-
|
92
|
-
warn e.message
|
97
|
+
add_fatal_error "Parsing exception: #{e.message}"
|
93
98
|
end
|
94
99
|
|
95
100
|
# Returns the original, unparsed document
|
@@ -97,15 +102,11 @@ module MetaInspector
|
|
97
102
|
@document ||= Timeout::timeout(@timeout) { open(@url).read }
|
98
103
|
|
99
104
|
rescue SocketError
|
100
|
-
|
101
|
-
@scraped = false
|
105
|
+
add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
|
102
106
|
rescue TimeoutError
|
103
|
-
|
104
|
-
@scraped = false
|
107
|
+
add_fatal_error 'Timeout!!!'
|
105
108
|
rescue Exception => e
|
106
|
-
|
107
|
-
warn e.message
|
108
|
-
@scraped = false
|
109
|
+
add_fatal_error "Scraping exception: #{e.message}"
|
109
110
|
end
|
110
111
|
|
111
112
|
# Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
|
@@ -145,6 +146,11 @@ module MetaInspector
|
|
145
146
|
|
146
147
|
private
|
147
148
|
|
149
|
+
# Stores the error for later inspection
|
150
|
+
def add_fatal_error(error)
|
151
|
+
@errors << error
|
152
|
+
end
|
153
|
+
|
148
154
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
149
155
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
150
156
|
def absolutify_url(url)
|
data/meta_inspector.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |gem|
|
|
18
18
|
gem.add_dependency 'charguess', '1.3.20111021164500'
|
19
19
|
gem.add_dependency 'rash', '0.3.2'
|
20
20
|
|
21
|
-
gem.add_development_dependency 'rspec', '2.
|
21
|
+
gem.add_development_dependency 'rspec', '2.11.0'
|
22
22
|
gem.add_development_dependency 'fakeweb', '1.3.0'
|
23
23
|
gem.add_development_dependency 'awesome_print', '1.0.2'
|
24
24
|
gem.add_development_dependency 'rake', '0.9.2.2'
|
data/spec/metainspector_spec.rb
CHANGED
@@ -231,6 +231,52 @@ describe MetaInspector do
|
|
231
231
|
end
|
232
232
|
end
|
233
233
|
|
234
|
+
describe 'exception handling' do
|
235
|
+
before(:each) do
|
236
|
+
FakeWeb.allow_net_connect = true
|
237
|
+
end
|
238
|
+
|
239
|
+
after(:each) do
|
240
|
+
FakeWeb.allow_net_connect = false
|
241
|
+
end
|
242
|
+
|
243
|
+
it "should handle timeouts" do
|
244
|
+
impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
|
245
|
+
|
246
|
+
expect {
|
247
|
+
title = impatient.title
|
248
|
+
}.to change { impatient.errors.size }
|
249
|
+
|
250
|
+
impatient.errors.first.should == "Timeout!!!"
|
251
|
+
end
|
252
|
+
|
253
|
+
it "should handle socket errors" do
|
254
|
+
nowhere = MetaInspector.new('http://caca232dsdsaer3sdsd-asd343.org')
|
255
|
+
|
256
|
+
expect {
|
257
|
+
title = nowhere.title
|
258
|
+
}.to change { nowhere.errors.size }
|
259
|
+
|
260
|
+
nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
|
261
|
+
end
|
262
|
+
|
263
|
+
describe "parsed?" do
|
264
|
+
it "should return true if we have a parsed document" do
|
265
|
+
good = MetaInspector.new('http://w3clove.com')
|
266
|
+
title = good.title
|
267
|
+
|
268
|
+
good.parsed?.should == true
|
269
|
+
end
|
270
|
+
|
271
|
+
it "should return false if we don't have a parsed document" do
|
272
|
+
bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
|
273
|
+
title = bad.title
|
274
|
+
|
275
|
+
bad.parsed?.should == false
|
276
|
+
end
|
277
|
+
end
|
278
|
+
end
|
279
|
+
|
234
280
|
describe "regression tests" do
|
235
281
|
describe "get image" do
|
236
282
|
it "should find image on youtube" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 53
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 9
|
9
|
-
- 2
|
10
|
-
version: 1.9.2
|
9
|
+
- 3
|
10
|
+
version: 1.9.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-07-
|
18
|
+
date: 2012-07-22 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -72,12 +72,12 @@ dependencies:
|
|
72
72
|
requirements:
|
73
73
|
- - "="
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
hash:
|
75
|
+
hash: 35
|
76
76
|
segments:
|
77
77
|
- 2
|
78
|
-
-
|
78
|
+
- 11
|
79
79
|
- 0
|
80
|
-
version: 2.
|
80
|
+
version: 2.11.0
|
81
81
|
type: :development
|
82
82
|
version_requirements: *id004
|
83
83
|
- !ruby/object:Gem::Dependency
|