metainspector 1.9.2 → 1.9.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,3 @@
1
- = MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
2
-
3
1
  MetaInspector is a gem for web scraping purposes. You give it an URL, and it lets you easily get its title, links, and meta tags.
4
2
 
5
3
  = Installation
@@ -69,6 +67,16 @@ The full scraped document if accessible from:
69
67
 
70
68
  page.document # Nokogiri doc that you can use it to get any element from the page
71
69
 
70
+ = Error handling
71
+
72
+ You can check if the page has been successfully parsed with:
73
+
74
+ page.parsed? # Will return true if everything looks OK
75
+
76
+ In case there have been any errors, you can check them with:
77
+
78
+ page.errors # Will return an array with the error messages
79
+
72
80
  = Examples
73
81
 
74
82
  You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
@@ -114,9 +122,7 @@ You're welcome to fork this project and send pull requests. I want to thank spec
114
122
 
115
123
  * Get page.base_dir from the URL
116
124
  * Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
117
- * Be able to set a timeout in seconds
118
125
  * If keywords seem to be separated by blank spaces, replace them with commas
119
- * Mocks
120
126
  * Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
121
127
  * Autodiscover all available meta tags
122
128
 
@@ -9,14 +9,16 @@ require 'timeout'
9
9
  # MetaInspector provides an easy way to scrape web pages and get its elements
10
10
  module MetaInspector
11
11
  class Scraper
12
- attr_reader :url, :scheme
12
+ attr_reader :url, :scheme, :errors
13
13
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
14
  # If no scheme given, set it to http:// by default
15
+
15
16
  def initialize(url, timeout = 20)
16
17
  @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
17
18
  @scheme = URI.parse(url).scheme || 'http'
18
19
  @timeout = timeout
19
20
  @data = Hashie::Rash.new('url' => @url)
21
+ @errors = []
20
22
  end
21
23
 
22
24
  # Returns the parsed document title, from the content of the <title> tag.
@@ -83,13 +85,16 @@ module MetaInspector
83
85
  @data.to_hash
84
86
  end
85
87
 
88
+ # Returns true if parsing has been successful
89
+ def parsed?
90
+ !@parsed_document.nil?
91
+ end
92
+
86
93
  # Returns the whole parsed document
87
94
  def parsed_document
88
95
  @parsed_document ||= Nokogiri::HTML(document)
89
-
90
96
  rescue Exception => e
91
- warn 'An exception occurred while trying to scrape the page!'
92
- warn e.message
97
+ add_fatal_error "Parsing exception: #{e.message}"
93
98
  end
94
99
 
95
100
  # Returns the original, unparsed document
@@ -97,15 +102,11 @@ module MetaInspector
97
102
  @document ||= Timeout::timeout(@timeout) { open(@url).read }
98
103
 
99
104
  rescue SocketError
100
- warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
101
- @scraped = false
105
+ add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
102
106
  rescue TimeoutError
103
- warn 'Timeout!!!'
104
- @scraped = false
107
+ add_fatal_error 'Timeout!!!'
105
108
  rescue Exception => e
106
- warn 'An exception occurred while trying to fetch the page!'
107
- warn e.message
108
- @scraped = false
109
+ add_fatal_error "Scraping exception: #{e.message}"
109
110
  end
110
111
 
111
112
  # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
@@ -145,6 +146,11 @@ module MetaInspector
145
146
 
146
147
  private
147
148
 
149
+ # Stores the error for later inspection
150
+ def add_fatal_error(error)
151
+ @errors << error
152
+ end
153
+
148
154
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
149
155
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
150
156
  def absolutify_url(url)
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.9.2"
4
+ VERSION = "1.9.3"
5
5
  end
@@ -18,7 +18,7 @@ Gem::Specification.new do |gem|
18
18
  gem.add_dependency 'charguess', '1.3.20111021164500'
19
19
  gem.add_dependency 'rash', '0.3.2'
20
20
 
21
- gem.add_development_dependency 'rspec', '2.10.0'
21
+ gem.add_development_dependency 'rspec', '2.11.0'
22
22
  gem.add_development_dependency 'fakeweb', '1.3.0'
23
23
  gem.add_development_dependency 'awesome_print', '1.0.2'
24
24
  gem.add_development_dependency 'rake', '0.9.2.2'
@@ -231,6 +231,52 @@ describe MetaInspector do
231
231
  end
232
232
  end
233
233
 
234
+ describe 'exception handling' do
235
+ before(:each) do
236
+ FakeWeb.allow_net_connect = true
237
+ end
238
+
239
+ after(:each) do
240
+ FakeWeb.allow_net_connect = false
241
+ end
242
+
243
+ it "should handle timeouts" do
244
+ impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
245
+
246
+ expect {
247
+ title = impatient.title
248
+ }.to change { impatient.errors.size }
249
+
250
+ impatient.errors.first.should == "Timeout!!!"
251
+ end
252
+
253
+ it "should handle socket errors" do
254
+ nowhere = MetaInspector.new('http://caca232dsdsaer3sdsd-asd343.org')
255
+
256
+ expect {
257
+ title = nowhere.title
258
+ }.to change { nowhere.errors.size }
259
+
260
+ nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
261
+ end
262
+
263
+ describe "parsed?" do
264
+ it "should return true if we have a parsed document" do
265
+ good = MetaInspector.new('http://w3clove.com')
266
+ title = good.title
267
+
268
+ good.parsed?.should == true
269
+ end
270
+
271
+ it "should return false if we don't have a parsed document" do
272
+ bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
273
+ title = bad.title
274
+
275
+ bad.parsed?.should == false
276
+ end
277
+ end
278
+ end
279
+
234
280
  describe "regression tests" do
235
281
  describe "get image" do
236
282
  it "should find image on youtube" do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 55
4
+ hash: 53
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 9
9
- - 2
10
- version: 1.9.2
9
+ - 3
10
+ version: 1.9.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-07-13 00:00:00 Z
18
+ date: 2012-07-22 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
@@ -72,12 +72,12 @@ dependencies:
72
72
  requirements:
73
73
  - - "="
74
74
  - !ruby/object:Gem::Version
75
- hash: 39
75
+ hash: 35
76
76
  segments:
77
77
  - 2
78
- - 10
78
+ - 11
79
79
  - 0
80
- version: 2.10.0
80
+ version: 2.11.0
81
81
  type: :development
82
82
  version_requirements: *id004
83
83
  - !ruby/object:Gem::Dependency