metainspector 1.9.2 → 1.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,3 @@
1
- = MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
2
-
3
1
  MetaInspector is a gem for web scraping purposes. You give it an URL, and it lets you easily get its title, links, and meta tags.
4
2
 
5
3
  = Installation
@@ -69,6 +67,16 @@ The full scraped document if accessible from:
69
67
 
70
68
  page.document # Nokogiri doc that you can use it to get any element from the page
71
69
 
70
+ = Error handling
71
+
72
+ You can check if the page has been successfully parsed with:
73
+
74
+ page.parsed? # Will return true if everything looks OK
75
+
76
+ In case there have been any errors, you can check them with:
77
+
78
+ page.errors # Will return an array with the error messages
79
+
72
80
  = Examples
73
81
 
74
82
  You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
@@ -114,9 +122,7 @@ You're welcome to fork this project and send pull requests. I want to thank spec
114
122
 
115
123
  * Get page.base_dir from the URL
116
124
  * Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
117
- * Be able to set a timeout in seconds
118
125
  * If keywords seem to be separated by blank spaces, replace them with commas
119
- * Mocks
120
126
  * Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
121
127
  * Autodiscover all available meta tags
122
128
 
@@ -9,14 +9,16 @@ require 'timeout'
9
9
  # MetaInspector provides an easy way to scrape web pages and get its elements
10
10
  module MetaInspector
11
11
  class Scraper
12
- attr_reader :url, :scheme
12
+ attr_reader :url, :scheme, :errors
13
13
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
14
  # If no scheme given, set it to http:// by default
15
+
15
16
  def initialize(url, timeout = 20)
16
17
  @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
17
18
  @scheme = URI.parse(url).scheme || 'http'
18
19
  @timeout = timeout
19
20
  @data = Hashie::Rash.new('url' => @url)
21
+ @errors = []
20
22
  end
21
23
 
22
24
  # Returns the parsed document title, from the content of the <title> tag.
@@ -83,13 +85,16 @@ module MetaInspector
83
85
  @data.to_hash
84
86
  end
85
87
 
88
+ # Returns true if parsing has been successful
89
+ def parsed?
90
+ !@parsed_document.nil?
91
+ end
92
+
86
93
  # Returns the whole parsed document
87
94
  def parsed_document
88
95
  @parsed_document ||= Nokogiri::HTML(document)
89
-
90
96
  rescue Exception => e
91
- warn 'An exception occurred while trying to scrape the page!'
92
- warn e.message
97
+ add_fatal_error "Parsing exception: #{e.message}"
93
98
  end
94
99
 
95
100
  # Returns the original, unparsed document
@@ -97,15 +102,11 @@ module MetaInspector
97
102
  @document ||= Timeout::timeout(@timeout) { open(@url).read }
98
103
 
99
104
  rescue SocketError
100
- warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
101
- @scraped = false
105
+ add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
102
106
  rescue TimeoutError
103
- warn 'Timeout!!!'
104
- @scraped = false
107
+ add_fatal_error 'Timeout!!!'
105
108
  rescue Exception => e
106
- warn 'An exception occurred while trying to fetch the page!'
107
- warn e.message
108
- @scraped = false
109
+ add_fatal_error "Scraping exception: #{e.message}"
109
110
  end
110
111
 
111
112
  # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
@@ -145,6 +146,11 @@ module MetaInspector
145
146
 
146
147
  private
147
148
 
149
+ # Stores the error for later inspection
150
+ def add_fatal_error(error)
151
+ @errors << error
152
+ end
153
+
148
154
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
149
155
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
150
156
  def absolutify_url(url)
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.9.2"
4
+ VERSION = "1.9.3"
5
5
  end
@@ -18,7 +18,7 @@ Gem::Specification.new do |gem|
18
18
  gem.add_dependency 'charguess', '1.3.20111021164500'
19
19
  gem.add_dependency 'rash', '0.3.2'
20
20
 
21
- gem.add_development_dependency 'rspec', '2.10.0'
21
+ gem.add_development_dependency 'rspec', '2.11.0'
22
22
  gem.add_development_dependency 'fakeweb', '1.3.0'
23
23
  gem.add_development_dependency 'awesome_print', '1.0.2'
24
24
  gem.add_development_dependency 'rake', '0.9.2.2'
@@ -231,6 +231,52 @@ describe MetaInspector do
231
231
  end
232
232
  end
233
233
 
234
+ describe 'exception handling' do
235
+ before(:each) do
236
+ FakeWeb.allow_net_connect = true
237
+ end
238
+
239
+ after(:each) do
240
+ FakeWeb.allow_net_connect = false
241
+ end
242
+
243
+ it "should handle timeouts" do
244
+ impatient = MetaInspector.new('http://w3clove.com', 0.0000000000001)
245
+
246
+ expect {
247
+ title = impatient.title
248
+ }.to change { impatient.errors.size }
249
+
250
+ impatient.errors.first.should == "Timeout!!!"
251
+ end
252
+
253
+ it "should handle socket errors" do
254
+ nowhere = MetaInspector.new('http://caca232dsdsaer3sdsd-asd343.org')
255
+
256
+ expect {
257
+ title = nowhere.title
258
+ }.to change { nowhere.errors.size }
259
+
260
+ nowhere.errors.first.should == "Socket error: The url provided does not exist or is temporarily unavailable"
261
+ end
262
+
263
+ describe "parsed?" do
264
+ it "should return true if we have a parsed document" do
265
+ good = MetaInspector.new('http://w3clove.com')
266
+ title = good.title
267
+
268
+ good.parsed?.should == true
269
+ end
270
+
271
+ it "should return false if we don't have a parsed document" do
272
+ bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', 0.00000000000001)
273
+ title = bad.title
274
+
275
+ bad.parsed?.should == false
276
+ end
277
+ end
278
+ end
279
+
234
280
  describe "regression tests" do
235
281
  describe "get image" do
236
282
  it "should find image on youtube" do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 55
4
+ hash: 53
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 9
9
- - 2
10
- version: 1.9.2
9
+ - 3
10
+ version: 1.9.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-07-13 00:00:00 Z
18
+ date: 2012-07-22 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
@@ -72,12 +72,12 @@ dependencies:
72
72
  requirements:
73
73
  - - "="
74
74
  - !ruby/object:Gem::Version
75
- hash: 39
75
+ hash: 35
76
76
  segments:
77
77
  - 2
78
- - 10
78
+ - 11
79
79
  - 0
80
- version: 2.10.0
80
+ version: 2.11.0
81
81
  type: :development
82
82
  version_requirements: *id004
83
83
  - !ruby/object:Gem::Dependency