cobweb 1.0.2 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.2
2
+ h1. Cobweb v1.0.3
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -101,6 +101,7 @@ Creates a new crawler object based on a base_url
101
101
  ** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
102
102
  ** :valid_mime_types - an array of mime types that takes wildcards (e.g. 'text/*'); defaults to ['*/*']
103
103
  ** :direct_call_process_job - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
104
+ ** :raise_exceptions - boolean, defaults to false: exceptions are handled and reported with debug output; set this to true to raise exceptions in your app
104
105
 
105
106
 
106
107
  bc. crawler = Cobweb.new(:follow_redirects => false)
@@ -47,6 +47,7 @@ class Cobweb
47
47
  default_obey_robots_to false
48
48
  default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
49
49
  default_valid_mime_types_to ["*/*"]
50
+ default_raise_exceptions_to false
50
51
 
51
52
  end
52
53
 
@@ -205,6 +206,7 @@ class Cobweb
205
206
  redis.expire unique_id, @options[:cache].to_i
206
207
  end
207
208
  rescue RedirectError => e
209
+ raise e if @options[:raise_exceptions]
208
210
  puts "ERROR RedirectError: #{e.message}"
209
211
 
210
212
  ## generate a blank content
@@ -220,6 +222,7 @@ class Cobweb
220
222
  content[:links] = {}
221
223
 
222
224
  rescue SocketError => e
225
+ raise e if @options[:raise_exceptions]
223
226
  puts "ERROR SocketError: #{e.message}"
224
227
 
225
228
  ## generate a blank content
@@ -235,6 +238,7 @@ class Cobweb
235
238
  content[:links] = {}
236
239
 
237
240
  rescue Timeout::Error => e
241
+ raise e if @options[:raise_exceptions]
238
242
  puts "ERROR Timeout::Error: #{e.message}"
239
243
 
240
244
  ## generate a blank content
@@ -342,6 +346,7 @@ class Cobweb
342
346
  end
343
347
  end
344
348
  rescue RedirectError => e
349
+ raise e if @options[:raise_exceptions]
345
350
  puts "ERROR RedirectError: #{e.message}"
346
351
 
347
352
  ## generate a blank content
@@ -357,6 +362,7 @@ class Cobweb
357
362
  content[:links] = {}
358
363
 
359
364
  rescue SocketError => e
365
+ raise e if @options[:raise_exceptions]
360
366
  puts "ERROR SocketError: #{e.message}"
361
367
 
362
368
  ## generate a blank content
@@ -372,6 +378,7 @@ class Cobweb
372
378
  content[:links] = {}
373
379
 
374
380
  rescue Timeout::Error => e
381
+ raise e if @options[:raise_exceptions]
375
382
  puts "ERROR Timeout::Error: #{e.message}"
376
383
 
377
384
  ## generate a blank content
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.2"
6
+ "1.0.3"
7
7
  end
8
8
 
9
9
  end
@@ -5,6 +5,8 @@ describe Cobweb do
5
5
  before(:each) do
6
6
  @base_url = "http://www.baseurl.com/"
7
7
  @cobweb = Cobweb.new :quiet => true, :cache => nil
8
+
9
+ @default_options = {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}
8
10
  end
9
11
 
10
12
  it "should generate a cobweb object" do
@@ -186,7 +188,7 @@ describe Cobweb do
186
188
  describe "location setting" do
187
189
  it "Get should strip fragments" do
188
190
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
189
- Net::HTTP::Get.should_receive(:new).with("/", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
191
+ Net::HTTP::Get.should_receive(:new).with("/", @default_options)
190
192
  @cobweb.get("http://www.google.com/#ignore")
191
193
  end
192
194
  it "head should strip fragments" do
@@ -196,12 +198,12 @@ describe Cobweb do
196
198
  end
197
199
  it "get should not strip path" do
198
200
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
199
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
201
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", @default_options)
200
202
  @cobweb.get("http://www.google.com/path/to/stuff#ignore")
201
203
  end
202
204
  it "get should not strip query string" do
203
205
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
204
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
206
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", @default_options)
205
207
  @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
206
208
  end
207
209
  end
@@ -63,9 +63,9 @@ RSpec.configure do |config|
63
63
 
64
64
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
65
65
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
66
- Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request)
67
- Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_robot_request)
68
- Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request2)
66
+ Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_redirect_request)
67
+ Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_robot_request)
68
+ Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_redirect_request2)
69
69
 
70
70
  Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
71
71
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-28 00:00:00.000000000 Z
12
+ date: 2013-01-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70096521975680 !ruby/object:Gem::Requirement
16
+ requirement: &70115295148600 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70096521975680
24
+ version_requirements: *70115295148600
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70096521974080 !ruby/object:Gem::Requirement
27
+ requirement: &70115295147540 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70096521974080
35
+ version_requirements: *70115295147540
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70096521972700 !ruby/object:Gem::Requirement
38
+ requirement: &70115295145880 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70096521972700
46
+ version_requirements: *70115295145880
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70096521971740 !ruby/object:Gem::Requirement
49
+ requirement: &70115295144920 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70096521971740
57
+ version_requirements: *70115295144920
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70096521971220 !ruby/object:Gem::Requirement
60
+ requirement: &70115295144060 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70096521971220
68
+ version_requirements: *70115295144060
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70096521970620 !ruby/object:Gem::Requirement
71
+ requirement: &70115295143400 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70096521970620
79
+ version_requirements: *70115295143400
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70096521969800 !ruby/object:Gem::Requirement
82
+ requirement: &70115295142120 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70096521969800
90
+ version_requirements: *70115295142120
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70096521969240 !ruby/object:Gem::Requirement
93
+ requirement: &70115295138080 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70096521969240
101
+ version_requirements: *70115295138080
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70096521968800 !ruby/object:Gem::Requirement
104
+ requirement: &70115295134720 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,21 +109,21 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70096521968800
112
+ version_requirements: *70115295134720
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70096521968060 !ruby/object:Gem::Requirement
115
+ requirement: &70115295131880 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
119
119
  - !ruby/object:Gem::Version
120
- version: 1.0.2
120
+ version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70096521968060
123
+ version_requirements: *70115295131880
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70096521967360 !ruby/object:Gem::Requirement
126
+ requirement: &70115295127820 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70096521967360
134
+ version_requirements: *70115295127820
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface