cobweb 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.2
2
+ h1. Cobweb v1.0.3
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -101,6 +101,7 @@ Creates a new crawler object based on a base_url
101
101
  ** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
102
102
  ** :valid_mime_types - an array of mime types, wildcards allowed (e.g. 'text/*'); defaults to ['*/*']
103
103
  ** :direct_call_process_job - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
104
+ ** :raise_exceptions - boolean, defaults to false, in which case exceptions are handled by printing debug output; setting this to true will raise exceptions in your app
104
105
 
105
106
 
106
107
  bc. crawler = Cobweb.new(:follow_redirects => false)
@@ -47,6 +47,7 @@ class Cobweb
47
47
  default_obey_robots_to false
48
48
  default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
49
49
  default_valid_mime_types_to ["*/*"]
50
+ default_raise_exceptions_to false
50
51
 
51
52
  end
52
53
 
@@ -205,6 +206,7 @@ class Cobweb
205
206
  redis.expire unique_id, @options[:cache].to_i
206
207
  end
207
208
  rescue RedirectError => e
209
+ raise e if @options[:raise_exceptions]
208
210
  puts "ERROR RedirectError: #{e.message}"
209
211
 
210
212
  ## generate a blank content
@@ -220,6 +222,7 @@ class Cobweb
220
222
  content[:links] = {}
221
223
 
222
224
  rescue SocketError => e
225
+ raise e if @options[:raise_exceptions]
223
226
  puts "ERROR SocketError: #{e.message}"
224
227
 
225
228
  ## generate a blank content
@@ -235,6 +238,7 @@ class Cobweb
235
238
  content[:links] = {}
236
239
 
237
240
  rescue Timeout::Error => e
241
+ raise e if @options[:raise_exceptions]
238
242
  puts "ERROR Timeout::Error: #{e.message}"
239
243
 
240
244
  ## generate a blank content
@@ -342,6 +346,7 @@ class Cobweb
342
346
  end
343
347
  end
344
348
  rescue RedirectError => e
349
+ raise e if @options[:raise_exceptions]
345
350
  puts "ERROR RedirectError: #{e.message}"
346
351
 
347
352
  ## generate a blank content
@@ -357,6 +362,7 @@ class Cobweb
357
362
  content[:links] = {}
358
363
 
359
364
  rescue SocketError => e
365
+ raise e if @options[:raise_exceptions]
360
366
  puts "ERROR SocketError: #{e.message}"
361
367
 
362
368
  ## generate a blank content
@@ -372,6 +378,7 @@ class Cobweb
372
378
  content[:links] = {}
373
379
 
374
380
  rescue Timeout::Error => e
381
+ raise e if @options[:raise_exceptions]
375
382
  puts "ERROR Timeout::Error: #{e.message}"
376
383
 
377
384
  ## generate a blank content
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.2"
6
+ "1.0.3"
7
7
  end
8
8
 
9
9
  end
@@ -5,6 +5,8 @@ describe Cobweb do
5
5
  before(:each) do
6
6
  @base_url = "http://www.baseurl.com/"
7
7
  @cobweb = Cobweb.new :quiet => true, :cache => nil
8
+
9
+ @default_options = {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}
8
10
  end
9
11
 
10
12
  it "should generate a cobweb object" do
@@ -186,7 +188,7 @@ describe Cobweb do
186
188
  describe "location setting" do
187
189
  it "Get should strip fragments" do
188
190
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
189
- Net::HTTP::Get.should_receive(:new).with("/", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
191
+ Net::HTTP::Get.should_receive(:new).with("/", @default_options)
190
192
  @cobweb.get("http://www.google.com/#ignore")
191
193
  end
192
194
  it "head should strip fragments" do
@@ -196,12 +198,12 @@ describe Cobweb do
196
198
  end
197
199
  it "get should not strip path" do
198
200
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
199
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
201
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", @default_options)
200
202
  @cobweb.get("http://www.google.com/path/to/stuff#ignore")
201
203
  end
202
204
  it "get should not strip query string" do
203
205
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
204
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
206
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", @default_options)
205
207
  @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
206
208
  end
207
209
  end
@@ -63,9 +63,9 @@ RSpec.configure do |config|
63
63
 
64
64
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
65
65
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
66
- Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request)
67
- Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_robot_request)
68
- Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request2)
66
+ Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_redirect_request)
67
+ Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_robot_request)
68
+ Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_redirect_request2)
69
69
 
70
70
  Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
71
71
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-28 00:00:00.000000000 Z
12
+ date: 2013-01-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70096521975680 !ruby/object:Gem::Requirement
16
+ requirement: &70115295148600 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70096521975680
24
+ version_requirements: *70115295148600
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70096521974080 !ruby/object:Gem::Requirement
27
+ requirement: &70115295147540 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70096521974080
35
+ version_requirements: *70115295147540
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70096521972700 !ruby/object:Gem::Requirement
38
+ requirement: &70115295145880 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70096521972700
46
+ version_requirements: *70115295145880
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70096521971740 !ruby/object:Gem::Requirement
49
+ requirement: &70115295144920 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70096521971740
57
+ version_requirements: *70115295144920
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70096521971220 !ruby/object:Gem::Requirement
60
+ requirement: &70115295144060 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70096521971220
68
+ version_requirements: *70115295144060
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70096521970620 !ruby/object:Gem::Requirement
71
+ requirement: &70115295143400 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70096521970620
79
+ version_requirements: *70115295143400
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70096521969800 !ruby/object:Gem::Requirement
82
+ requirement: &70115295142120 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70096521969800
90
+ version_requirements: *70115295142120
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70096521969240 !ruby/object:Gem::Requirement
93
+ requirement: &70115295138080 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70096521969240
101
+ version_requirements: *70115295138080
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70096521968800 !ruby/object:Gem::Requirement
104
+ requirement: &70115295134720 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,21 +109,21 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70096521968800
112
+ version_requirements: *70115295134720
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70096521968060 !ruby/object:Gem::Requirement
115
+ requirement: &70115295131880 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
119
119
  - !ruby/object:Gem::Version
120
- version: 1.0.2
120
+ version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70096521968060
123
+ version_requirements: *70115295131880
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70096521967360 !ruby/object:Gem::Requirement
126
+ requirement: &70115295127820 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70096521967360
134
+ version_requirements: *70115295127820
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface