cobweb 0.0.36 → 0.0.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.36
2
+ h1. Cobweb v0.0.37
3
3
 
4
4
  h2. Intro
5
5
 
data/lib/cobweb.rb CHANGED
@@ -20,7 +20,7 @@ class Cobweb
20
20
  # investigate using event machine for single threaded crawling
21
21
 
22
22
  def self.version
23
- "0.0.36"
23
+ "0.0.37"
24
24
  end
25
25
 
26
26
  def method_missing(method_sym, *arguments, &block)
@@ -34,7 +34,7 @@ class Cobweb
34
34
 
35
35
  def initialize(options = {})
36
36
  @options = options
37
-
37
+ default_use_encoding_safe_process_job_to false
38
38
  default_follow_redirects_to true
39
39
  default_redirect_limit_to 10
40
40
  default_processing_queue_to CobwebProcessJob
data/lib/crawl_job.rb CHANGED
@@ -52,9 +52,7 @@ class CrawlJob
52
52
  end
53
53
 
54
54
  # enqueue to processing queue
55
- Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
56
- puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
57
- puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
55
+ send_to_processing_queue(content, content_request)
58
56
 
59
57
  #if the enqueue counter has been requested update that
60
58
  if content_request.has_key? :enqueue_counter_key
@@ -63,7 +61,7 @@ class CrawlJob
63
61
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
64
62
  end
65
63
 
66
- # if the'res nothing left queued or the crawled limit has been reached
64
+ # if there's nothing left queued or the crawled limit has been reached
67
65
  if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
68
66
 
69
67
  # finished
@@ -86,6 +84,19 @@ class CrawlJob
86
84
 
87
85
  end
88
86
 
87
+ def self.send_to_processing_queue(content, content_request)
88
+ content_to_send = content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
89
+ if content_request[:use_encoding_safe_process_job]
90
+ content_to_send[:body] = Base64.encode64(content[:body])
91
+ content_to_send[:processing_queue] = content_request[:processing_queue]
92
+ Resque.enqueue(EncodingSafeProcessJob, content_to_send)
93
+ else
94
+ Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
95
+ end
96
+ puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}" if content_request[:debug]
97
+ puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
98
+ end
99
+
89
100
  private
90
101
 
91
102
  def self.within_crawl_limits?(crawl_limit)
data/lib/encoding_safe_process_job.rb ADDED
@@ -0,0 +1,13 @@
1
+ class EncodingSafeProcessJob
2
+
3
+ @queue = :encoding_safe_process_job
4
+
5
+ def self.perform(content)
6
+ clazz = const_get(content["processing_queue"])
7
+ content["body"] = Base64.decode64(content["body"])
8
+ clazz.perform(content)
9
+ end
10
+ end
11
+
12
+
13
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.36
4
+ version: 0.0.37
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-19 00:00:00.000000000 Z
12
+ date: 2012-04-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70241538594440 !ruby/object:Gem::Requirement
16
+ requirement: &70275220162200 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70241538594440
24
+ version_requirements: *70275220162200
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70241538594020 !ruby/object:Gem::Requirement
27
+ requirement: &70275220161740 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70241538594020
35
+ version_requirements: *70275220161740
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70241538593600 !ruby/object:Gem::Requirement
38
+ requirement: &70275220177640 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70241538593600
46
+ version_requirements: *70275220177640
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70241538593180 !ruby/object:Gem::Requirement
49
+ requirement: &70275220177220 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70241538593180
57
+ version_requirements: *70275220177220
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70241538609120 !ruby/object:Gem::Requirement
60
+ requirement: &70275220176640 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70241538609120
68
+ version_requirements: *70275220176640
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70241538608700 !ruby/object:Gem::Requirement
71
+ requirement: &70275220176220 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70241538608700
79
+ version_requirements: *70275220176220
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70241538608280 !ruby/object:Gem::Requirement
82
+ requirement: &70275220175780 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70241538608280
90
+ version_requirements: *70275220175780
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70241538607860 !ruby/object:Gem::Requirement
93
+ requirement: &70275220175360 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70241538607860
101
+ version_requirements: *70275220175360
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70241538607440 !ruby/object:Gem::Requirement
104
+ requirement: &70275220174940 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70241538607440
112
+ version_requirements: *70275220174940
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70241538607020 !ruby/object:Gem::Requirement
115
+ requirement: &70275220174520 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70241538607020
123
+ version_requirements: *70275220174520
124
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
125
  your crawl.
126
126
  email: stewart@rockwellcottage.com
@@ -141,6 +141,7 @@ files:
141
141
  - lib/cobweb_process_job.rb
142
142
  - lib/content_link_parser.rb
143
143
  - lib/crawl_job.rb
144
+ - lib/encoding_safe_process_job.rb
144
145
  - lib/redirect_error.rb
145
146
  - lib/robots.rb
146
147
  - lib/stats.rb