cobweb 0.0.36 → 0.0.37

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.36
2
+ h1. Cobweb v0.0.37
3
3
 
4
4
  h2. Intro
5
5
 
data/lib/cobweb.rb CHANGED
@@ -20,7 +20,7 @@ class Cobweb
20
20
  # investigate using event machine for single threaded crawling
21
21
 
22
22
  def self.version
23
- "0.0.36"
23
+ "0.0.37"
24
24
  end
25
25
 
26
26
  def method_missing(method_sym, *arguments, &block)
@@ -34,7 +34,7 @@ class Cobweb
34
34
 
35
35
  def initialize(options = {})
36
36
  @options = options
37
-
37
+ default_use_encoding_safe_process_job_to false
38
38
  default_follow_redirects_to true
39
39
  default_redirect_limit_to 10
40
40
  default_processing_queue_to CobwebProcessJob
data/lib/crawl_job.rb CHANGED
@@ -52,9 +52,7 @@ class CrawlJob
52
52
  end
53
53
 
54
54
  # enqueue to processing queue
55
- Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
56
- puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
57
- puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
55
+ send_to_processing_queue(content, content_request)
58
56
 
59
57
  #if the enqueue counter has been requested update that
60
58
  if content_request.has_key? :enqueue_counter_key
@@ -63,7 +61,7 @@ class CrawlJob
63
61
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
64
62
  end
65
63
 
66
- # if the'res nothing left queued or the crawled limit has been reached
64
+ # if there's nothing left queued or the crawled limit has been reached
67
65
  if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
68
66
 
69
67
  # finished
@@ -86,6 +84,19 @@ class CrawlJob
86
84
 
87
85
  end
88
86
 
87
+ def self.send_to_processing_queue(content, content_request)
88
+ content_to_send = content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
89
+ if content_request[:use_encoding_safe_process_job]
90
+ content_to_send[:body] = Base64.encode64(content[:body])
91
+ content_to_send[:processing_queue] = content_request[:processing_queue]
92
+ Resque.enqueue(EncodingSafeProcessJob, content_to_send)
93
+ else
94
+ Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
95
+ end
96
+ puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}" if content_request[:debug]
97
+ puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
98
+ end
99
+
89
100
  private
90
101
 
91
102
  def self.within_crawl_limits?(crawl_limit)
data/lib/encoding_safe_process_job.rb ADDED
@@ -0,0 +1,13 @@
1
+ class EncodingSafeProcessJob
2
+
3
+ @queue = :encoding_safe_process_job
4
+
5
+ def self.perform(content)
6
+ clazz = const_get(content["processing_queue"])
7
+ content["body"] = Base64.decode64(content["body"])
8
+ clazz.perform(content)
9
+ end
10
+ end
11
+
12
+
13
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.36
4
+ version: 0.0.37
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-19 00:00:00.000000000 Z
12
+ date: 2012-04-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70241538594440 !ruby/object:Gem::Requirement
16
+ requirement: &70275220162200 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70241538594440
24
+ version_requirements: *70275220162200
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70241538594020 !ruby/object:Gem::Requirement
27
+ requirement: &70275220161740 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70241538594020
35
+ version_requirements: *70275220161740
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70241538593600 !ruby/object:Gem::Requirement
38
+ requirement: &70275220177640 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70241538593600
46
+ version_requirements: *70275220177640
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70241538593180 !ruby/object:Gem::Requirement
49
+ requirement: &70275220177220 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70241538593180
57
+ version_requirements: *70275220177220
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70241538609120 !ruby/object:Gem::Requirement
60
+ requirement: &70275220176640 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70241538609120
68
+ version_requirements: *70275220176640
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70241538608700 !ruby/object:Gem::Requirement
71
+ requirement: &70275220176220 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70241538608700
79
+ version_requirements: *70275220176220
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70241538608280 !ruby/object:Gem::Requirement
82
+ requirement: &70275220175780 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70241538608280
90
+ version_requirements: *70275220175780
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70241538607860 !ruby/object:Gem::Requirement
93
+ requirement: &70275220175360 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70241538607860
101
+ version_requirements: *70275220175360
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70241538607440 !ruby/object:Gem::Requirement
104
+ requirement: &70275220174940 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70241538607440
112
+ version_requirements: *70275220174940
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70241538607020 !ruby/object:Gem::Requirement
115
+ requirement: &70275220174520 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70241538607020
123
+ version_requirements: *70275220174520
124
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
125
  your crawl.
126
126
  email: stewart@rockwellcottage.com
@@ -141,6 +141,7 @@ files:
141
141
  - lib/cobweb_process_job.rb
142
142
  - lib/content_link_parser.rb
143
143
  - lib/crawl_job.rb
144
+ - lib/encoding_safe_process_job.rb
144
145
  - lib/redirect_error.rb
145
146
  - lib/robots.rb
146
147
  - lib/stats.rb