cobweb 0.0.76 → 0.0.77

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.76
2
+ h1. Cobweb v0.0.77
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.76"
6
+ "0.0.77"
7
7
  end
8
8
 
9
9
  end
@@ -50,37 +50,35 @@ module CobwebModule
50
50
  end
51
51
 
52
52
  def retrieve
53
- lock("retrieve") do
54
- unless @redis.sismember("currently_running", @options[:url])
55
- @redis.sadd("currently_running", @options[:url])
56
- unless already_crawled?
57
- if within_crawl_limits?
58
- @stats.update_status("Retrieving #{@options[:url]}...")
59
- @content = Cobweb.new(@options).get(@options[:url], @options)
60
- if @options[:url] == @redis.get("original_base_url")
61
- @redis.set("crawled_base_url", @content[:base_url])
62
- end
63
- update_queues
53
+ unless @redis.sismember("currently_running", @options[:url])
54
+ @redis.sadd("currently_running", @options[:url])
55
+ unless already_crawled?
56
+ if within_crawl_limits?
57
+ @stats.update_status("Retrieving #{@options[:url]}...")
58
+ @content = Cobweb.new(@options).get(@options[:url], @options)
59
+ if @options[:url] == @redis.get("original_base_url")
60
+ @redis.set("crawled_base_url", @content[:base_url])
61
+ end
62
+ update_queues
64
63
 
65
- if content.permitted_type?
66
- ## update statistics
64
+ if content.permitted_type?
65
+ ## update statistics
67
66
 
68
- @stats.update_statistics(@content)
69
- return true
70
- end
71
- else
72
- decrement_queue_counter
67
+ @stats.update_statistics(@content)
68
+ return true
73
69
  end
74
70
  else
75
71
  decrement_queue_counter
76
72
  end
77
73
  else
78
- debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
79
- debug_ap @redis.smembers("currently_running")
80
74
  decrement_queue_counter
81
75
  end
82
- false
76
+ else
77
+ debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
78
+ debug_ap @redis.smembers("currently_running")
79
+ decrement_queue_counter
83
80
  end
81
+ false
84
82
  end
85
83
 
86
84
  def process_links &block
@@ -156,7 +154,7 @@ module CobwebModule
156
154
  increment_process_counter
157
155
  end
158
156
  @redis.sadd "enqueued", @options[:url]
159
-
157
+
160
158
  yield if block_given?
161
159
  @redis.incr("crawl_job_enqueued_count")
162
160
  end
@@ -181,7 +179,7 @@ module CobwebModule
181
179
  end
182
180
 
183
181
  def finished
184
- set_first_to_finish
182
+ set_first_to_finish
185
183
  debug_ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}"
186
184
  @stats.end_crawl(@options)
187
185
  end
@@ -204,7 +202,7 @@ module CobwebModule
204
202
  def first_to_finish?
205
203
  @first_to_finish
206
204
  end
207
-
205
+
208
206
  def crawled_base_url
209
207
  @redis.get("crawled_base_url")
210
208
  end
@@ -228,6 +226,7 @@ module CobwebModule
228
226
  end
229
227
 
230
228
  debug_puts "RECEIVED LOCK [#{key}]"
229
+ @redis.expire("#{key}_lock", 10)
231
230
  begin
232
231
  result = yield
233
232
  ensure
@@ -236,15 +235,15 @@ module CobwebModule
236
235
  end
237
236
  result
238
237
  end
239
-
238
+
240
239
  def debug_ap(value)
241
240
  ap(value) if @options[:debug]
242
241
  end
243
-
242
+
244
243
  def debug_puts(value)
245
244
  puts(value) if @options[:debug]
246
245
  end
247
-
246
+
248
247
  private
249
248
  def setup_defaults
250
249
  @options[:redis_options] = {} unless @options.has_key? :redis_options
@@ -29,30 +29,28 @@ class CrawlJob
29
29
 
30
30
  end
31
31
 
32
- @crawl.lock("crawl_job_process") do
33
- if @crawl.to_be_processed?
34
-
35
- @crawl.process do
32
+ if @crawl.to_be_processed?
33
+
34
+ @crawl.process do
36
35
 
37
- # enqueue to processing queue
38
- @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
39
- send_to_processing_queue(@crawl.content.to_hash, content_request)
36
+ # enqueue to processing queue
37
+ @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
38
+ send_to_processing_queue(@crawl.content.to_hash, content_request)
40
39
 
41
- #if the enqueue counter has been requested update that
42
- if content_request.has_key?(:enqueue_counter_key)
43
- enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
44
- current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
45
- enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
46
- end
47
-
40
+ #if the enqueue counter has been requested update that
41
+ if content_request.has_key?(:enqueue_counter_key)
42
+ enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
43
+ current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
44
+ enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
48
45
  end
49
- else
50
- @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
51
- @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
52
- @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
46
+
53
47
  end
54
-
48
+ else
49
+ @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
50
+ @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
51
+ @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
55
52
  end
53
+
56
54
  end
57
55
  end
58
56
 
@@ -262,16 +262,10 @@ def running?(crawl_id)
262
262
  else
263
263
  if status == @last_stat
264
264
  if @counter > 20
265
- puts ""
266
265
  raise "Static status: #{status}"
267
266
  else
268
267
  @counter += 1
269
268
  end
270
- if @counter == 1
271
- print "Static Status.. #{21-@counter}"
272
- else
273
- print ".#{21-@counter}"
274
- end
275
269
  else
276
270
  result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
277
271
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.76
4
+ version: 0.0.77
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-18 00:00:00.000000000 Z
12
+ date: 2012-10-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70301036171860 !ruby/object:Gem::Requirement
16
+ requirement: &70355025183680 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70301036171860
24
+ version_requirements: *70355025183680
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70301036170940 !ruby/object:Gem::Requirement
27
+ requirement: &70355025182000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70301036170940
35
+ version_requirements: *70355025182000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70301036169780 !ruby/object:Gem::Requirement
38
+ requirement: &70355025180660 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70301036169780
46
+ version_requirements: *70355025180660
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70301036169040 !ruby/object:Gem::Requirement
49
+ requirement: &70355025179740 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70301036169040
57
+ version_requirements: *70355025179740
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70301036167760 !ruby/object:Gem::Requirement
60
+ requirement: &70355025179160 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70301036167760
68
+ version_requirements: *70355025179160
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70301036166240 !ruby/object:Gem::Requirement
71
+ requirement: &70355025178560 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70301036166240
79
+ version_requirements: *70355025178560
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70301036164760 !ruby/object:Gem::Requirement
82
+ requirement: &70355025177760 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70301036164760
90
+ version_requirements: *70355025177760
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70301036163660 !ruby/object:Gem::Requirement
93
+ requirement: &70355025177180 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70301036163660
101
+ version_requirements: *70355025177180
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70301036162760 !ruby/object:Gem::Requirement
104
+ requirement: &70355025176740 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70301036162760
112
+ version_requirements: *70355025176740
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70301036161620 !ruby/object:Gem::Requirement
115
+ requirement: &70355025175980 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70301036161620
123
+ version_requirements: *70355025175980
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70301036161100 !ruby/object:Gem::Requirement
126
+ requirement: &70355025175280 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,9 +131,9 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70301036161100
134
+ version_requirements: *70355025175280
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
- crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
136
+ crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface
138
138
  to monitor the progress of the crawls.
139
139
  email: stewart@rockwellcottage.com