cobweb 0.0.76 → 0.0.77

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.76
2
+ h1. Cobweb v0.0.77
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.76"
6
+ "0.0.77"
7
7
  end
8
8
 
9
9
  end
@@ -50,37 +50,35 @@ module CobwebModule
50
50
  end
51
51
 
52
52
  def retrieve
53
- lock("retrieve") do
54
- unless @redis.sismember("currently_running", @options[:url])
55
- @redis.sadd("currently_running", @options[:url])
56
- unless already_crawled?
57
- if within_crawl_limits?
58
- @stats.update_status("Retrieving #{@options[:url]}...")
59
- @content = Cobweb.new(@options).get(@options[:url], @options)
60
- if @options[:url] == @redis.get("original_base_url")
61
- @redis.set("crawled_base_url", @content[:base_url])
62
- end
63
- update_queues
53
+ unless @redis.sismember("currently_running", @options[:url])
54
+ @redis.sadd("currently_running", @options[:url])
55
+ unless already_crawled?
56
+ if within_crawl_limits?
57
+ @stats.update_status("Retrieving #{@options[:url]}...")
58
+ @content = Cobweb.new(@options).get(@options[:url], @options)
59
+ if @options[:url] == @redis.get("original_base_url")
60
+ @redis.set("crawled_base_url", @content[:base_url])
61
+ end
62
+ update_queues
64
63
 
65
- if content.permitted_type?
66
- ## update statistics
64
+ if content.permitted_type?
65
+ ## update statistics
67
66
 
68
- @stats.update_statistics(@content)
69
- return true
70
- end
71
- else
72
- decrement_queue_counter
67
+ @stats.update_statistics(@content)
68
+ return true
73
69
  end
74
70
  else
75
71
  decrement_queue_counter
76
72
  end
77
73
  else
78
- debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
79
- debug_ap @redis.smembers("currently_running")
80
74
  decrement_queue_counter
81
75
  end
82
- false
76
+ else
77
+ debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
78
+ debug_ap @redis.smembers("currently_running")
79
+ decrement_queue_counter
83
80
  end
81
+ false
84
82
  end
85
83
 
86
84
  def process_links &block
@@ -156,7 +154,7 @@ module CobwebModule
156
154
  increment_process_counter
157
155
  end
158
156
  @redis.sadd "enqueued", @options[:url]
159
-
157
+
160
158
  yield if block_given?
161
159
  @redis.incr("crawl_job_enqueued_count")
162
160
  end
@@ -181,7 +179,7 @@ module CobwebModule
181
179
  end
182
180
 
183
181
  def finished
184
- set_first_to_finish
182
+ set_first_to_finish
185
183
  debug_ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}"
186
184
  @stats.end_crawl(@options)
187
185
  end
@@ -204,7 +202,7 @@ module CobwebModule
204
202
  def first_to_finish?
205
203
  @first_to_finish
206
204
  end
207
-
205
+
208
206
  def crawled_base_url
209
207
  @redis.get("crawled_base_url")
210
208
  end
@@ -228,6 +226,7 @@ module CobwebModule
228
226
  end
229
227
 
230
228
  debug_puts "RECEIVED LOCK [#{key}]"
229
+ @redis.expire("#{key}_lock", 10)
231
230
  begin
232
231
  result = yield
233
232
  ensure
@@ -236,15 +235,15 @@ module CobwebModule
236
235
  end
237
236
  result
238
237
  end
239
-
238
+
240
239
  def debug_ap(value)
241
240
  ap(value) if @options[:debug]
242
241
  end
243
-
242
+
244
243
  def debug_puts(value)
245
244
  puts(value) if @options[:debug]
246
245
  end
247
-
246
+
248
247
  private
249
248
  def setup_defaults
250
249
  @options[:redis_options] = {} unless @options.has_key? :redis_options
@@ -29,30 +29,28 @@ class CrawlJob
29
29
 
30
30
  end
31
31
 
32
- @crawl.lock("crawl_job_process") do
33
- if @crawl.to_be_processed?
34
-
35
- @crawl.process do
32
+ if @crawl.to_be_processed?
33
+
34
+ @crawl.process do
36
35
 
37
- # enqueue to processing queue
38
- @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
39
- send_to_processing_queue(@crawl.content.to_hash, content_request)
36
+ # enqueue to processing queue
37
+ @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
38
+ send_to_processing_queue(@crawl.content.to_hash, content_request)
40
39
 
41
- #if the enqueue counter has been requested update that
42
- if content_request.has_key?(:enqueue_counter_key)
43
- enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
44
- current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
45
- enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
46
- end
47
-
40
+ #if the enqueue counter has been requested update that
41
+ if content_request.has_key?(:enqueue_counter_key)
42
+ enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
43
+ current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
44
+ enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
48
45
  end
49
- else
50
- @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
51
- @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
52
- @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
46
+
53
47
  end
54
-
48
+ else
49
+ @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
50
+ @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
51
+ @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
55
52
  end
53
+
56
54
  end
57
55
  end
58
56
 
@@ -262,16 +262,10 @@ def running?(crawl_id)
262
262
  else
263
263
  if status == @last_stat
264
264
  if @counter > 20
265
- puts ""
266
265
  raise "Static status: #{status}"
267
266
  else
268
267
  @counter += 1
269
268
  end
270
- if @counter == 1
271
- print "Static Status.. #{21-@counter}"
272
- else
273
- print ".#{21-@counter}"
274
- end
275
269
  else
276
270
  result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
277
271
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.76
4
+ version: 0.0.77
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-18 00:00:00.000000000 Z
12
+ date: 2012-10-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70301036171860 !ruby/object:Gem::Requirement
16
+ requirement: &70355025183680 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70301036171860
24
+ version_requirements: *70355025183680
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70301036170940 !ruby/object:Gem::Requirement
27
+ requirement: &70355025182000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70301036170940
35
+ version_requirements: *70355025182000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70301036169780 !ruby/object:Gem::Requirement
38
+ requirement: &70355025180660 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70301036169780
46
+ version_requirements: *70355025180660
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70301036169040 !ruby/object:Gem::Requirement
49
+ requirement: &70355025179740 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70301036169040
57
+ version_requirements: *70355025179740
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70301036167760 !ruby/object:Gem::Requirement
60
+ requirement: &70355025179160 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70301036167760
68
+ version_requirements: *70355025179160
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70301036166240 !ruby/object:Gem::Requirement
71
+ requirement: &70355025178560 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70301036166240
79
+ version_requirements: *70355025178560
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70301036164760 !ruby/object:Gem::Requirement
82
+ requirement: &70355025177760 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70301036164760
90
+ version_requirements: *70355025177760
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70301036163660 !ruby/object:Gem::Requirement
93
+ requirement: &70355025177180 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70301036163660
101
+ version_requirements: *70355025177180
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70301036162760 !ruby/object:Gem::Requirement
104
+ requirement: &70355025176740 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70301036162760
112
+ version_requirements: *70355025176740
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70301036161620 !ruby/object:Gem::Requirement
115
+ requirement: &70355025175980 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70301036161620
123
+ version_requirements: *70355025175980
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70301036161100 !ruby/object:Gem::Requirement
126
+ requirement: &70355025175280 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,9 +131,9 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70301036161100
134
+ version_requirements: *70355025175280
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
- crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
136
+ crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface
138
138
  to monitor the progress of the crawls.
139
139
  email: stewart@rockwellcottage.com