cobweb 0.0.76 → 0.0.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +26 -27
- data/lib/crawl_job.rb +17 -19
- data/spec/cobweb/cobweb_job_spec.rb +0 -6
- metadata +25 -25
data/README.textile
CHANGED
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -50,37 +50,35 @@ module CobwebModule
|
|
50
50
|
end
|
51
51
|
|
52
52
|
def retrieve
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
update_queues
|
53
|
+
unless @redis.sismember("currently_running", @options[:url])
|
54
|
+
@redis.sadd("currently_running", @options[:url])
|
55
|
+
unless already_crawled?
|
56
|
+
if within_crawl_limits?
|
57
|
+
@stats.update_status("Retrieving #{@options[:url]}...")
|
58
|
+
@content = Cobweb.new(@options).get(@options[:url], @options)
|
59
|
+
if @options[:url] == @redis.get("original_base_url")
|
60
|
+
@redis.set("crawled_base_url", @content[:base_url])
|
61
|
+
end
|
62
|
+
update_queues
|
64
63
|
|
65
|
-
|
66
|
-
|
64
|
+
if content.permitted_type?
|
65
|
+
## update statistics
|
67
66
|
|
68
|
-
|
69
|
-
|
70
|
-
end
|
71
|
-
else
|
72
|
-
decrement_queue_counter
|
67
|
+
@stats.update_statistics(@content)
|
68
|
+
return true
|
73
69
|
end
|
74
70
|
else
|
75
71
|
decrement_queue_counter
|
76
72
|
end
|
77
73
|
else
|
78
|
-
debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
|
79
|
-
debug_ap @redis.smembers("currently_running")
|
80
74
|
decrement_queue_counter
|
81
75
|
end
|
82
|
-
|
76
|
+
else
|
77
|
+
debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
|
78
|
+
debug_ap @redis.smembers("currently_running")
|
79
|
+
decrement_queue_counter
|
83
80
|
end
|
81
|
+
false
|
84
82
|
end
|
85
83
|
|
86
84
|
def process_links &block
|
@@ -156,7 +154,7 @@ module CobwebModule
|
|
156
154
|
increment_process_counter
|
157
155
|
end
|
158
156
|
@redis.sadd "enqueued", @options[:url]
|
159
|
-
|
157
|
+
|
160
158
|
yield if block_given?
|
161
159
|
@redis.incr("crawl_job_enqueued_count")
|
162
160
|
end
|
@@ -181,7 +179,7 @@ module CobwebModule
|
|
181
179
|
end
|
182
180
|
|
183
181
|
def finished
|
184
|
-
set_first_to_finish
|
182
|
+
set_first_to_finish
|
185
183
|
debug_ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}"
|
186
184
|
@stats.end_crawl(@options)
|
187
185
|
end
|
@@ -204,7 +202,7 @@ module CobwebModule
|
|
204
202
|
def first_to_finish?
|
205
203
|
@first_to_finish
|
206
204
|
end
|
207
|
-
|
205
|
+
|
208
206
|
def crawled_base_url
|
209
207
|
@redis.get("crawled_base_url")
|
210
208
|
end
|
@@ -228,6 +226,7 @@ module CobwebModule
|
|
228
226
|
end
|
229
227
|
|
230
228
|
debug_puts "RECEIVED LOCK [#{key}]"
|
229
|
+
@redis.expire("#{key}_lock", 10)
|
231
230
|
begin
|
232
231
|
result = yield
|
233
232
|
ensure
|
@@ -236,15 +235,15 @@ module CobwebModule
|
|
236
235
|
end
|
237
236
|
result
|
238
237
|
end
|
239
|
-
|
238
|
+
|
240
239
|
def debug_ap(value)
|
241
240
|
ap(value) if @options[:debug]
|
242
241
|
end
|
243
|
-
|
242
|
+
|
244
243
|
def debug_puts(value)
|
245
244
|
puts(value) if @options[:debug]
|
246
245
|
end
|
247
|
-
|
246
|
+
|
248
247
|
private
|
249
248
|
def setup_defaults
|
250
249
|
@options[:redis_options] = {} unless @options.has_key? :redis_options
|
data/lib/crawl_job.rb
CHANGED
@@ -29,30 +29,28 @@ class CrawlJob
|
|
29
29
|
|
30
30
|
end
|
31
31
|
|
32
|
-
@crawl.
|
33
|
-
|
34
|
-
|
35
|
-
@crawl.process do
|
32
|
+
if @crawl.to_be_processed?
|
33
|
+
|
34
|
+
@crawl.process do
|
36
35
|
|
37
|
-
|
38
|
-
|
39
|
-
|
36
|
+
# enqueue to processing queue
|
37
|
+
@crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
|
38
|
+
send_to_processing_queue(@crawl.content.to_hash, content_request)
|
40
39
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
end
|
47
|
-
|
40
|
+
#if the enqueue counter has been requested update that
|
41
|
+
if content_request.has_key?(:enqueue_counter_key)
|
42
|
+
enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
|
43
|
+
current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
|
44
|
+
enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
|
48
45
|
end
|
49
|
-
|
50
|
-
@crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
|
51
|
-
@crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
|
52
|
-
@crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
|
46
|
+
|
53
47
|
end
|
54
|
-
|
48
|
+
else
|
49
|
+
@crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
|
50
|
+
@crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
|
51
|
+
@crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
|
55
52
|
end
|
53
|
+
|
56
54
|
end
|
57
55
|
end
|
58
56
|
|
@@ -262,16 +262,10 @@ def running?(crawl_id)
|
|
262
262
|
else
|
263
263
|
if status == @last_stat
|
264
264
|
if @counter > 20
|
265
|
-
puts ""
|
266
265
|
raise "Static status: #{status}"
|
267
266
|
else
|
268
267
|
@counter += 1
|
269
268
|
end
|
270
|
-
if @counter == 1
|
271
|
-
print "Static Status.. #{21-@counter}"
|
272
|
-
else
|
273
|
-
print ".#{21-@counter}"
|
274
|
-
end
|
275
269
|
else
|
276
270
|
result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
|
277
271
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.76
|
4
|
+
version: 0.0.77
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-10-
|
12
|
+
date: 2012-10-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70355025183680 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70355025183680
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70355025182000 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70355025182000
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70355025180660 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70355025180660
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70355025179740 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70355025179740
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70355025179160 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70355025179160
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70355025178560 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70355025178560
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70355025177760 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70355025177760
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70355025177180 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70355025177180
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70355025176740 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70355025176740
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70355025175980 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,10 +120,10 @@ dependencies:
|
|
120
120
|
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70355025175980
|
124
124
|
- !ruby/object:Gem::Dependency
|
125
125
|
name: json
|
126
|
-
requirement: &
|
126
|
+
requirement: &70355025175280 !ruby/object:Gem::Requirement
|
127
127
|
none: false
|
128
128
|
requirements:
|
129
129
|
- - ! '>='
|
@@ -131,9 +131,9 @@ dependencies:
|
|
131
131
|
version: '0'
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
|
-
version_requirements: *
|
134
|
+
version_requirements: *70355025175280
|
135
135
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
136
|
-
crawl extremely large sites which is much more
|
136
|
+
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
137
137
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
138
138
|
to monitor the progress of the crawls.
|
139
139
|
email: stewart@rockwellcottage.com
|