cobweb 0.0.71 → 0.0.72

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.71
2
+ h1. Cobweb v0.0.72
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -10,28 +10,41 @@ class CobwebCrawlHelper
10
10
 
11
11
  def initialize(data)
12
12
  @data = data
13
+
14
+ # TAKING A LONG TIME TO RUN ON PRODUCTION BOX
13
15
  @stats = Stats.new(data)
14
16
  end
15
17
 
16
- def destroy
17
- queue_name = "cobweb_crawl_job"
18
+ def destroy(options)
19
+
20
+ options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
21
+ options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
22
+
18
23
  # set status as cancelled now so that we don't enqueue any further pages
19
24
  self.statistics.end_crawl(@data, true)
25
+ puts "end_crawl: #{self.statistics.get_status}"
26
+ if options[:finished_resque_queue]
27
+ puts "enqueueing finished job..."
28
+
29
+ additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
30
+ additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
31
+ additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
32
+
33
+ Resque.enqueue(options[:finished_resque_queue], @stats.get_statistics.merge(additional_stats))
34
+ end
20
35
 
21
- job_items = Resque.peek(queue_name, 0, BATCH_SIZE)
22
- batch_count = 0
36
+ position = 0
37
+ job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
23
38
  until job_items.empty?
24
-
39
+ puts "Batch: #{position} : #{job_items.count}"
25
40
  job_items.each do |item|
26
41
  if item["args"][0]["crawl_id"] == id
27
- # remote this job from the queue
42
+ # remove this job from the queue
28
43
  Resque.dequeue(CrawlJob, item["args"][0])
29
44
  end
30
45
  end
31
46
 
32
- position = batch_count*BATCH_SIZE
33
- batch_count += 1
34
- job_items = Resque.peek(queue_name, position, BATCH_SIZE)
47
+ job_items = Resque.peek(options[:queue_name], position+=BATCH_SIZE, BATCH_SIZE)
35
48
  end
36
49
 
37
50
  end
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.71"
6
+ "0.0.72"
7
7
  end
8
8
 
9
9
  end
@@ -1,6 +1,9 @@
1
1
  # Stats class is the main statistics hub for monitoring crawls. Either can be viewed through the Sinatra interface, or returned from the CobwebCrawler.crawl method or block
2
2
  class Stats
3
3
  require 'json'
4
+
5
+ attr_reader :redis
6
+
4
7
  # Sets up redis usage for statistics
5
8
  def initialize(options)
6
9
  options[:redis_options] = {} unless options.has_key? :redis_options
@@ -131,7 +134,7 @@ class Stats
131
134
  @statistics[:status_counts] = status_counts.to_json
132
135
 
133
136
  ## time based statistics
134
- increment_time_stat("minute_totals", "minute", 60)
137
+ increment_time_stat("minute_totals", "minute", 60)
135
138
 
136
139
  redis_command = "@redis.hmset 'statistics', #{@statistics.keys.map{|key| "'#{key}', '#{@statistics[key].to_s.gsub("'","''")}'"}.join(", ")}"
137
140
  instance_eval redis_command
@@ -158,7 +161,7 @@ class Stats
158
161
 
159
162
  # Sets the current status of the crawl
160
163
  def update_status(status)
161
- #@redis.hset("statistics", "current_status", status) unless status == CobwebCrawlHelper::CANCELLED
164
+ @redis.hset("statistics", "current_status", status) unless get_status == CobwebCrawlHelper::CANCELLED
162
165
  end
163
166
 
164
167
  # Returns the current status of the crawl
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.71
4
+ version: 0.0.72
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-16 00:00:00.000000000 Z
12
+ date: 2012-09-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70207650005840 !ruby/object:Gem::Requirement
16
+ requirement: &70308627743380 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70207650005840
24
+ version_requirements: *70308627743380
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70207650005340 !ruby/object:Gem::Requirement
27
+ requirement: &70308627741840 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70207650005340
35
+ version_requirements: *70308627741840
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70207650004420 !ruby/object:Gem::Requirement
38
+ requirement: &70308627741060 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70207650004420
46
+ version_requirements: *70308627741060
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70207650003640 !ruby/object:Gem::Requirement
49
+ requirement: &70308627740200 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70207650003640
57
+ version_requirements: *70308627740200
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70207650002440 !ruby/object:Gem::Requirement
60
+ requirement: &70308627739500 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70207650002440
68
+ version_requirements: *70308627739500
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70207650001860 !ruby/object:Gem::Requirement
71
+ requirement: &70308627739020 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70207650001860
79
+ version_requirements: *70308627739020
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70207650001420 !ruby/object:Gem::Requirement
82
+ requirement: &70308627738100 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70207650001420
90
+ version_requirements: *70308627738100
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70207650000420 !ruby/object:Gem::Requirement
93
+ requirement: &70308627737580 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70207650000420
101
+ version_requirements: *70308627737580
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70207649999520 !ruby/object:Gem::Requirement
104
+ requirement: &70308627737040 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70207649999520
112
+ version_requirements: *70308627737040
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70207649998760 !ruby/object:Gem::Requirement
115
+ requirement: &70308627736400 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70207649998760
123
+ version_requirements: *70308627736400
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70207649998280 !ruby/object:Gem::Requirement
126
+ requirement: &70308627735860 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70207649998280
134
+ version_requirements: *70308627735860
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface