cobweb 0.0.71 → 0.0.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.71
2
+ h1. Cobweb v0.0.72
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -10,28 +10,41 @@ class CobwebCrawlHelper
10
10
 
11
11
  def initialize(data)
12
12
  @data = data
13
+
14
+ # TAKING A LONG TIME TO RUN ON PRODUCTION BOX
13
15
  @stats = Stats.new(data)
14
16
  end
15
17
 
16
- def destroy
17
- queue_name = "cobweb_crawl_job"
18
+ def destroy(options)
19
+
20
+ options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
21
+ options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
22
+
18
23
  # set status as cancelled now so that we don't enqueue any further pages
19
24
  self.statistics.end_crawl(@data, true)
25
+ puts "end_crawl: #{self.statistics.get_status}"
26
+ if options[:finished_resque_queue]
27
+ puts "enqueueing finished job..."
28
+
29
+ additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
30
+ additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
31
+ additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
32
+
33
+ Resque.enqueue(options[:finished_resque_queue], @stats.get_statistics.merge(additional_stats))
34
+ end
20
35
 
21
- job_items = Resque.peek(queue_name, 0, BATCH_SIZE)
22
- batch_count = 0
36
+ position = 0
37
+ job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
23
38
  until job_items.empty?
24
-
39
+ puts "Batch: #{position} : #{job_items.count}"
25
40
  job_items.each do |item|
26
41
  if item["args"][0]["crawl_id"] == id
27
- # remote this job from the queue
42
+ # remove this job from the queue
28
43
  Resque.dequeue(CrawlJob, item["args"][0])
29
44
  end
30
45
  end
31
46
 
32
- position = batch_count*BATCH_SIZE
33
- batch_count += 1
34
- job_items = Resque.peek(queue_name, position, BATCH_SIZE)
47
+ job_items = Resque.peek(options[:queue_name], position+=BATCH_SIZE, BATCH_SIZE)
35
48
  end
36
49
 
37
50
  end
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.71"
6
+ "0.0.72"
7
7
  end
8
8
 
9
9
  end
@@ -1,6 +1,9 @@
1
1
  # Stats class is the main statistics hub for monitoring crawls. Statistics can either be viewed through the Sinatra interface, or returned from the CobwebCrawler.crawl method or block
2
2
  class Stats
3
3
  require 'json'
4
+
5
+ attr_reader :redis
6
+
4
7
  # Sets up redis usage for statistics
5
8
  def initialize(options)
6
9
  options[:redis_options] = {} unless options.has_key? :redis_options
@@ -131,7 +134,7 @@ class Stats
131
134
  @statistics[:status_counts] = status_counts.to_json
132
135
 
133
136
  ## time based statistics
134
- increment_time_stat("minute_totals", "minute", 60)
137
+ increment_time_stat("minute_totals", "minute", 60)
135
138
 
136
139
  redis_command = "@redis.hmset 'statistics', #{@statistics.keys.map{|key| "'#{key}', '#{@statistics[key].to_s.gsub("'","''")}'"}.join(", ")}"
137
140
  instance_eval redis_command
@@ -158,7 +161,7 @@ class Stats
158
161
 
159
162
  # Sets the current status of the crawl
160
163
  def update_status(status)
161
- #@redis.hset("statistics", "current_status", status) unless status == CobwebCrawlHelper::CANCELLED
164
+ @redis.hset("statistics", "current_status", status) unless get_status == CobwebCrawlHelper::CANCELLED
162
165
  end
163
166
 
164
167
  # Returns the current status of the crawl
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.71
4
+ version: 0.0.72
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-16 00:00:00.000000000 Z
12
+ date: 2012-09-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70207650005840 !ruby/object:Gem::Requirement
16
+ requirement: &70308627743380 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70207650005840
24
+ version_requirements: *70308627743380
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70207650005340 !ruby/object:Gem::Requirement
27
+ requirement: &70308627741840 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70207650005340
35
+ version_requirements: *70308627741840
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70207650004420 !ruby/object:Gem::Requirement
38
+ requirement: &70308627741060 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70207650004420
46
+ version_requirements: *70308627741060
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70207650003640 !ruby/object:Gem::Requirement
49
+ requirement: &70308627740200 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70207650003640
57
+ version_requirements: *70308627740200
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70207650002440 !ruby/object:Gem::Requirement
60
+ requirement: &70308627739500 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70207650002440
68
+ version_requirements: *70308627739500
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70207650001860 !ruby/object:Gem::Requirement
71
+ requirement: &70308627739020 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70207650001860
79
+ version_requirements: *70308627739020
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70207650001420 !ruby/object:Gem::Requirement
82
+ requirement: &70308627738100 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70207650001420
90
+ version_requirements: *70308627738100
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70207650000420 !ruby/object:Gem::Requirement
93
+ requirement: &70308627737580 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70207650000420
101
+ version_requirements: *70308627737580
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70207649999520 !ruby/object:Gem::Requirement
104
+ requirement: &70308627737040 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70207649999520
112
+ version_requirements: *70308627737040
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70207649998760 !ruby/object:Gem::Requirement
115
+ requirement: &70308627736400 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70207649998760
123
+ version_requirements: *70308627736400
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70207649998280 !ruby/object:Gem::Requirement
126
+ requirement: &70308627735860 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70207649998280
134
+ version_requirements: *70308627735860
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface