massive 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +22 -0
  3. data/.rspec +3 -0
  4. data/.rvmrc +1 -0
  5. data/.travis.yml +7 -0
  6. data/Gemfile +19 -0
  7. data/Gemfile.lock +141 -0
  8. data/Guardfile +9 -0
  9. data/LICENSE.txt +22 -0
  10. data/README.md +196 -0
  11. data/Rakefile +8 -0
  12. data/lib/massive.rb +63 -0
  13. data/lib/massive/cancelling.rb +20 -0
  14. data/lib/massive/file.rb +80 -0
  15. data/lib/massive/file_job.rb +9 -0
  16. data/lib/massive/file_process.rb +7 -0
  17. data/lib/massive/file_step.rb +7 -0
  18. data/lib/massive/job.rb +115 -0
  19. data/lib/massive/locking.rb +27 -0
  20. data/lib/massive/memory_consumption.rb +15 -0
  21. data/lib/massive/notifications.rb +40 -0
  22. data/lib/massive/notifiers.rb +6 -0
  23. data/lib/massive/notifiers/base.rb +32 -0
  24. data/lib/massive/notifiers/pusher.rb +17 -0
  25. data/lib/massive/process.rb +69 -0
  26. data/lib/massive/process_serializer.rb +12 -0
  27. data/lib/massive/retry.rb +49 -0
  28. data/lib/massive/status.rb +59 -0
  29. data/lib/massive/step.rb +143 -0
  30. data/lib/massive/step_serializer.rb +12 -0
  31. data/lib/massive/timing_support.rb +10 -0
  32. data/lib/massive/version.rb +3 -0
  33. data/massive.gemspec +23 -0
  34. data/spec/fixtures/custom_job.rb +4 -0
  35. data/spec/fixtures/custom_step.rb +19 -0
  36. data/spec/models/massive/cancelling_spec.rb +83 -0
  37. data/spec/models/massive/file_job_spec.rb +24 -0
  38. data/spec/models/massive/file_spec.rb +209 -0
  39. data/spec/models/massive/file_step_spec.rb +22 -0
  40. data/spec/models/massive/job_spec.rb +319 -0
  41. data/spec/models/massive/locking_spec.rb +52 -0
  42. data/spec/models/massive/memory_consumption_spec.rb +24 -0
  43. data/spec/models/massive/notifications_spec.rb +107 -0
  44. data/spec/models/massive/notifiers/base_spec.rb +48 -0
  45. data/spec/models/massive/notifiers/pusher_spec.rb +49 -0
  46. data/spec/models/massive/process_serializer_spec.rb +38 -0
  47. data/spec/models/massive/process_spec.rb +235 -0
  48. data/spec/models/massive/status_spec.rb +104 -0
  49. data/spec/models/massive/step_serializer_spec.rb +40 -0
  50. data/spec/models/massive/step_spec.rb +490 -0
  51. data/spec/models/massive/timing_support_spec.rb +55 -0
  52. data/spec/shared/step_context.rb +25 -0
  53. data/spec/spec_helper.rb +42 -0
  54. data/spec/support/mongoid.yml +78 -0
  55. metadata +175 -0
@@ -0,0 +1,17 @@
module Massive
  module Notifiers
    # Notifier that delivers messages by calling +trigger+ on a client object,
    # which defaults to the global ::Pusher constant.
    #
    # NOTE(review): +id+ and +options+ are not defined in this class; presumably
    # they are provided by Base -- confirm there.
    class Pusher < Base
      protected

      # Sends +message+ through the client, keyed by this notifier's +id+.
      #
      # message - the event name handed to the client.
      # data    - the payload; ignored when a block is given.
      # block   - optional; when present its return value is used as payload.
      def send_notification(message, data, &block)
        payload = block_given? ? block.call : data

        client.trigger(id, message, payload)
      end

      # The delivery client: options[:client] when set, ::Pusher otherwise.
      # The choice is memoized in @client.
      def client
        @client ||= (options[:client] || ::Pusher)
      end
    end
  end
end
@@ -0,0 +1,69 @@
module Massive
  # Mongoid document that embeds a list of Massive::Step documents and
  # exposes aggregate progress, completion and cancellation state for them.
  class Process
    include Mongoid::Document
    include Mongoid::Timestamps

    field :cancelled_at, type: Time

    embeds_many :steps, class_name: 'Massive::Step'

    class << self
      # Finds the process with +process_id+ and, within its embedded steps,
      # the step with +step_id+.
      def find_step(process_id, step_id)
        find(process_id).steps.find(step_id)
      end

      # Finds the job with +job_id+ among the jobs of the step located by
      # .find_step.
      def find_job(process_id, step_id, job_id)
        find_step(process_id, step_id).jobs.find(job_id)
      end
    end

    # Enqueues the step returned by #next_step; does nothing (returns nil)
    # when there is no such step.
    def enqueue_next
      upcoming = next_step
      upcoming.try(:enqueue)
    end

    # The first step that is neither completed nor started, or nil when that
    # step reports itself as already enqueued (or when no such step exists).
    def next_step
      candidate = steps.not_completed.not_started.first
      return nil if candidate.try(:enqueued?)

      candidate
    end

    # Weighted progress of the process: the sum of each step's
    # processed_percentage multiplied by its weight, divided by the total
    # weight. Returns 0 when the total weight is not positive.
    def processed_percentage
      return 0 unless total_weight > 0

      total_steps_processed_percentage.to_f / total_weight
    end

    # True when no step remains in the +not_completed+ scope.
    def completed?
      steps.not_completed.none?
    end

    # True when +cancelled_at+ is set, or when the cancellation key exists
    # in Redis.
    def cancelled?
      cancelled_at? || redis.exists(cancelled_key)
    end

    # Marks the process as cancelled: sets +cancelled_at+, writes the
    # cancellation key to Redis with a one-day expiry, then saves.
    def cancel
      self.cancelled_at = Time.now
      redis.setex(cancelled_key, 1.day, true)
      save
    end

    # Serializer to use for this document; falls back to
    # Massive::ProcessSerializer when +super+ returns nothing.
    def active_model_serializer
      super || Massive::ProcessSerializer
    end

    protected

    # Redis connection configured on the Massive module.
    def redis
      Massive.redis
    end

    # Redis key flagging this process as cancelled, built from the
    # underscored class name and the document id.
    def cancelled_key
      prefix = self.class.name.underscore
      "#{prefix}:#{id}:cancelled"
    end

    private

    # Sum of the weights of all steps.
    def total_weight
      steps.map { |step| step.weight }.sum
    end

    # Sum over all steps of processed_percentage * weight.
    def total_steps_processed_percentage
      steps.inject(0) do |accumulated, step|
        accumulated + step.processed_percentage * step.weight
      end
    end
  end
end
@@ -0,0 +1,12 @@
module Massive
  # ActiveModel serializer for Massive::Process documents.
  class ProcessSerializer < ActiveModel::Serializer
    attributes :id,
               :created_at,
               :updated_at,
               :processed_percentage

    # Exposes the +completed?+ predicate under the +completed+ key.
    attribute :completed?, key: :completed

    has_many :steps

    # The document id rendered as a String.
    def id
      identifier = object.id
      identifier.to_s
    end
  end
end
@@ -0,0 +1,49 @@
module Massive
  # Concern adding bounded retry support. Including classes get class-level
  # +retry_interval+ / +maximum_retries+ settings (defaults: 2 and 10) that
  # are copied to subclasses, plus the #retrying instance helper.
  module Retry
    extend ActiveSupport::Concern

    included do
      retry_interval(2)
      maximum_retries(10)

      # Propagates the current retry settings to every subclass.
      def self.inherited(base)
        super

        base.retry_interval(retry_interval)
        base.maximum_retries(maximum_retries)
      end
    end

    # Runs +block+, re-running it after a StandardError until the number of
    # failures reaches the class's +maximum_retries+; the last error is then
    # re-raised. Sleeps +retry_interval+ between attempts. The +retries+
    # counter is reset to 0 on entry and incremented on every failure.
    #
    # Massive::Cancelled and SignalException are never retried.
    def retrying(&block)
      self.retries = 0

      begin
        block.call
      rescue Massive::Cancelled, SignalException
        # Cancellation and signals are not actual errors: let them through.
        raise
      rescue StandardError => error
        self.retries += 1
        raise error unless retries < self.class.maximum_retries

        Kernel.sleep(self.class.retry_interval)
        retry
      end
    end

    module ClassMethods
      # Reads the retry interval, or sets it first when +value+ is truthy.
      def retry_interval(value = nil)
        value ? (@retry_interval = value) : @retry_interval
      end

      # Reads the maximum number of retries, or sets it first when +value+
      # is truthy.
      def maximum_retries(value = nil)
        value ? (@maximum_retries = value) : @maximum_retries
      end
    end
  end
end
@@ -0,0 +1,59 @@
module Massive
  # Concern adding lifecycle timestamps, error bookkeeping fields, matching
  # scopes and status predicates to a Mongoid document.
  module Status
    extend ActiveSupport::Concern

    included do
      field :started_at,   type: Time
      field :finished_at,  type: Time
      field :failed_at,    type: Time
      field :cancelled_at, type: Time

      field :last_error, type: String
      field :retries,    type: Integer, default: 0

      scope :started,       ne(started_at: nil)
      scope :not_started,   where(started_at: nil)
      scope :completed,     ne(finished_at: nil)
      scope :not_completed, where(finished_at: nil)
      scope :failed,        ne(failed_at: nil)
      scope :cancelled,     ne(cancelled_at: nil)
    end

    # Persists the attributes returned by #attributes_to_reset, marking the
    # document as freshly started.
    def start!
      fresh_state = attributes_to_reset
      update_attributes(fresh_state)
    end

    # True when +started_at+ is set and the document has not failed.
    def started?
      return false if failed?

      started_at?
    end

    # True when +finished_at+ is set and the document has not failed.
    def completed?
      return false if failed?

      finished_at?
    end

    # True when +failed_at+ is set.
    def failed?
      failed_at?
    end

    # True when the item returned by Resque.peek for this class's queue has
    # this class's name and the arguments from #args_for_resque.
    #
    # NOTE(review): only what a default Resque.peek call returns is
    # inspected; presumably that is just the head of the queue -- confirm.
    def enqueued?
      head = Resque.peek(self.class.queue)
      return false unless head.present?

      head["class"] == self.class.name && head["args"] == args_for_resque
    end

    protected

    # Attribute values written by #start!: +started_at+ becomes now, the
    # other timestamps and +last_error+ are cleared, +retries+ goes to 0.
    def attributes_to_reset
      {
        started_at: Time.now, finished_at: nil, failed_at: nil, cancelled_at: nil,
        retries: 0, last_error: nil
      }
    end

    # Arguments this document is enqueued with; nil here, meant to be
    # overridden by including classes.
    def args_for_resque
      nil
    end
  end
end
@@ -0,0 +1,143 @@
module Massive
  # A step of a Massive::Process. Running a step builds one embedded job per
  # slice of +total_count+ items; the step completes once all of its jobs
  # report completion.
  class Step
    include Mongoid::Document
    include Mongoid::Timestamps

    include Massive::Status
    include Massive::MemoryConsumption
    include Massive::TimingSupport
    include Massive::Locking
    include Massive::Notifications

    embedded_in :process, class_name: 'Massive::Process'
    embeds_many :jobs,    class_name: 'Massive::Job'

    field :total_count,  type: Integer
    field :weight,       type: Integer, default: 1
    field :job_class,    type: String,  default: -> { self.class.job_class }
    field :execute_next, type: Boolean, default: false

    define_model_callbacks :work, :complete

    # Resque entry point: loads the step through its process and runs #work.
    def self.perform(process_id, step_id)
      step = Massive::Process.find_step(process_id, step_id)
      step.work
    end

    # Name of the Resque queue for steps.
    def self.queue
      :massive_step
    end

    # Defines #calculate_total_count on this class from the given block.
    def self.calculates_total_count_with(&block)
      define_method(:calculate_total_count, &block)
    end

    # Reads the limit ratio (a hash of total-count threshold => job limit),
    # or sets it first when +value+ is truthy.
    def self.limit_ratio(value = nil)
      value ? (@limit_ratio = value) : @limit_ratio
    end

    # Reads the job class name, or sets it first when +value+ is truthy.
    def self.job_class(value = nil)
      value ? (@job_class = value) : @job_class
    end

    # Copies the current job class and limit ratio to every subclass.
    def self.inherited(child)
      super

      child.job_class(job_class)
      child.limit_ratio(limit_ratio)
    end

    limit_ratio(3000 => 1000, 0 => 100)
    job_class('Massive::Job')

    # Enqueues this step on Resque with the process id and step id as
    # strings.
    def enqueue
      Resque.enqueue(self.class, process.id.to_s, id.to_s)
    end

    # Resets the status attributes (see Massive::Status#start!) and then
    # sends the +:start+ notification.
    def start!
      super
      notify(:start)
    end

    # Starts the step, runs #process_step inside the +:work+ callbacks and
    # then attempts to complete the step.
    def work
      start!
      run_callbacks(:work) { process_step }
      complete
    end

    # Replaces the embedded jobs with #number_of_jobs new instances of the
    # configured job class, each built from #job_params.
    def process_step
      self.jobs = (0...number_of_jobs).map do |index|
        job_class.constantize.new(job_params(index))
      end
    end

    # When every job is completed and the +:complete+ lock is not held,
    # records the finish time and memory consumption inside the +:complete+
    # callbacks, sends the +:complete+ notification and, if +execute_next+
    # is set, asks the process to enqueue its next step.
    def complete
      return unless completed_all_jobs? && !locked?(:complete)

      run_callbacks(:complete) do
        update_attributes finished_at: Time.now, failed_at: nil, memory_consumption: current_memory_consumption
        notify(:complete)
      end

      process.enqueue_next if execute_next?
    end

    # True when every job reports completion. Reloads a persisted step first.
    def completed_all_jobs?
      reload if persisted?

      jobs.all? { |job| job.completed? }
    end

    # Sum of the +processed+ values of all jobs.
    def processed
      jobs.map { |job| job.processed }.sum
    end

    # Ratio of processed items to +total_count+; 0 when +total_count+ is
    # nil or not positive.
    def processed_percentage
      return 0 unless total_count && total_count > 0

      processed.to_f / total_count
    end

    # Sum of the +elapsed_time+ values of all jobs.
    def processing_time
      jobs.map { |job| job.elapsed_time }.sum
    end

    # Job size for this step: the value of the first limit-ratio entry whose
    # threshold is not above +total_count+. Memoized in @limit.
    def limit
      @limit ||= self.class.limit_ratio.find { |threshold, _size| total_count >= threshold }.last
    end

    # Default total count; replaced via .calculates_total_count_with.
    def calculate_total_count
      0
    end

    # Serializer to use for this document; falls back to
    # Massive::StepSerializer when +super+ returns nothing.
    def active_model_serializer
      super || Massive::StepSerializer
    end

    protected

    # Attributes for the job at position +index+: its offset, its limit and
    # a reference back to this step.
    def job_params(index)
      { offset: index * limit, limit: limit, step: self }
    end

    # Number of jobs needed to cover +total_count+ items, rounded up.
    def number_of_jobs
      slices = total_count.to_f / limit
      slices.ceil
    end

    # Adds +total_count+ to the attributes reset on start, computing it with
    # #calculate_total_count when it is not already set.
    def attributes_to_reset
      super.merge(total_count: total_count || calculate_total_count)
    end

    # Arguments this step is enqueued with: process id and step id strings.
    def args_for_resque
      [process.id.to_s, id.to_s]
    end
  end
end
@@ -0,0 +1,12 @@
module Massive
  # ActiveModel serializer for Massive::Step documents.
  class StepSerializer < ActiveModel::Serializer
    attributes :id,
               :created_at,
               :updated_at,
               :started_at,
               :finished_at,
               :failed_at,
               :last_error,
               :retries,
               :memory_consumption,
               :total_count,
               :processed,
               :processed_percentage,
               :processing_time,
               :elapsed_time,
               :notifier_id

    # The document id rendered as a String.
    def id
      identifier = object.id
      identifier.to_s
    end
  end
end
@@ -0,0 +1,10 @@
module Massive
  # Mixin computing elapsed time from +started_at+ / +finished_at+. The
  # including object must also respond to +started_at?+.
  module TimingSupport
    # Seconds between +started_at+ and +finished_at+, using the current time
    # while +finished_at+ is not set. Returns 0 when +started_at?+ is false.
    def elapsed_time
      return 0 unless started_at?

      beginning = started_at || 0
      ending = finished_at || Time.now

      ending - beginning
    end
  end
end
@@ -0,0 +1,3 @@
module Massive
  # Gem version string. Frozen so the shared constant cannot be mutated in
  # place by code that reads it.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,23 @@
# -*- encoding: utf-8 -*-
# Put the gem's lib directory on the load path so the version constant can be
# required without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'massive/version'

# Gem specification for massive.
Gem::Specification.new do |gem|
  gem.name          = "massive"
  gem.version       = Massive::VERSION
  gem.authors       = ["Vicente Mundim"]
  gem.email         = ["vicente.mundim@gmail.com"]
  gem.description   = %q{Parallelize processing of large files and/or data using Resque, Redis and MongoDB}
  gem.summary       = %q{Parallelize processing of large files and/or data using Resque, Redis and MongoDB}

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "resque"
  # "~> 3.1.x" is parsed as a prerelease requirement ("x" is a prerelease
  # segment), which RubyGems warns about. "~> 3.1" keeps the same range of
  # released versions (>= 3.1, < 4) without the prerelease marker.
  gem.add_dependency "mongoid", "~> 3.1"
  gem.add_dependency "file_processor", "0.2.0"
  gem.add_dependency "active_model_serializers"
end
@@ -0,0 +1,4 @@
# Spec fixture: a job class overriding both retry settings.
class CustomJob < Massive::Job
  retry_interval(5)
  maximum_retries(20)
end
@@ -0,0 +1,19 @@
# Spec fixture: a step with a fixed total count of 100, a custom job class,
# a three-tier limit ratio and job params carrying an extra custom value.
class CustomStep < Massive::Step
  calculates_total_count_with { 100 }
  job_class('CustomJob')

  limit_ratio(3000 => 1500, 2000 => 1000, 0 => 100)

  protected

  # Attributes for the job at position +index+; unlike the parent class
  # version, no +step+ reference is included.
  def job_params(index)
    { offset: index * limit, limit: limit, custom_param: "some_param" }
  end
end
17
+
# Spec fixture: a step subclass that overrides nothing.
class InheritedStep < Massive::Step; end
@@ -0,0 +1,83 @@
1
+ require "spec_helper"
2
+
# Spec helper: a minimal object including Massive::Cancelling. It runs a
# fixed number of iterations, each wrapped in +cancelling+, and records how
# many finished and whether a Massive::Cancelled was raised.
class Cancellable
  include Massive::Cancelling

  attr_accessor :cancelled, :work_count, :work_done_count, :cancelled_exception

  # work_count - number of iterations #work will attempt.
  def initialize(work_count)
    @work_count = work_count
  end

  # True only when +cancelled+ has been set to exactly +true+.
  def cancelled?
    cancelled == true
  end

  # Calls +block+ with (self, iteration) once per iteration, counting each
  # finished iteration in +work_done_count+. A Massive::Cancelled raised
  # along the way stops the loop and is stored in +cancelled_exception+.
  def work(&block)
    begin
      self.work_done_count = 0

      work_count.times do |iteration|
        cancelling do
          block.call(self, iteration)
          self.work_done_count += 1
        end
      end
    rescue Massive::Cancelled => error
      self.cancelled_exception = error
    end
  end
end
29
+
# Exercises Massive::Cancelling through the Cancellable helper: the
# cancellation flag is flipped at different points of a three-iteration run
# and the specs check how many iterations finished and whether a cancelled
# exception was captured.
describe Massive::Cancelling do
  let(:work_count) { 3 }
  subject(:cancellable) { Cancellable.new(work_count) }

  context "when it is never cancelled" do
    it "does not cancel the work" do
      cancellable.work { |_worker| }
      cancellable.work_done_count.should eq(cancellable.work_count)
    end

    it "does not raise a cancelled exception" do
      cancellable.work { |_worker| }
      cancellable.cancelled_exception.should be_nil
    end
  end

  context "when it is cancelled before actually performing any work" do
    before { cancellable.cancelled = true }

    it "cancels the work before the first iteration" do
      cancellable.work { |_worker| }
      cancellable.work_done_count.should eq(0)
    end

    it "raises a cancelled exception" do
      cancellable.work { |_worker| }
      cancellable.cancelled_exception.should be_present
    end
  end

  context "when it is cancelled while performing some work" do
    # The flag is set during the second-to-last iteration, so that iteration
    # still finishes and the following one is the first to be refused.
    it "cancels the work before performing the iteration" do
      cancellable.work { |worker, iteration| worker.cancelled = (iteration == work_count - 2) }
      cancellable.work_done_count.should eq(2)
    end

    it "raises a cancelled exception" do
      cancellable.work { |worker, iteration| worker.cancelled = (iteration == work_count - 2) }
      cancellable.cancelled_exception.should be_present
    end
  end

  context "when it is cancelled while performing the last iteration" do
    # No iteration follows the last one, so nothing is left to refuse.
    it "performs all the work" do
      cancellable.work { |worker, iteration| worker.cancelled = (iteration == work_count - 1) }
      cancellable.work_done_count.should eq(work_count)
    end

    it "does not raise a cancelled exception" do
      cancellable.work { |worker, iteration| worker.cancelled = (iteration == work_count - 1) }
      cancellable.cancelled_exception.should be_nil
    end
  end
end