batches_task_processor 0.1.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +32 -35
- data/lib/batches_task_processor/model.rb +53 -0
- data/lib/batches_task_processor/model_item.rb +8 -0
- data/lib/batches_task_processor/processor.rb +42 -88
- data/lib/batches_task_processor/processor_job.rb +11 -0
- data/lib/batches_task_processor/railtie.rb +4 -0
- data/lib/batches_task_processor/version.rb +1 -1
- data/lib/batches_task_processor.rb +5 -12
- data/lib/db/migrate/20220727101904_add_batches_task_processor.rb +25 -0
- data/lib/tasks/batches_task_processor_tasks.rake +2 -22
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b83bdfebcd837e7ee632a01502d5796ad29ad096e63486cd4d9513dd5882908
|
4
|
+
data.tar.gz: a8e2fbfafe6c76ca0ddab0810b24797dbc5ce3b6eceb84f15c2b587078a8d558
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a2aca1fb9e37d1c3c7e7af5e51cc593bcbbf1d83f37654bb81f9ef0cc45740a5d33467f3ead66a20f93fedf25f5ac1b81568a49019673ee0f47bf61bf3fd2629
|
7
|
+
data.tar.gz: 43d671b910781c69ac56bee5c363213c7590c162e5f17cf48882e31ea5dcc20ae7311186b30c5d9b9fdbc037a3e342097543b46d084112477c794057ccc89612
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# BatchesTaskProcessor
|
2
|
-
Gem that allows to process huge amount of tasks in parallel using batches
|
3
|
-
|
2
|
+
Ruby gem that processes a huge number of tasks of any kind in parallel using batches, with the ability to cancel at any time.
|
3
|
+
The jobs created can be processed in background or in the foreground (inline) with the ability to rerun/retry later (excludes the already processed ones).
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
Add this line to your application's Gemfile:
|
@@ -8,45 +8,42 @@ Add this line to your application's Gemfile:
|
|
8
8
|
```ruby
|
9
9
|
gem "batches_task_processor"
|
10
10
|
```
|
11
|
-
And then execute: `bundle install`
|
11
|
+
And then execute: `bundle install && bundle exec rake db:migrate`
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
Sample Array:
|
13
|
+
## Usage
|
14
|
+
- Register a new task:
|
15
|
+
The following will process 200k items using 10 jobs running in parallel, each one in charge of 20k items (`preload_job_items` is recommended for performance reasons):
|
17
16
|
```ruby
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
17
|
+
task = BatchesTaskProcessor::Model.create!(
|
18
|
+
key: 'my_process',
|
19
|
+
data: Article.all.limit(200000).pluck(:id),
|
20
|
+
qty_jobs: 10,
|
21
|
+
preload_job_items: 'Article.where(id: items)',
|
22
|
+
process_item: 'puts "my article ID: #{item.id}"'
|
23
|
+
)
|
24
|
+
task.start!
|
25
25
|
```
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
26
|
+

|
27
|
+
|
28
|
+
## Task api
|
29
|
+
- `task.start!` starts the task (initializes the jobs)
|
30
|
+
- `task.cancel` cancels the task at any time and stops processing the items
|
31
|
+
- `task.export` exports the items that were processed in a csv file
|
32
|
+
- `task.status` prints the current status of the task
|
33
|
+
- `task.items` returns the items that were processed so far
|
34
|
+
Each item includes the following attributes: `# { key: 'value from items', result: "value returned from the process_item callback", error_details: "error message from the process_item callback if failed" }`
|
35
|
+
|
36
|
+
## TODO
|
37
|
+
- update tests
|
35
38
|
|
36
39
|
## Api
|
37
40
|
Settings:
|
38
|
-
- `
|
39
|
-
- `
|
40
|
-
- `
|
41
|
-
- `
|
42
|
-
|
43
|
-
- `
|
44
|
-
- `rake batches_task_processor:process_job` (Only for internal usage).
|
45
|
-
- `rake batches_task_processor:retry` Retries the processing of all jobs (ignores already processed).
|
46
|
-
- `rake batches_task_processor:status` Prints the process status.
|
47
|
-
- `rake batches_task_processor:cancel` Marks as cancelled the process and stops processing jobs.
|
48
|
-
- `rake batches_task_processor:clear` Removes all process logs or tmp data.
|
49
|
-
|
41
|
+
- `data` (Array<Integer|String>) Array of whole items to be processed.
|
42
|
+
- `key` (Mandatory) key to be used to identify the task.
|
43
|
+
- `queue_name` (String, default `default`) name of the background queue to be used (If `nil`, will run the process inline).
|
44
|
+
- `qty_jobs` (Optional) number of jobs to be created (all `data` items will be distributed across this qty of jobs). Default: `10`
|
45
|
+
- `process_item` (Mandatory) callback called to process each item, where the `item` variable holds the current item value. Sample: `'Article.find(item).update_column(:title, "changed")'`
|
46
|
+
- `preload_job_items` (Optional) callback that allows to preload items list and/or associations where `items` variable holds the current chunk of items to be processed (by default returns the same list). Sample: `Article.where(id: items)`
|
50
47
|
|
51
48
|
## Contributing
|
52
49
|
Contribution directions go here.
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
module BatchesTaskProcessor
|
5
|
+
class Model < ActiveRecord::Base
|
6
|
+
self.table_name = 'batches_task_processors'
|
7
|
+
has_many :items, class_name: 'BatchesTaskProcessor::ModelItem', dependent: :destroy, foreign_key: :batches_task_processors_id
|
8
|
+
validates :process_item, presence: true
|
9
|
+
validates :key, presence: true
|
10
|
+
before_create :apply_data_uniqueness
|
11
|
+
# state: :pending, :processing, :finished, :canceled
|
12
|
+
|
13
|
+
def qty_items_job
|
14
|
+
@qty_items_job ||= (data.count.to_f / qty_jobs).ceil
|
15
|
+
end
|
16
|
+
|
17
|
+
def finish!
|
18
|
+
update!(state: :finished, finished_at: Time.current)
|
19
|
+
end
|
20
|
+
|
21
|
+
def all_processed?
|
22
|
+
items.count == data.count
|
23
|
+
end
|
24
|
+
|
25
|
+
# ********* user methods
|
26
|
+
def start!
|
27
|
+
Processor.new(id).call
|
28
|
+
end
|
29
|
+
|
30
|
+
def cancel
|
31
|
+
update!(state: :canceled)
|
32
|
+
end
|
33
|
+
|
34
|
+
def status
|
35
|
+
Rails.logger.info "Process status: #{items.count}/#{data.count}" # processed items / total items; `task_model` only exists on Processor, so read this record's own associations
|
36
|
+
end
|
37
|
+
|
38
|
+
def export
|
39
|
+
path = Rails.root.join('tmp/batches_task_processor_result.csv')
|
40
|
+
data = items.pluck(:key, :result, :error_details)
|
41
|
+
data = [['Key', 'Result', 'Error details']] + data
|
42
|
+
File.write(path, data.map(&:to_csv).join)
|
43
|
+
Rails.logger.info "Exported to #{path}"
|
44
|
+
end
|
45
|
+
# ********* end user methods
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
def apply_data_uniqueness
|
50
|
+
self.data = data.uniq
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -3,142 +3,96 @@
|
|
3
3
|
require 'active_support/all'
|
4
4
|
module BatchesTaskProcessor
|
5
5
|
class Processor
|
6
|
-
|
6
|
+
attr_reader :task_id
|
7
7
|
|
8
|
-
def
|
9
|
-
|
10
|
-
init_jobs
|
11
|
-
end
|
12
|
-
|
13
|
-
def process_job(job_no)
|
14
|
-
run_job(job_no.to_i, calculate_items)
|
8
|
+
def initialize(task_id = nil)
|
9
|
+
@task_id = task_id || ENV['RUNNER_TASK_ID']
|
15
10
|
end
|
16
11
|
|
17
|
-
def
|
12
|
+
def call
|
18
13
|
init_jobs
|
19
14
|
end
|
20
15
|
|
21
|
-
def
|
22
|
-
|
23
|
-
res[:jobs] = res[:jobs].times.map { |i| job_registry(i)[:items].count }
|
24
|
-
puts "Process status: #{res.inspect}"
|
25
|
-
end
|
26
|
-
|
27
|
-
def cancel
|
28
|
-
data = Rails.cache.read(RUNNER_JOB_KEY)
|
29
|
-
data[:cancelled] = true
|
30
|
-
Rails.cache.write(RUNNER_JOB_KEY, data)
|
31
|
-
end
|
32
|
-
|
33
|
-
def clear
|
34
|
-
res = Rails.cache.read(RUNNER_JOB_KEY)
|
35
|
-
res[:jobs].times.each { |i| job_registry(i, :delete) }
|
36
|
-
Rails.cache.delete(RUNNER_JOB_KEY)
|
16
|
+
def process_job(job_no)
|
17
|
+
run_job(job_no.to_i)
|
37
18
|
end
|
38
19
|
|
39
20
|
private
|
40
21
|
|
41
|
-
# ****** customizations
|
42
|
-
# @example ['article_id1', 'article_id2', 'article_id3']
|
43
|
-
# @example Article.where(created_at: 1.month_ago..Time.current)
|
44
|
-
def calculate_items
|
45
|
-
instance_exec(&BatchesTaskProcessor::Config.calculate_items)
|
46
|
-
end
|
47
|
-
|
48
22
|
# @example item.perform_my_action
|
49
23
|
def process_item(item)
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
def per_page
|
54
|
-
BatchesTaskProcessor::Config.per_page
|
24
|
+
instance_eval(task_model.process_item)
|
55
25
|
end
|
56
26
|
|
57
27
|
# @example Article.where(no: items)
|
58
28
|
def preload_job_items(items)
|
59
|
-
|
60
|
-
end
|
61
|
-
# ****** end customizations
|
62
|
-
|
63
|
-
def init_cache
|
64
|
-
items = calculate_items
|
65
|
-
jobs = (items.count.to_f / per_page).ceil
|
66
|
-
data = { jobs: jobs, count: items.count, date: Time.current, finished_jobs: [], cancelled: false }
|
67
|
-
main_registry(data)
|
29
|
+
instance_eval(task_model.preload_job_items || 'items')
|
68
30
|
end
|
69
31
|
|
70
32
|
def init_jobs
|
71
|
-
jobs =
|
33
|
+
jobs = task_model.qty_jobs
|
72
34
|
log "Initializing #{jobs} jobs..."
|
73
35
|
jobs.times.each do |index|
|
74
|
-
|
75
|
-
|
76
|
-
|
36
|
+
if task_model.queue_name
|
37
|
+
log "Scheduling ##{index} job..."
|
38
|
+
BatchesTaskProcessor::ProcessorJob.set(queue: task_model.queue_name).perform_later(task_id, index)
|
39
|
+
else
|
40
|
+
start_inline_job(index)
|
41
|
+
end
|
77
42
|
end
|
78
43
|
end
|
79
44
|
|
80
|
-
def
|
45
|
+
def start_inline_job(job_no)
|
46
|
+
log "Starting ##{job_no} job..."
|
47
|
+
env_vars = "RUNNER_JOB_NO=#{job_no} RUNNER_TASK_ID=#{task_id}"
|
48
|
+
pid = Process.spawn("#{env_vars} rake batches_task_processor:process_job &")
|
49
|
+
Process.detach(pid)
|
50
|
+
end
|
51
|
+
|
52
|
+
def run_job(job)
|
81
53
|
log "Running ##{job} job..."
|
82
|
-
|
54
|
+
items = job_items(job)
|
55
|
+
(items.try(:find_each) || items.each).with_index do |item, index|
|
83
56
|
key = item.try(:id) || item
|
84
57
|
break log('Process cancelled') if process_cancelled?
|
85
|
-
next log("Skipping #{key}...") if already_processed?(
|
58
|
+
next log("Skipping #{key}...") if already_processed?(key)
|
86
59
|
|
87
60
|
start_process_item(item, job, key, index)
|
88
61
|
end
|
89
62
|
|
90
|
-
mark_finished_job(job)
|
91
63
|
log "Finished #{job} job..."
|
64
|
+
task_model.finish! if task_model.all_processed?
|
92
65
|
end
|
93
66
|
|
94
|
-
def job_items(
|
95
|
-
|
67
|
+
def job_items(job)
|
68
|
+
res = task_model.data.each_slice(task_model.qty_items_job).to_a[job] || [] # jobs beyond the last slice (fewer items than qty_jobs) get an empty list instead of nil
|
69
|
+
preload_job_items(res)
|
96
70
|
end
|
97
71
|
|
98
72
|
def start_process_item(item, job, key, index)
|
99
|
-
log "Processing #{job
|
100
|
-
process_item(item)
|
101
|
-
|
73
|
+
log "Processing key: #{key}, job: #{job}, counter: #{index}/#{task_model.qty_items_job}"
|
74
|
+
result = process_item(item)
|
75
|
+
task_model.items.create!(key: key, result: result.to_s[0..255])
|
102
76
|
rescue => e
|
103
|
-
|
77
|
+
task_model.items.create!(key: key, error_details: e.message)
|
104
78
|
log "Process failed #{job}/#{key}: #{e.message}"
|
105
79
|
end
|
106
80
|
|
107
|
-
def
|
108
|
-
|
109
|
-
new_data || Rails.cache.read(RUNNER_JOB_KEY)
|
110
|
-
end
|
111
|
-
|
112
|
-
def mark_finished_job(job)
|
113
|
-
main_registry(main_registry.merge(finished_jobs: main_registry[:finished_jobs] + [job]))
|
114
|
-
end
|
115
|
-
|
116
|
-
def job_registry(job, new_data = nil)
|
117
|
-
key = "#{RUNNER_JOB_KEY}/#{job}"
|
118
|
-
default_data = { items: [], errors: [] }
|
119
|
-
Rails.cache.write(key, default_data, expires_in: 1.week) unless Rails.cache.read(key)
|
120
|
-
Rails.cache.write(key, new_data, expires_in: 1.week) if new_data
|
121
|
-
Rails.cache.delete(key) if new_data == :delete
|
122
|
-
new_data || Rails.cache.read(key)
|
123
|
-
end
|
124
|
-
|
125
|
-
def update_job_cache(job, value, error = nil)
|
126
|
-
data = job_registry(job)
|
127
|
-
data[:items] << value
|
128
|
-
data[:errors] << [value, error] if error
|
129
|
-
job_registry(job, data)
|
130
|
-
end
|
131
|
-
|
132
|
-
def already_processed?(job, value)
|
133
|
-
job_registry(job)[:items].include?(value)
|
81
|
+
def already_processed?(key)
|
82
|
+
task_model.items.where(key: key).exists?
|
134
83
|
end
|
135
84
|
|
136
85
|
def process_cancelled?
|
137
|
-
|
86
|
+
task_model.state == 'canceled' # must match the state value written by Model#cancel (`:canceled`, single "l")
|
138
87
|
end
|
139
88
|
|
140
89
|
def log(msg)
|
141
90
|
puts "BatchesTaskProcessor => #{msg}"
|
142
91
|
end
|
92
|
+
|
93
|
+
def task_model
|
94
|
+
klass = BatchesTaskProcessor::Model.all
|
95
|
+
task_id ? klass.find(task_id) : klass.last
|
96
|
+
end
|
143
97
|
end
|
144
98
|
end
|
@@ -6,5 +6,9 @@ module BatchesTaskProcessor
|
|
6
6
|
rake_tasks do
|
7
7
|
load 'tasks/batches_task_processor_tasks.rake'
|
8
8
|
end
|
9
|
+
initializer :append_migrations do |app|
|
10
|
+
path = File.join(File.expand_path('../../', __FILE__), 'db/migrate')
|
11
|
+
app.config.paths["db/migrate"] << path
|
12
|
+
end
|
9
13
|
end
|
10
14
|
end
|
@@ -1,18 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "batches_task_processor/version"
|
2
4
|
require "batches_task_processor/railtie"
|
3
5
|
require "batches_task_processor/processor"
|
4
|
-
|
6
|
+
require "batches_task_processor/processor_job"
|
7
|
+
require "batches_task_processor/model"
|
8
|
+
require "batches_task_processor/model_item"
|
5
9
|
|
6
10
|
module BatchesTaskProcessor
|
7
|
-
class Config
|
8
|
-
cattr_accessor(:per_page) { 5000 }
|
9
|
-
cattr_accessor(:calculate_items) { -> { raise('Implement calculate_items method') } }
|
10
|
-
cattr_accessor(:process_item) { -> (_item) { raise('Implement calculate_items method') } }
|
11
|
-
cattr_accessor(:preload_job_items) { -> (items) { items } }
|
12
|
-
|
13
|
-
|
14
|
-
def self.configure
|
15
|
-
yield self
|
16
|
-
end
|
17
|
-
end
|
18
11
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class AddBatchesTaskProcessor < ActiveRecord::Migration[5.0]
|
4
|
+
def change
|
5
|
+
create_table :batches_task_processors do |t|
|
6
|
+
t.string :key
|
7
|
+
t.string :state, default: :pending
|
8
|
+
t.json :data, default: []
|
9
|
+
t.integer :qty_jobs, default: 10
|
10
|
+
t.datetime :finished_at
|
11
|
+
t.text :preload_job_items
|
12
|
+
t.text :process_item, null: false
|
13
|
+
t.string :queue_name, default: :default
|
14
|
+
t.timestamps
|
15
|
+
end
|
16
|
+
|
17
|
+
create_table :batches_task_processor_items do |t|
|
18
|
+
t.belongs_to :batches_task_processors, foreign_key: true, index: { name: 'index_batches_task_processors_parent_id' }
|
19
|
+
t.string :key
|
20
|
+
t.text :result
|
21
|
+
t.text :error_details
|
22
|
+
t.timestamps
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -3,31 +3,11 @@
|
|
3
3
|
namespace :batches_task_processor do
|
4
4
|
desc 'Starts the Batches Task Processor'
|
5
5
|
task call: :environment do
|
6
|
-
BatchesTaskProcessor::Processor.new.call
|
6
|
+
BatchesTaskProcessor::Processor.new(ENV['RUNNER_TASK_ID']).call
|
7
7
|
end
|
8
8
|
|
9
9
|
desc 'Starts the Batches Task Processor'
|
10
10
|
task process_job: :environment do
|
11
|
-
BatchesTaskProcessor::Processor.new.process_job(ENV['RUNNER_JOB_NO'])
|
12
|
-
end
|
13
|
-
|
14
|
-
desc 'Retries the Batches Task Processor'
|
15
|
-
task retry: :environment do
|
16
|
-
BatchesTaskProcessor::Processor.new.retry
|
17
|
-
end
|
18
|
-
|
19
|
-
desc 'Prints the status of the Task Processor'
|
20
|
-
task status: :environment do
|
21
|
-
BatchesTaskProcessor::Processor.new.status
|
22
|
-
end
|
23
|
-
|
24
|
-
desc 'Cancels the Batches Task Processor'
|
25
|
-
task cancel: :environment do
|
26
|
-
BatchesTaskProcessor::Processor.new.cancel
|
27
|
-
end
|
28
|
-
|
29
|
-
desc 'Clears the Batches Task Processor cache'
|
30
|
-
task clear: :environment do
|
31
|
-
BatchesTaskProcessor::Processor.new.clear
|
11
|
+
BatchesTaskProcessor::Processor.new(ENV['RUNNER_TASK_ID']).process_job(ENV['RUNNER_JOB_NO'])
|
32
12
|
end
|
33
13
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: batches_task_processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Owen Peredo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -34,9 +34,13 @@ files:
|
|
34
34
|
- README.md
|
35
35
|
- Rakefile
|
36
36
|
- lib/batches_task_processor.rb
|
37
|
+
- lib/batches_task_processor/model.rb
|
38
|
+
- lib/batches_task_processor/model_item.rb
|
37
39
|
- lib/batches_task_processor/processor.rb
|
40
|
+
- lib/batches_task_processor/processor_job.rb
|
38
41
|
- lib/batches_task_processor/railtie.rb
|
39
42
|
- lib/batches_task_processor/version.rb
|
43
|
+
- lib/db/migrate/20220727101904_add_batches_task_processor.rb
|
40
44
|
- lib/tasks/batches_task_processor_tasks.rake
|
41
45
|
homepage: https://github.com/owen2345/batches-task-processor
|
42
46
|
licenses: []
|