med_pipe 0.1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/MIT-LICENSE +20 -0
- data/README.md +55 -0
- data/Rakefile +10 -0
- data/app/models/med_pipe/application_record.rb +11 -0
- data/app/models/med_pipe/pipeline_group.rb +5 -0
- data/app/models/med_pipe/pipeline_plan.rb +26 -0
- data/config/routes.rb +5 -0
- data/db/migrate/20241118063336_create_med_pipe_pipeline_plans.rb +21 -0
- data/db/migrate/20241122022123_create_med_pipe_pipeline_groups.rb +12 -0
- data/lib/med_pipe/batch_id_fetcher.rb +49 -0
- data/lib/med_pipe/batch_reader.rb +50 -0
- data/lib/med_pipe/engine.rb +7 -0
- data/lib/med_pipe/pipeline.rb +35 -0
- data/lib/med_pipe/pipeline_plan_consumer.rb +43 -0
- data/lib/med_pipe/pipeline_plan_producer.rb +34 -0
- data/lib/med_pipe/pipeline_runner_base.rb +16 -0
- data/lib/med_pipe/pipeline_task/counter.rb +19 -0
- data/lib/med_pipe/pipeline_task/plan_updater.rb +29 -0
- data/lib/med_pipe/pipeline_task/tsv_generator.rb +23 -0
- data/lib/med_pipe/pipeline_task.rb +4 -0
- data/lib/med_pipe/version.rb +5 -0
- data/lib/med_pipe.rb +20 -0
- data/lib/tasks/med_pipe_tasks.rake +6 -0
- data/spec/factories/med_pipe_pipeline_groups.rb +7 -0
- data/spec/factories/med_pipe_pipeline_plans.rb +10 -0
- metadata +107 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2d62e07ad3b1678e874749b5e34e930992476342d40ddad20be2dfd5b1da6593
|
4
|
+
data.tar.gz: 6d9c9dbffe01a6e7d4dd47c59a61f8233139d140600dc30d361cbe8d33c965ae
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 198784f564acaa4e36cbfdfb8e8ded498645124ca6d6a74f07b32a6e0515f2fbb0ddb3a9df9c2c79e983f6cc3f3811c4aac5d03518237fdb4e1eaf9de6f36731
|
7
|
+
data.tar.gz: 51f1a3a9e7eaca4c62e304874022da628e4bf9c85de965c24328d12b5ea5c87abe0755f3b05e8f59a857d4d5615c869f058af9780e50f75af963118562ac67c5
|
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright mpg-taichi-sato
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# MedPipe <sup>BETA</sup>
|
2
|
+
100万 ~ 数10億程度のデータを処理するための仕組みを提供する Rails エンジンです。
|
3
|
+
|
4
|
+
## Concept
|
5
|
+
### MedPipe::Pipeline
|
6
|
+
apply で後述する PipelineTask を登録し、run で順番に実行します。
|
7
|
+
|
8
|
+
### MedPipe::PipelineTask
|
9
|
+
Pipeline に登録する処理の単位です。
|
10
|
+
DB からの読み込みや、S3 へのアップロード等やることを分割してタスク化します。
|
11
|
+
大量データを扱う際には Enumerable::Lazy を使うことで分割して処理をすることができます。
|
12
|
+
call を実装する必要があります。
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
# @param context [Hash] Stores data during pipeline execution
|
16
|
+
# @param prev_result [Object] The result of the previous task
|
17
|
+
def call(context, prev_result)
|
18
|
+
yield 次のTaskに渡すデータ
|
19
|
+
end
|
20
|
+
```
|
21
|
+
|
22
|
+
### MedPipe::PipelinePlan
|
23
|
+
Pipeline の状態、オプション、結果を保存するためのモデルです。
|
24
|
+
Task で使うためのオプションを渡す方法は PipelinePlan から取得するか、contextで伝搬するかの二択です。
|
25
|
+
|
26
|
+
### MedPipe::PipelineGroup
|
27
|
+
一つのジョブで実行する Plan をまとめるためのモデルです。
|
28
|
+
実行中に parallel_limit を 0 にすることで中断することができます。
|
29
|
+
|
30
|
+
## Usage
|
31
|
+
|
32
|
+
1. Reader, Uploader 等の PipelineTask を作成 [Samples](https://github.com/medpeer-dev/med_pipe/tree/main/spec/dummy/app/models/pipeline_task)
|
33
|
+
2. PipelineRunner を作成 [Sample](https://github.com/medpeer-dev/med_pipe/blob/main/spec/dummy/app/models/sample_pipeline_runner.rb)
|
34
|
+
3. Pipeline を並列実行するためのジョブを作成 [Sample](https://github.com/medpeer-dev/med_pipe/blob/main/spec/dummy/app/jobs/sample_execute_pipeline_job.rb)
|
35
|
+
4. PipelinePlan を登録するコードを記述
|
36
|
+
5. 実行
|
37
|
+
|
38
|
+
## Installation
|
39
|
+
Add this line to your application's Gemfile:
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
gem "med_pipe"
|
43
|
+
```
|
44
|
+
|
45
|
+
### migrationファイルの追加
|
46
|
+
|
47
|
+
```shell
|
48
|
+
$ rails med_pipe:install:migrations
|
49
|
+
```
|
50
|
+
|
51
|
+
## Contributing
|
52
|
+
Bug reports and pull requests are welcome.
|
53
|
+
|
54
|
+
## License
|
55
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Stores the state, options and results of one pipeline run.
class MedPipe::PipelinePlan < MedPipe::ApplicationRecord
  # NOTE(review): the association is optional here, while the migration adds
  # the pipeline_group reference with `null: false` — confirm which is intended.
  belongs_to :pipeline_group, class_name: "MedPipe::PipelineGroup", optional: true

  # TODO: This is the Rails 6 enum syntax; switch to the new definition style when moving to Rails 8.
  # https://zenn.dev/kanazawa/articles/8bc1fcbba3ef1d#enum%E3%81%AE%E5%AE%9A%E7%BE%A9%E6%96%B9%E6%B3%95%E3%81%8C%E5%A4%89%E3%82%8F%E3%82%8B
  # Both enums are string-backed: each label is stored as its own name.
  enum status: %w[waiting enqueued running finished failed].index_with(&:itself), _prefix: true
  enum output_unit: %w[daily all].index_with(&:itself), _prefix: true

  validates :name, :output_unit, :status, presence: true

  # Plans that currently occupy an execution slot (enqueued or running).
  scope :active, -> { where(status: %i[enqueued running]) }
end
|
data/config/routes.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Creates the table that stores pipeline plans.
#
# The migration is pinned to the 6.1 API because the gem declares support for
# Rails >= 6.1.7; `ActiveRecord::Migration[7.2]` raises "Unknown migration
# version" on every Rails release older than 7.2.
class CreateMedPipePipelinePlans < ActiveRecord::Migration[6.1]
  def change
    create_table :med_pipe_pipeline_plans do |t|
      t.string :name, null: false, comment: "パイプライン名"
      t.integer :priority, null: false, default: 0, comment: "実行優先度"
      t.string :status, null: false
      t.string :output_unit, null: false, comment: "実行単位. 日ごと、全て等"
      t.date :target_date, comment: "実行対象日. output_unit が daily の場合に指定"
      t.bigint :data_count
      t.string :file_name
      t.bigint :file_size
      t.string :upload_to
      # precision is explicit because the 6.1 migration API does not default
      # datetime columns to 6 the way 7.0+ does; this keeps the schema identical.
      t.datetime :started_at, precision: 6
      t.datetime :finished_at, precision: 6

      t.timestamps
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Creates the pipeline group table and links pipeline plans to it.
#
# The migration is pinned to the 6.1 API because the gem declares support for
# Rails >= 6.1.7; `ActiveRecord::Migration[7.2]` raises "Unknown migration
# version" on every Rails release older than 7.2.
class CreateMedPipePipelineGroups < ActiveRecord::Migration[6.1]
  def change
    create_table :med_pipe_pipeline_groups do |t|
      t.integer :parallel_limit, null: false, default: 1, comment: "並列実行数"
      t.timestamps
    end

    # NOTE(review): a NOT NULL reference without a default can only be added
    # while med_pipe_pipeline_plans is empty — confirm that is acceptable.
    add_reference :med_pipe_pipeline_plans, :pipeline_group, null: false
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Fetches ids from a relation in ascending id order, loading at most
# max_load_size ids per query and yielding them in slices of batch_size.
#
# When to use:
# - you need to fetch 100,000 ids or more
# - you want to avoid in_batches for speed
class MedPipe::BatchIdFetcher
  # @param relation [#where, #order, #limit, #arel_table] relation (or model class) the ids are read from
  # @param batch_size [Integer] number of ids per yielded slice
  # @param max_load_size [Integer] maximum number of ids loaded by a single query
  # @raise [ArgumentError] if batch_size is not greater than 0
  def initialize(relation, batch_size: 1_000, max_load_size: 100_000)
    @relation = relation
    @batch_size = batch_size
    @max_load_size = max_load_size
    validate_parameters
  end

  # @yieldparam ids [Array] up to batch_size ids; only the final slice may be shorter
  # @return [Enumerator, nil] an Enumerator when no block is given, otherwise nil
  def each
    return enum_for(:each) unless block_given?

    last_id = 0
    cached_ids = []

    loop do
      loaded_ids = load_ids(last_id)
      cached_ids.concat(loaded_ids)
      yield(cached_ids.shift(@batch_size)) while cached_ids.size >= @batch_size

      # An empty or short page means there is nothing left to load.
      break if loaded_ids.empty? || loaded_ids.size < @max_load_size

      last_id = loaded_ids.last
    end

    # Flush the remainder after the loop so it is emitted on every exit path,
    # including the one where the final query comes back empty.
    yield(cached_ids) unless cached_ids.empty?
    nil
  end

  private

  def validate_parameters
    raise ArgumentError, "batch_size must be greater than 0" if @batch_size <= 0
  end

  # Loads the next page of ids that are greater than last_id (0 means "from the start").
  def load_ids(last_id)
    scope = @relation
    # Use a table-qualified predicate: a raw "id > ?" fragment is ambiguous
    # when the relation joins another table that also has an id column.
    scope = scope.where(@relation.arel_table[:id].gt(last_id)) unless last_id.zero?
    scope.order(:id).limit(@max_load_size).ids
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Reads a large data set in batches.
# in_batches includes the scope in every query, whereas this class applies the
# scope only when fetching ids and then loads the rows by id.
class MedPipe::BatchReader
  # @param model_class [Class] model to read from; must be a subclass of ApplicationRecord
  # @param scope [Object, nil] relation used only to select the ids (defaults to model_class.all)
  # @param pluck_columns [Array<Symbol>] columns passed to pluck
  # @param batch_size [Integer] number of rows loaded per query
  # @param max_id_load_size [Integer] maximum number of ids fetched per id query
  # @raise [ArgumentError] if model_class is not a subclass of ApplicationRecord
  def initialize(model_class, scope: nil, pluck_columns: [:id], batch_size: 1_000,
                 max_id_load_size: 100_000)
    @model_class = model_class
    @scope = scope || model_class.all
    @pluck_columns = pluck_columns
    @batch_size = batch_size
    @max_id_load_size = max_id_load_size
    @around_load_callback = nil
    validate_parameters
  end

  # Registers a callback that wraps every batch load.
  #
  # EXAMPLE:
  #   MedPipe::BatchReader.new(User)
  #     .around_load { |&block| ApplicationRecord.connected_to(role: :reading, &block) }
  #
  # @return [MedPipe::BatchReader] self, so the call can be chained
  def around_load(&block)
    @around_load_callback = block
    self
  end

  # @yieldparam row [Object] one element of the pluck result at a time
  # @return [Enumerator] when no block is given
  def each(&block)
    return enum_for(:each) unless block

    each_ids = MedPipe::BatchIdFetcher.new(@scope, batch_size: @batch_size, max_load_size: @max_id_load_size).each
    loop do
      load_records(each_ids).each(&block)
    rescue StopIteration
      # Raised by each_ids.next once every id batch has been consumed.
      break
    end
  end

  private

  def validate_parameters
    raise ArgumentError, "model_class must be a subclass of ApplicationRecord" unless @model_class < ApplicationRecord
  end

  # Loads the next batch, going through the around_load callback when one is set.
  def load_records(each_ids)
    return batch_load(each_ids) unless @around_load_callback

    # Capture the rows inside the block: if the callback's own return value is
    # nil/false, loading again would consume, and silently skip, another id batch.
    loaded = nil
    result = @around_load_callback.call { loaded = batch_load(each_ids) }
    # Fall back to a direct load only when the callback never ran the block.
    result || loaded || batch_load(each_ids)
  end

  def batch_load(each_ids)
    # in_batches disables the query cache, so do the same here.
    @model_class.uncached do
      @model_class.where(id: each_ids.next).pluck(*@pluck_columns)
    end
  end
end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Rails engine definition for MedPipe.
class MedPipe::Engine < Rails::Engine
  # Change the migration install command from med_pipe_engine:install:migrations
  # to med_pipe:install:migrations.
  # https://edgeapi.rubyonrails.org/classes/Rails/Engine.html#class-Rails::Engine-label-Engine+name
  engine_name "med_pipe"
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Runs tasks that have been chained in series, one after another.
class MedPipe::Pipeline
  def initialize
    @tasks = []
  end

  # Appends a task to the end of the chain.
  #
  # @param task [Object] object implementing call(context, prev_result, &block)
  # @return [MedPipe::Pipeline] self, so calls can be chained
  def apply(task)
    @tasks.push(task)
    self
  end

  # Runs the registered tasks in order. Conceptually this expands to:
  #
  #   @tasks[0].call(context, nil) do |prev_result|
  #     @tasks[1].call(context, prev_result) do |prev_result|
  #       @tasks[2].call(context, prev_result) do |prev_result|
  #         nil
  #       end
  #     end
  #   end
  #
  # @param context [Hash] Stores data during pipeline execution
  # @return [Object] whatever the first task's call returns (nil when no task is registered)
  def run(context = {})
    run_task_recursive(context)
  end

  private

  # Calls the task at task_index; the block handed to it continues the chain
  # with the next task, so a task that never yields ends the pipeline there.
  def run_task_recursive(context, prev_result = nil, task_index = 0)
    return prev_result unless task_index < @tasks.size

    current_task = @tasks[task_index]
    return nil if current_task.nil?

    current_task.call(context, prev_result) do |result|
      run_task_recursive(context, result, task_index.succ)
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Fetches one enqueued pipeline plan and runs it.
class MedPipe::PipelinePlanConsumer
  # @param pipeline_group [Object] group whose pipeline_plans are consumed
  # @param pipeline_runner [#call] builds a pipeline from a pipeline plan and runs it
  def initialize(pipeline_group:, pipeline_runner:)
    @pipeline_group = pipeline_group
    @pipeline_runner = pipeline_runner
  end

  # Runs the highest-priority enqueued plan. On failure the plan is marked as
  # failed and the original error is re-raised.
  #
  # @return [PipelinePlan, nil] the plan that was run, or nil when none was enqueued
  def run
    pipeline_plan = fetch_and_run_pipeline_plan
    return nil if pipeline_plan.nil?

    @pipeline_runner.call(pipeline_plan)
    complete_pipeline_plan(pipeline_plan)
    pipeline_plan
  rescue StandardError
    # pipeline_plan is still nil when the fetch itself failed; calling update!
    # on nil here would replace the real error with a NoMethodError.
    error_pipeline_plan(pipeline_plan) if pipeline_plan
    raise
  end

  private

  # Locks the enqueued plans, picks the one with the highest priority and
  # switches it to running inside a single transaction.
  # NOTE(review): started_at is never written here although finished_at is set
  # on completion — confirm whether that is intentional.
  def fetch_and_run_pipeline_plan
    ApplicationRecord.transaction do
      target_pipeline_plan = @pipeline_group.pipeline_plans.lock.status_enqueued.order(priority: :desc).first
      # `next` rather than `return`: leaving a transaction block via `return`
      # is deprecated in Rails 6.1 and rolls the transaction back in Rails 7.0.
      next if target_pipeline_plan.nil?

      target_pipeline_plan.update!(status: :running)
      target_pipeline_plan
    end
  end

  def complete_pipeline_plan(pipeline_plan)
    pipeline_plan.update!(status: :finished, finished_at: Time.current)
  end

  def error_pipeline_plan(pipeline_plan)
    pipeline_plan.update!(status: :failed)
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Moves waiting PipelinePlans to the enqueued state, highest priority first,
# without exceeding the group's parallel limit.
class MedPipe::PipelinePlanProducer
  # @param pipeline_group [MedPipe::PipelineGroup]
  def initialize(pipeline_group)
    @pipeline_group = pipeline_group
  end

  # @return [Array<MedPipe::PipelinePlan>] Enqueued pipeline plans. Returns nil when nothing was enqueued
  def run
    return unless @pipeline_group.parallel_limit.positive?

    @pipeline_group.with_lock do
      running_or_queued = @pipeline_group.pipeline_plans.active.count
      free_slots = @pipeline_group.parallel_limit - running_or_queued
      free_slots.positive? ? enqueue(free_slots) : nil
    end
  end

  private

  # Marks up to `size` waiting plans as enqueued.
  def enqueue(size)
    waiting_plans = fetch_target_pipeline_plans(size: size)
    return if waiting_plans.empty?

    waiting_plans.each { |pipeline_plan| pipeline_plan.update!(status: :enqueued) }
  end

  # Waiting plans of the group, highest priority first, limited to `size`.
  def fetch_target_pipeline_plans(size:)
    by_priority = @pipeline_group.pipeline_plans.status_waiting.order(priority: :desc)
    by_priority.limit(size)
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Helper base class for building the PipelineRunner that is handed to PipelinePlanConsumer.
# A runner only needs to implement call(pipeline_plan), so using this class is optional.
class MedPipe::PipelineRunnerBase
  # Called by PipelinePlanConsumer. Builds the pipeline for the plan and runs
  # it with a context that exposes the plan under :plan.
  #
  # @param pipeline_plan [Object] plan to run
  # @return [Object] the result of the pipeline's run
  def call(pipeline_plan)
    build_pipeline(pipeline_plan).run({ plan: pipeline_plan })
  end

  # Builds the pipeline for the given plan. Subclasses must override this.
  #
  # @param pipeline_plan [#name]
  # @raise [NotImplementedError] always, unless overridden
  def build_pipeline(pipeline_plan)
    # `raise Klass, message` is required here: `NotImplementedError(...)` is a
    # method call to an undefined method and would raise NoMethodError instead.
    raise NotImplementedError, "#{pipeline_plan.name}に対応するPipelineを作成する処理をサブクラスで実装してください"
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Pipeline task that counts the elements flowing through it and publishes the
# running total as context[:data_count].
class MedPipe::PipelineTask::Counter
  def initialize
    @count = 0
  end

  # @param context [Hash]
  # @param input [Enumerable<Array<Object>>]
  # @yieldparam [Enumerable<Array<Object>>] the input, passed through unchanged
  def call(context, input)
    counted = input.map do |element|
      increment(context)
      element
    end
    yield counted
  end

  # Adds one to the counter and stores the new total in the context.
  def increment(context)
    @count += 1
    context[:data_count] = @count
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Pipeline task that copies result values from the context onto the plan
# stored under context[:plan].
class MedPipe::PipelineTask::PlanUpdater
  # @param save [Boolean] when true, the plan is saved. The default is not to
  #   save here, because an update runs anyway when the plan is marked finished.
  def initialize(save: false)
    @save = save
  end

  # @param context [Hash]
  # @param input [Enumerable<Array<Object>>]
  # @yieldparam [Enumerable<Array<Object>>] the input, passed through unchanged
  # @return [Object] the block's result, or the input itself when no block is given
  def call(context, input)
    update_plan(context)
    return input unless block_given?

    yield(input)
  end

  private

  # Assigns every context value that is present to the plan attribute of the same name.
  def update_plan(context)
    plan = context[:plan]
    return unless plan

    %i[data_count file_name file_size upload_to].each do |attribute|
      value = context[attribute]
      plan.public_send(:"#{attribute}=", value) if value
    end
    plan.save if @save
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "csv"
|
4
|
+
|
5
|
+
# Tempfile is used below; require it explicitly instead of relying on the host
# application having loaded it already.
require "tempfile"

# Pipeline task that writes the incoming rows to a temporary TSV file and
# yields that file to the next task.
class MedPipe::PipelineTask::TsvGenerater
  TSV_OPTION = { col_sep: "\t" }.freeze

  # @param _context [Hash] unused
  # @param lines [Enumerable<Array<Object>>] Enumerable of arrays whose elements respond to to_s
  # @yieldparam file [File] the generated TSV file, rewound to its beginning.
  #   Tempfile.create removes the file once the block returns.
  def call(_context, lines)
    Tempfile.create do |file|
      lines.each do |line|
        # Replace empty strings with nil so the field is left blank instead of
        # being written out as the literal "".
        normalized_line = line.map { |v| v == "" ? nil : v }
        tsv_line = CSV.generate_line(normalized_line, **TSV_OPTION)
        file.puts(tsv_line)
      end
      file.rewind

      yield(file)
    end
  end
end

# Correctly spelled alias matching the file name (tsv_generator.rb); the
# original constant is kept so existing callers continue to work.
MedPipe::PipelineTask::TsvGenerator = MedPipe::PipelineTask::TsvGenerater
|
data/lib/med_pipe.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "med_pipe/version"
|
4
|
+
require "med_pipe/engine"
|
5
|
+
|
6
|
+
require "med_pipe/batch_id_fetcher"
|
7
|
+
require "med_pipe/batch_reader"
|
8
|
+
require "med_pipe/pipeline_plan_consumer"
|
9
|
+
require "med_pipe/pipeline_plan_producer"
|
10
|
+
require "med_pipe/pipeline_runner_base"
|
11
|
+
require "med_pipe/pipeline"
|
12
|
+
|
13
|
+
require "med_pipe/pipeline_task"
|
14
|
+
require "med_pipe/pipeline_task/counter"
|
15
|
+
require "med_pipe/pipeline_task/tsv_generator"
|
16
|
+
require "med_pipe/pipeline_task/plan_updater"
|
17
|
+
|
18
|
+
# Top-level namespace of the gem. Nothing is defined here directly; the
# classes are loaded by the require statements earlier in this file.
module MedPipe
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# FactoryBot factory for MedPipe::PipelinePlan.
FactoryBot.define do
  factory :med_pipe_pipeline_plan, class: "MedPipe::PipelinePlan" do
    name { "dummy" }
    output_unit { :all }
    # Plans start in the waiting state.
    status { :waiting }
    # Built through the :med_pipe_pipeline_group factory.
    association :pipeline_group, factory: :med_pipe_pipeline_group
  end
end
|
metadata
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: med_pipe
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- mpg-taichi-sato
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-11-26 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rails
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 6.1.7
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '8.0'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 6.1.7
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '8.0'
|
33
|
+
description: "# MedPipe <sup>BETA</sup>\n100万 ~ 数10億程度のデータを処理するための仕組みを提供する Rails エンジンです。\n\n##
|
34
|
+
Concept\n### MedPipe::Pipeline\napply で後述する PipelineTask を登録し、run で順番に実行します。\n\n###
|
35
|
+
MedPipe::PipelineTask\nPipeline に登録する処理の単位です。 \nDB からの読み込みや、S3 へのアップロード等やることを分割してタスク化します。
|
36
|
+
\ \n大量データを扱う際には Enumerable::Lazy を使うことで分割して処理をすることができます。 \ncall を実装する必要があります\n\n```.rb\n@param
|
37
|
+
context [Hash] Stores data during pipeline execution\n@param prev_result [Object]
|
38
|
+
The result of the previous task\ndef call(context, prev_result)\n yield 次のTaskに渡すデータ\nend\n```\n\n###
|
39
|
+
MedPipe::PipelinePlan\nPipeline の状態、オプション、結果を保存するためのモデルです。 \nTask で使うためのオプションを渡す方法は
|
40
|
+
PipelinePlan から取得するか、contextで伝搬するかの二択です。\n\n### MedPipe::PipelineGroup\n一つのジョブで実行する
|
41
|
+
Plan をまとめるためのモデルです。 \n実行中に parallel_limit を 0 にすることで中断することができます。\n\n## Usage\n\n1.
|
42
|
+
Reader, Uploader 等の PipelineTask を作成 [Samples](https://github.com/medpeer-dev/med_pipe/tree/main/spec/dummy/app/models/pipeline_task)\n2.
|
43
|
+
PipelineRunner を作成 [Sample](https://github.com/medpeer-dev/med_pipe/blob/main/spec/dummy/app/models/sample_pipeline_runner.rb)\n3.
|
44
|
+
Pipeline を並列実行するためのジョブを作成 [Sample](https://github.com/medpeer-dev/med_pipe/blob/main/spec/dummy/app/jobs/sample_execute_pipeline_job.rb)\n4.
|
45
|
+
PipelinePlan を登録するコードを記述\n5. 実行\n\n## Installation\nAdd this line to your application's
|
46
|
+
Gemfile:\n\n```ruby\ngem \"med_pipe\"\n```\n\n### migrationファイルの追加\n\n```shell\n$
|
47
|
+
rails med_pipe:install:migrations\n```\n\n## Contributing\nBug reports and pull
|
48
|
+
requests are welcome.\n\n## License\nThe gem is available as open source under the
|
49
|
+
terms of the [MIT License](https://opensource.org/licenses/MIT).\n"
|
50
|
+
email:
|
51
|
+
executables: []
|
52
|
+
extensions: []
|
53
|
+
extra_rdoc_files: []
|
54
|
+
files:
|
55
|
+
- MIT-LICENSE
|
56
|
+
- README.md
|
57
|
+
- Rakefile
|
58
|
+
- app/models/med_pipe/application_record.rb
|
59
|
+
- app/models/med_pipe/pipeline_group.rb
|
60
|
+
- app/models/med_pipe/pipeline_plan.rb
|
61
|
+
- config/routes.rb
|
62
|
+
- db/migrate/20241118063336_create_med_pipe_pipeline_plans.rb
|
63
|
+
- db/migrate/20241122022123_create_med_pipe_pipeline_groups.rb
|
64
|
+
- lib/med_pipe.rb
|
65
|
+
- lib/med_pipe/batch_id_fetcher.rb
|
66
|
+
- lib/med_pipe/batch_reader.rb
|
67
|
+
- lib/med_pipe/engine.rb
|
68
|
+
- lib/med_pipe/pipeline.rb
|
69
|
+
- lib/med_pipe/pipeline_plan_consumer.rb
|
70
|
+
- lib/med_pipe/pipeline_plan_producer.rb
|
71
|
+
- lib/med_pipe/pipeline_runner_base.rb
|
72
|
+
- lib/med_pipe/pipeline_task.rb
|
73
|
+
- lib/med_pipe/pipeline_task/counter.rb
|
74
|
+
- lib/med_pipe/pipeline_task/plan_updater.rb
|
75
|
+
- lib/med_pipe/pipeline_task/tsv_generator.rb
|
76
|
+
- lib/med_pipe/version.rb
|
77
|
+
- lib/tasks/med_pipe_tasks.rake
|
78
|
+
- spec/factories/med_pipe_pipeline_groups.rb
|
79
|
+
- spec/factories/med_pipe_pipeline_plans.rb
|
80
|
+
homepage: https://github.com/medpeer-dev/med_pipe
|
81
|
+
licenses:
|
82
|
+
- MIT
|
83
|
+
metadata:
|
84
|
+
homepage_uri: https://github.com/medpeer-dev/med_pipe
|
85
|
+
source_code_uri: https://github.com/medpeer-dev/med_pipe
|
86
|
+
rubygems_mfa_required: 'true'
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: 3.0.0
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
requirements: []
|
102
|
+
rubygems_version: 3.5.22
|
103
|
+
signing_key:
|
104
|
+
specification_version: 4
|
105
|
+
summary: Provides a system for processing data ranging from 1 million to several billion
|
106
|
+
records
|
107
|
+
test_files: []
|