kraps 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,37 @@
1
+ require "attachie"
2
+ require "attachie/s3_driver"
3
+ require "attachie/fake_driver"
4
+
5
module Kraps
  # Thin wrappers that bundle an Attachie driver instance with the bucket and
  # optional key prefix it should be used with.
  module Drivers
    # Behaviour shared by all driver wrappers. Including classes are expected
    # to set @prefix (it may be nil).
    module Driver
      # Builds a storage key by prepending the configured prefix, if any.
      #
      # @param path [String] key relative to the prefix
      # @return [String] "<prefix>/<path>" when @prefix is set, otherwise +path+
      def with_prefix(path)
        File.join(*[@prefix, path].compact)
      end
    end

    # Wrapper around Attachie::S3Driver.
    class S3Driver
      include Driver

      attr_reader :driver, :bucket, :prefix

      # @param s3_client [Object] client handed straight to Attachie::S3Driver.new
      # @param bucket [String] bucket name exposed via #bucket
      # @param prefix [String, nil] optional prefix applied by #with_prefix
      def initialize(s3_client:, bucket:, prefix: nil)
        @driver = Attachie::S3Driver.new(s3_client)
        @bucket = bucket
        @prefix = prefix
      end
    end

    # Wrapper around Attachie::FakeDriver; takes no client.
    class FakeDriver
      include Driver

      attr_reader :driver, :bucket, :prefix

      # @param bucket [String] bucket name exposed via #bucket
      # @param prefix [String, nil] optional prefix applied by #with_prefix
      def initialize(bucket:, prefix: nil)
        @driver = Attachie::FakeDriver.new
        @bucket = bucket
        @prefix = prefix
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Kraps
  # Value object describing the output of a step: the token under which the
  # step's chunks were stored and the number of partitions they are split into.
  Frame = Struct.new(:token, :partitions, keyword_init: true)
end
@@ -0,0 +1,34 @@
1
require "monitor"

module Kraps
  # Runs a block on a background thread, either when #fire is called or when
  # +timeout+ seconds pass without a #fire, until #stop is called.
  class Interval
    include MonitorMixin

    # @param timeout [Numeric] maximum seconds the background thread waits for
    #   a #fire before invoking the block anyway
    # @yield invoked on the background thread on every wake-up while running
    def initialize(timeout, &block)
      super()

      @thread_queue = TimeoutQueue.new
      @main_queue = TimeoutQueue.new
      @stopped = false

      @thread = Thread.new do
        until @stopped
          # nil means either the wait timed out or #stop pushed its wake-up marker
          item = @thread_queue.deq(timeout: timeout)

          block.call unless @stopped

          # Acknowledge only real #fire requests so the caller can continue
          @main_queue.enq(1) if item
        end
      end
    end

    # Asks the background thread to run the block now and waits for its
    # acknowledgement.
    #
    # @param timeout [Numeric] maximum seconds to wait for the acknowledgement
    # @return [Object, nil] the acknowledgement, or nil when none arrived in time
    def fire(timeout:)
      @thread_queue.enq(1)
      @main_queue.deq(timeout: timeout)
    end

    # Stops the background thread and waits for it to terminate.
    def stop
      @stopped = true
      @thread_queue.enq(nil) # wake the thread so it notices @stopped
      @thread.join
    end
  end
end
data/lib/kraps/job.rb ADDED
@@ -0,0 +1,62 @@
1
module Kraps
  # Immutable-style builder for a sequence of steps. Every builder method
  # returns a copy of the job with one more step appended; the receiver is
  # left untouched.
  class Job
    attr_reader :steps

    # @param worker [Object] default worker recorded in each step's args
    def initialize(worker:)
      @worker = worker
      @steps = []
      @partitions = 0
      @partitioner = MapReduce::HashPartitioner.new(@partitions)
    end

    # Appends a PARALLELIZE step and sets the partitioning for following steps.
    #
    # @param partitions [Integer] number of partitions
    # @param partitioner [Object] defaults to a HashPartitioner for +partitions+
    # @param worker [Object] worker recorded for this step
    # @return [Job] a new job including the step
    def parallelize(partitions:, partitioner: MapReduce::HashPartitioner.new(partitions), worker: @worker, &block)
      fresh.tap do |job|
        job.instance_eval do
          @partitions = partitions
          @partitioner = partitioner

          push_step(Actions::PARALLELIZE, worker, block)
        end
      end
    end

    # Appends a MAP step. The current partitioning is kept unless +partitions+
    # and/or +partitioner+ are given.
    #
    # @param partitions [Integer, nil] new number of partitions
    # @param partitioner [Object, nil] new partitioner; when only +partitions+
    #   is given a HashPartitioner for it is created
    # @param worker [Object] worker recorded for this step
    # @return [Job] a new job including the step
    def map(partitions: nil, partitioner: nil, worker: @worker, &block)
      fresh.tap do |job|
        job.instance_eval do
          @partitions = partitions if partitions

          if partitioner
            @partitioner = partitioner
          elsif partitions
            @partitioner = MapReduce::HashPartitioner.new(partitions)
          end

          push_step(Actions::MAP, worker, block)
        end
      end
    end

    # Appends a REDUCE step using the current partitioning.
    #
    # @return [Job] a new job including the step
    def reduce(worker: @worker, &block)
      fresh.tap do |job|
        job.instance_eval { push_step(Actions::REDUCE, worker, block) }
      end
    end

    # Appends an EACH_PARTITION step using the current partitioning.
    #
    # @return [Job] a new job including the step
    def each_partition(worker: @worker, &block)
      fresh.tap do |job|
        job.instance_eval { push_step(Actions::EACH_PARTITION, worker, block) }
      end
    end

    # Changes the partitioning by appending a MAP step that re-emits every
    # key/value pair unchanged.
    #
    # @return [Job] a new job including the step
    def repartition(partitions:, partitioner: nil, worker: @worker)
      map(partitions: partitions, partitioner: partitioner, worker: worker) do |key, value, collector|
        collector.call(key, value)
      end
    end

    # @return [Job] a copy of this job with its own steps array, so appending
    #   to the copy does not affect the receiver
    def fresh
      dup.tap do |job|
        job.instance_variable_set(:@steps, @steps.dup)
      end
    end

    private

    # Appends a step for +action+ carrying the current partitioning.
    def push_step(action, worker, block)
      @steps << Step.new(
        action: action,
        args: { partitions: @partitions, partitioner: @partitioner, worker: worker },
        block: block
      )
    end
  end
end
@@ -0,0 +1,29 @@
1
module Kraps
  # Processes the elements of an enumerable on a fixed number of threads.
  class Parallelizer
    # Yields every element of +enum+ to the block from one of +num_threads+
    # threads. When the block raises a StandardError, the remaining threads
    # stop picking up new elements and the error is re-raised in the caller
    # after all threads have finished.
    #
    # @param enum [Enumerable] elements to process
    # @param num_threads [Integer] number of worker threads to start
    # @yieldparam element [Object] one element of +enum+
    # @return [Enumerable] +enum+ itself
    # @raise [StandardError] the first captured error, in thread order
    def self.each(enum, num_threads)
      queue = Queue.new

      enum.each { |element| queue.push(element) }

      stopped = false

      threads = Array.new(num_threads) do
        Thread.new do
          until stopped
            begin
              element = queue.pop(true)
            rescue ThreadError
              # Non-blocking pop on an empty queue: nothing left to do. Only
              # the pop is guarded, so a ThreadError raised by the block
              # itself is reported like any other error.
              break
            end

            yield element
          end

          nil
        rescue StandardError => e
          stopped = true

          e # becomes the thread's value and is re-raised below
        end
      end

      threads.each(&:join).each do |thread|
        result = thread.value

        raise result if result.is_a?(Exception)
      end

      enum
    end
  end
end
@@ -0,0 +1,160 @@
1
require "json"
require "securerandom"

module Kraps
  # Executes the jobs returned by a job class step by step: for every step it
  # enqueues one worker payload per part and blocks until the step's
  # distributed job has finished or was stopped.
  class Runner
    # @param klass [Class] class whose instances respond to #call and return
    #   one job or an array of jobs
    def initialize(klass)
      @klass = klass
    end

    # Builds the jobs via +klass.new.call+ and runs all steps of all jobs in
    # order, handing each step the frame produced by the previous one.
    #
    # @return [Array] the jobs that were run
    def call(*args, **kwargs)
      Array(@klass.new.call(*args, **kwargs)).tap do |jobs|
        jobs.each_with_index do |job, job_index|
          job.steps.each_index.inject(nil) do |frame, step_index|
            StepRunner.new(
              klass: @klass,
              args: args,
              kwargs: kwargs,
              jobs: jobs,
              job_index: job_index,
              step_index: step_index,
              frame: frame
            ).call
          end
        end
      end
    end

    # Runs a single step of a single job.
    class StepRunner
      # @param klass [Class] the job class, serialized into worker payloads
      # @param args [Array] positional arguments the job class was called with
      # @param kwargs [Hash] keyword arguments the job class was called with
      # @param jobs [Array] all jobs of the current run
      # @param job_index [Integer] index of the job within +jobs+
      # @param step_index [Integer] index of the step within the job
      # @param frame [Frame, nil] output of the previous step, nil for the first
      def initialize(klass:, args:, kwargs:, jobs:, job_index:, step_index:, frame:)
        @klass = klass
        @args = args
        @kwargs = kwargs
        @jobs = jobs
        @job_index = job_index
        @job = @jobs[@job_index]
        @step_index = step_index
        @step = @job.steps[@step_index]
        @frame = frame
      end

      # Performs the step unless it already has a frame.
      #
      # @return [Frame] the frame of the step
      # @raise [InvalidAction] when the step's action is not in Actions::ALL
      def call
        raise(InvalidAction, "Invalid action #{@step.action}") unless Actions::ALL.include?(@step.action)

        @step.frame ||= send(:"perform_#{@step.action}")
      end

      private

      # Enqueues one payload per item collected by the step's block.
      def perform_parallelize
        enum = Enumerator.new do |yielder|
          collector = proc { |item| yielder << item }

          @step.block.call(collector)
        end

        with_distributed_job do |distributed_job|
          push_and_wait(distributed_job, enum) do |item, part|
            enqueue(token: distributed_job.token, part: part, item: item)
          end

          next_frame(distributed_job)
        end
      end

      # Enqueues one payload per partition of the previous frame.
      def perform_map
        with_distributed_job do |distributed_job|
          enqueue_partitions(distributed_job)

          next_frame(distributed_job)
        end
      end

      # Enqueues one payload per partition of the previous frame.
      def perform_reduce
        with_distributed_job do |distributed_job|
          enqueue_partitions(distributed_job)

          next_frame(distributed_job)
        end
      end

      # Enqueues one payload per partition of the previous frame and passes
      # that frame through unchanged.
      def perform_each_partition
        with_distributed_job do |distributed_job|
          enqueue_partitions(distributed_job)

          @frame
        end
      end

      # Pushes every partition index of the incoming frame as a part and
      # waits for the distributed job to complete.
      def enqueue_partitions(distributed_job)
        push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
          enqueue(token: distributed_job.token, part: part, partition: partition)
        end
      end

      # Frame describing this step's output.
      def next_frame(distributed_job)
        Frame.new(token: distributed_job.token, partitions: @step.args[:partitions])
      end

      # Serializes the payload as JSON and hands it to the configured enqueuer
      # together with the step's worker.
      def enqueue(token:, part:, **rest)
        Kraps.enqueuer.call(
          @step.args[:worker],
          JSON.generate(
            job_index: @job_index,
            step_index: @step_index,
            frame: @frame.to_h, # @frame is nil for the first step; nil.to_h is {}
            token: token,
            part: part,
            klass: @klass,
            args: @args,
            kwargs: @kwargs,
            **rest
          )
        )
      end

      # Yields a distributed job with a random token. On Interrupt the
      # distributed job is stopped before the interrupt is re-raised.
      def with_distributed_job
        distributed_job = Kraps.distributed_job_client.build(token: SecureRandom.hex)

        yield(distributed_job)
      rescue Interrupt
        distributed_job&.stop
        raise
      end

      # Pushes all elements of +enum+ as parts of the distributed job,
      # yielding each element with its part, then polls once per second until
      # the job is finished or stopped, updating the progress bar on the way.
      #
      # @raise [JobStopped] when the distributed job was stopped
      def push_and_wait(distributed_job, enum)
        progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{distributed_job.token}, %a, %c/%C (%p%) => #{@step.action}")

        begin
          total = 0

          # While parts are still being pushed the final total is unknown, so
          # the bar's total is updated from the running count.
          interval = Interval.new(1) do
            progress_bar.total = total
          end

          distributed_job.push_each(enum) do |item, part|
            total += 1
            interval.fire(timeout: 1)

            yield(item, part)
          end
        ensure
          interval&.stop
        end

        loop do
          progress_bar.total = distributed_job.total
          progress_bar.progress = progress_bar.total - distributed_job.count

          break if distributed_job.finished? || distributed_job.stopped?

          sleep(1)
        end

        raise(JobStopped, "The job was stopped") if distributed_job.stopped?
      ensure
        progress_bar&.stop
      end

      # Creates the progress bar; output is discarded unless
      # Kraps.show_progress? is truthy.
      def build_progress_bar(format)
        options = { format: format, total: 1, autofinish: false }
        options[:output] = ProgressBar::Outputs::Null unless Kraps.show_progress?

        ProgressBar.create(options)
      end
    end
  end
end
data/lib/kraps/step.rb ADDED
@@ -0,0 +1,3 @@
1
module Kraps
  # One step of a job: the action to perform, its arguments, the user block
  # and, once the step has been run, the frame it produced.
  Step = Struct.new(:action, :args, :block, :frame, keyword_init: true)
end
@@ -0,0 +1,29 @@
1
require "fileutils"
require "securerandom"
require "tmpdir"

module Kraps
  # A uniquely named, empty file in the system temp directory. The file is
  # removed via #unlink, at the end of the block form, or by a finalizer once
  # the object is garbage collected.
  class TempPath
    attr_reader :path

    # Creates the file. The name is built from the optional prefix, 16 random
    # hex characters, the process id and the optional suffix, joined by ".".
    #
    # @param prefix [String, nil] optional first name component
    # @param suffix [String, nil] optional last name component
    # @yieldparam temp_path [TempPath] the new instance; the file is removed
    #   when the block returns or raises
    def initialize(prefix: nil, suffix: nil)
      @path = File.join(Dir.tmpdir, [prefix, SecureRandom.hex[0, 16], Process.pid, suffix].compact.join("."))

      # CREAT | EXCL makes the open fail instead of reusing an existing file
      File.open(@path, File::CREAT | File::EXCL) {}

      # The finalizer proc is built in a class method so it only captures the
      # path string, never the instance itself.
      ObjectSpace.define_finalizer(self, self.class.finalize(@path))

      return unless block_given?

      begin
        yield(self)
      ensure
        unlink
      end
    end

    # Removes the file; does nothing when it is already gone.
    def unlink
      FileUtils.rm_f(@path)
    end

    # @param path [String] file to remove
    # @return [Proc] finalizer that removes +path+
    def self.finalize(path)
      proc { FileUtils.rm_f(path) }
    end
  end
end
@@ -0,0 +1,34 @@
1
require "monitor"

module Kraps
  # A collection of TempPath objects whose operations are serialized through
  # a monitor.
  class TempPaths
    include MonitorMixin
    include Enumerable

    def initialize
      super

      @temp_paths = []
    end

    # Creates a new TempPath and remembers it.
    #
    # @return [TempPath] the newly created temp path
    def add
      synchronize do
        temp_path = TempPath.new
        @temp_paths << temp_path
        temp_path
      end
    end

    # Unlinks every temp path added so far.
    def unlink
      synchronize do
        @temp_paths.each(&:unlink)
      end
    end

    # Iterates over the temp paths in insertion order while holding the
    # monitor.
    #
    # @yieldparam temp_path [TempPath]
    # @return [Enumerator] when no block is given
    def each(&block)
      return enum_for(__method__) unless block_given?

      synchronize do
        @temp_paths.each(&block)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
require "monitor"

module Kraps
  # A FIFO queue whose #deq gives up after a timeout instead of blocking
  # forever.
  class TimeoutQueue
    include MonitorMixin

    def initialize
      super

      @cond = new_cond
      @queue = []
    end

    # Appends +item+ and wakes up one waiting consumer.
    #
    # @param item [Object] may be nil
    def enq(item)
      synchronize do
        @queue << item
        @cond.signal
      end
    end

    # Removes and returns the oldest item, waiting up to +timeout+ seconds for
    # one to arrive.
    #
    # @param timeout [Numeric, nil] seconds to wait; nil waits indefinitely
    # @return [Object, nil] the item, or nil when the timeout elapsed (a nil
    #   item that was enqueued is indistinguishable from a timeout)
    def deq(timeout:)
      deadline = timeout && (Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout)

      synchronize do
        # Re-check after every wake-up: the wait can return before an item is
        # available (spurious wake-up, or another consumer took the item), in
        # which case waiting continues for the remaining time.
        while @queue.empty?
          remaining = deadline && (deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC))

          break if remaining && remaining <= 0

          @cond.wait(remaining)
        end

        @queue.shift # nil when still empty
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Kraps
  # Version of the gem.
  VERSION = "0.1.0"
end
@@ -0,0 +1,183 @@
1
require "json"
require "tempfile"

module Kraps
  # Executes a single part of a step, as described by the JSON payload that
  # the runner enqueued.
  class Worker
    # @param json [String] the enqueued payload
    # @param memory_limit [Integer] passed to MapReduce::Mapper
    # @param chunk_limit [Integer] passed to MapReduce::Reducer#reduce
    # @param concurrency [Integer] number of threads used for downloads
    def initialize(json, memory_limit:, chunk_limit:, concurrency:)
      @args = JSON.parse(json)
      @memory_limit = memory_limit
      @chunk_limit = chunk_limit
      @concurrency = concurrency
    end

    # Performs the step's action for this part and marks the part as done.
    # Does nothing when the distributed job has already been stopped.
    #
    # @param retries [Integer] maximum number of attempts
    # @raise [InvalidAction] when the step's action is not in Actions::ALL
    def call(retries: 3)
      return if distributed_job.stopped?

      raise(InvalidAction, "Invalid action #{step.action}") unless Actions::ALL.include?(step.action)

      with_retries(retries) do # TODO: allow to use queue based retries
        send(:"perform_#{step.action}")

        distributed_job.done(@args["part"])
      end
    end

    private

    # Maps the payload's item to a key with a nil value and stores the
    # resulting chunk of every partition.
    def perform_parallelize
      implementation = Class.new do
        def map(key)
          yield(key, nil)
        end
      end

      mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
      mapper.map(@args["item"])

      mapper.shuffle do |partition, tempfile| # TODO: upload in parallel
        store_chunk(partition, tempfile)
      end
    end

    # Downloads the chunks of the input partition, passes every key/value
    # pair to the step's block and stores the resulting chunk of every
    # partition.
    def perform_map
      temp_paths = TempPaths.new

      download_input_chunks(temp_paths)

      current_step = step

      implementation = Object.new
      implementation.define_singleton_method(:map) do |key, value, &block|
        current_step.block.call(key, value, block)
      end

      mapper = MapReduce::Mapper.new(implementation, partitioner: partitioner, memory_limit: @memory_limit)

      temp_paths.each do |temp_path|
        File.open(temp_path.path) do |stream|
          stream.each_line do |line|
            key, value = JSON.parse(line)

            mapper.map(key, value)
          end
        end
      end

      mapper.shuffle do |partition, tempfile| # TODO: upload in parallel
        store_chunk(partition, tempfile)
      end
    ensure
      temp_paths&.unlink
    end

    # Downloads the chunks of the input partition into the reducer, reduces
    # them with the step's block and stores the result as this part's chunk
    # of the same partition.
    def perform_reduce
      current_step = step

      implementation = Object.new
      implementation.define_singleton_method(:reduce) do |key, value1, value2|
        current_step.block.call(key, value1, value2)
      end

      reducer = MapReduce::Reducer.new(implementation)

      Parallelizer.each(input_files, @concurrency) do |file|
        Kraps.driver.driver.download(file, Kraps.driver.bucket, reducer.add_chunk)
      end

      tempfile = Tempfile.new

      reducer.reduce(chunk_limit: @chunk_limit) do |key, value|
        tempfile.puts(JSON.generate([key, value]))
      end

      store_chunk(@args["partition"], tempfile.tap(&:rewind))
    ensure
      tempfile&.close(true)
    end

    # Downloads the chunks of the input partition and calls the step's block
    # with the partition and a lazy enumerator over the parsed lines.
    def perform_each_partition
      temp_paths = TempPaths.new

      download_input_chunks(temp_paths)

      enum = Enumerator::Lazy.new(temp_paths) do |yielder, temp_path|
        File.open(temp_path.path) do |stream|
          stream.each_line do |line|
            yielder << JSON.parse(line)
          end
        end
      end

      step.block.call(@args["partition"], enum)
    ensure
      temp_paths&.unlink
    end

    # Lists the files stored for the input partition of the previous frame.
    def input_files
      Kraps.driver.driver.list(
        Kraps.driver.bucket,
        prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/")
      )
    end

    # Downloads all input files, sorted by name, into new temp paths that are
    # registered in +temp_paths+ in that same order. The caller owns
    # +temp_paths+ and is responsible for unlinking it.
    def download_input_chunks(temp_paths)
      files = input_files.sort

      # Temp paths are allocated up front so their order matches the file order
      targets = files.each_with_object({}) do |file, index|
        index[file] = temp_paths.add
      end

      Parallelizer.each(files, @concurrency) do |file|
        Kraps.driver.driver.download(file, Kraps.driver.bucket, targets[file].path)
      end
    end

    # Stores +io+ as this part's chunk of +partition+ under the job's token.
    def store_chunk(partition, io)
      Kraps.driver.driver.store(
        Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json"),
        io,
        Kraps.driver.bucket
      )
    end

    # Runs the block, making up to +num_retries+ attempts in total when it
    # raises a StandardError, pausing 5 seconds between attempts. After the
    # last failed attempt the distributed job is stopped and the error is
    # re-raised. A Kraps::Error is never retried: the distributed job is
    # stopped and the error is not propagated.
    def with_retries(num_retries)
      retries = 0

      begin
        yield
      rescue Kraps::Error
        distributed_job.stop
      rescue StandardError
        retries += 1

        if retries >= num_retries
          distributed_job.stop
          raise
        end

        # Pause only when another attempt actually follows
        sleep(5)

        retry
      end
    end

    # Rebuilds the jobs by instantiating the payload's class and calling it
    # with the original arguments.
    def jobs
      # NOTE(review): the class name comes from the queue payload; it is
      # resolved as-is, so payloads must come from a trusted source.
      @jobs ||= Array(Object.const_get(@args["klass"]).new.call(*@args["args"], **@args["kwargs"].transform_keys(&:to_sym)))
    end

    def job
      @job ||= begin
        job_index = @args["job_index"]

        jobs[job_index] || raise(InvalidJob, "Can't find job #{job_index}")
      end
    end

    def steps
      @steps ||= job.steps
    end

    def step
      @step ||= begin
        step_index = @args["step_index"]

        steps[step_index] || raise(InvalidStep, "Can't find step #{step_index}")
      end
    end

    def partitioner
      @partitioner ||= step.args[:partitioner]
    end

    def distributed_job
      @distributed_job ||= Kraps.distributed_job_client.build(token: @args["token"])
    end
  end
end
data/lib/kraps.rb ADDED
@@ -0,0 +1,48 @@
1
+ require_relative "kraps/version"
2
+ require_relative "kraps/drivers"
3
+ require_relative "kraps/actions"
4
+ require_relative "kraps/parallelizer"
5
+ require_relative "kraps/temp_path"
6
+ require_relative "kraps/temp_paths"
7
+ require_relative "kraps/timeout_queue"
8
+ require_relative "kraps/interval"
9
+ require_relative "kraps/job"
10
+ require_relative "kraps/runner"
11
+ require_relative "kraps/step"
12
+ require_relative "kraps/frame"
13
+ require_relative "kraps/worker"
14
+ require "distributed_job"
15
+ require "ruby-progressbar"
16
+ require "ruby-progressbar/outputs/null"
17
+ require "map_reduce"
18
+ require "redis"
19
+
20
module Kraps
  # Base class of all errors raised by Kraps itself.
  class Error < StandardError; end
  class InvalidAction < Error; end
  class InvalidStep < Error; end
  # Raised by the worker when the payload's job index has no matching job.
  class InvalidJob < Error; end
  class JobStopped < Error; end

  # Sets the global configuration.
  #
  # @param driver [Object] storage driver wrapper, returned by Kraps.driver
  # @param redis [Object] redis connection passed to DistributedJob::Client
  # @param namespace [String, nil] namespace passed to DistributedJob::Client
  # @param job_ttl [Integer] default TTL passed to DistributedJob::Client,
  #   24 * 60 * 60 by default
  # @param show_progress [Boolean] returned by Kraps.show_progress?
  # @param enqueuer [#call] called with a worker and a JSON payload; by
  #   default calls +worker.perform_async(json)+
  def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
    @driver = driver
    @distributed_job_client = DistributedJob::Client.new(redis: redis, namespace: namespace, default_ttl: job_ttl)
    @show_progress = show_progress
    @enqueuer = enqueuer
  end

  # @return [Object, nil] the configured driver, nil before .configure
  def self.driver
    @driver
  end

  # @return [DistributedJob::Client, nil] the client built by .configure
  def self.distributed_job_client
    @distributed_job_client
  end

  # @return [Boolean, nil] the configured show_progress value
  def self.show_progress?
    @show_progress
  end

  # @return [#call, nil] the configured enqueuer
  def self.enqueuer
    @enqueuer
  end
end