kraps 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,37 @@
1
+ require "attachie"
2
+ require "attachie/s3_driver"
3
+ require "attachie/fake_driver"
4
+
5
module Kraps
  # Storage driver wrappers. Every driver exposes the underlying Attachie
  # driver (+driver+), the +bucket+ to use and an optional key +prefix+.
  module Drivers
    # Behaviour shared by all drivers.
    module Driver
      # Joins +path+ onto the configured prefix.
      #
      # nil segments (e.g. when no prefix was configured) are dropped before
      # joining, so the path is returned on its own in that case.
      def with_prefix(path)
        segments = [@prefix, path].reject(&:nil?)

        File.join(*segments)
      end
    end

    # Driver backed by Attachie::S3Driver, built from the given S3 client.
    class S3Driver
      include Driver

      attr_reader :driver, :bucket, :prefix

      def initialize(s3_client:, bucket:, prefix: nil)
        @bucket = bucket
        @prefix = prefix
        @driver = Attachie::S3Driver.new(s3_client)
      end
    end

    # Driver backed by Attachie::FakeDriver.
    class FakeDriver
      include Driver

      attr_reader :driver, :bucket, :prefix

      def initialize(bucket:, prefix: nil)
        @bucket = bucket
        @prefix = prefix
        @driver = Attachie::FakeDriver.new
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Kraps
  # Value object with two keyword-initialized members: +token+ and
  # +partitions+.
  Frame = Struct.new(*%i[token partitions], keyword_init: true)
end
@@ -0,0 +1,34 @@
1
module Kraps
  # Runs a block on a background thread, either on demand (#fire) or
  # whenever waiting for a fire request times out after +timeout+ seconds.
  class Interval
    include MonitorMixin

    # timeout - maximum time the background thread waits for a fire request
    #           before it runs the block anyway
    # block   - the callback to run on the background thread
    def initialize(timeout, &block)
      super()

      @thread_queue = TimeoutQueue.new
      @main_queue = TimeoutQueue.new
      @stopped = false

      @thread = Thread.new { run(timeout, block) }
    end

    # Asks the background thread to run the callback now and waits up to
    # +timeout+ for its acknowledgement.
    def fire(timeout:)
      @thread_queue.enq(1)
      @main_queue.deq(timeout: timeout)
    end

    # Flags the interval as stopped, wakes the background thread up and
    # waits for it to terminate.
    def stop
      @stopped = true
      @thread_queue.enq(nil)
      @thread.join
    end

    private

    # Body of the background thread.
    def run(timeout, callback)
      until @stopped
        request = @thread_queue.deq(timeout: timeout)

        callback.call unless @stopped

        # Only explicit fire requests are acknowledged; timeouts and the
        # nil pushed by #stop are not.
        @main_queue.enq(1) if request
      end
    end
  end
end
data/lib/kraps/job.rb ADDED
@@ -0,0 +1,62 @@
1
module Kraps
  # Immutable-style builder for a list of steps. Every builder method
  # returns a copy of the job with one more step appended; the receiver is
  # left untouched.
  class Job
    attr_reader :steps

    # worker - the default worker recorded in every step's args
    def initialize(worker:)
      @worker = worker
      @steps = []
      @partitions = 0
      @partitioner = MapReduce::HashPartitioner.new(@partitions)
    end

    # Appends a PARALLELIZE step and sets the partition count and
    # partitioner used by subsequent steps.
    def parallelize(partitions:, partitioner: MapReduce::HashPartitioner.new(partitions), worker: @worker, &block)
      derive do
        @partitions = partitions
        @partitioner = partitioner

        push_step(Actions::PARALLELIZE, worker, block)
      end
    end

    # Appends a MAP step. When +partitions+ and/or +partitioner+ are given
    # they replace the current ones; a missing partitioner defaults to a
    # hash partitioner over +partitions+.
    def map(partitions: nil, partitioner: nil, worker: @worker, &block)
      derive do
        @partitions = partitions if partitions

        if partitioner || partitions
          @partitioner = partitioner || MapReduce::HashPartitioner.new(partitions)
        end

        push_step(Actions::MAP, worker, block)
      end
    end

    # Appends a REDUCE step.
    def reduce(worker: @worker, &block)
      derive { push_step(Actions::REDUCE, worker, block) }
    end

    # Appends an EACH_PARTITION step.
    def each_partition(worker: @worker, &block)
      derive { push_step(Actions::EACH_PARTITION, worker, block) }
    end

    # Appends a MAP step that re-emits every pair unchanged, thereby only
    # changing the partitioning.
    def repartition(partitions:, partitioner: nil, worker: @worker)
      map(partitions: partitions, partitioner: partitioner, worker: worker) do |key, value, collector|
        collector.call(key, value)
      end
    end

    # Returns a copy of the job owning its own steps array. The step
    # objects themselves are shared with the receiver.
    def fresh
      copy = dup
      copy.instance_variable_set(:@steps, @steps.dup)
      copy
    end

    private

    # Evaluates +changes+ in the context of a fresh copy and returns it.
    def derive(&changes)
      copy = fresh
      copy.instance_eval(&changes)
      copy
    end

    # Appends a step for +action+ using the current partitioning.
    def push_step(action, worker, block)
      step_args = { partitions: @partitions, partitioner: @partitioner, worker: worker }

      @steps << Step.new(action: action, args: step_args, block: block)
    end
  end
end
@@ -0,0 +1,29 @@
1
module Kraps
  # Processes the elements of an enumerable concurrently using a fixed
  # number of threads.
  class Parallelizer
    # Yields every element of +enum+ to the given block, spreading the work
    # across +num_threads+ threads.
    #
    # enum        - the elements to process; fully enumerated up front
    # num_threads - the number of worker threads to start
    #
    # Returns +enum+.
    #
    # Raises the first StandardError raised by the block, as found when
    # inspecting the threads in creation order. Once a block call failed,
    # the other threads stop picking up new elements.
    def self.each(enum, num_threads)
      queue = Queue.new

      enum.each { |element| queue.push(element) }

      stopped = false

      threads = Array.new(num_threads) do
        Thread.new do
          until stopped
            begin
              element = queue.pop(true)
            rescue ThreadError
              # Queue empty. Only the non-blocking pop is guarded, such that
              # a ThreadError raised by the block itself is not mistaken for
              # an empty queue and silently dropped.
              break
            end

            yield element
          end

          nil
        rescue StandardError => e
          stopped = true

          # The exception becomes the thread's value and is re-raised below
          e
        end
      end

      threads.each(&:join).each do |thread|
        raise thread.value if thread.value.is_a?(Exception)
      end

      enum
    end
  end
end
@@ -0,0 +1,160 @@
1
module Kraps
  # Executes the jobs returned by a job class step by step.
  class Runner
    # klass - the job class; +klass.new.call+ must return one or more jobs
    def initialize(klass)
      @klass = klass
    end

    # Builds the jobs by calling the job class with the given arguments and
    # runs every step of every job in order, handing the frame produced by
    # a step over to the next one. Returns the array of jobs.
    def call(*args, **kwargs)
      jobs = Array(@klass.new.call(*args, **kwargs))

      jobs.each_with_index do |job, job_index|
        frame = nil

        job.steps.each_with_index do |_step, step_index|
          frame = StepRunner.new(
            klass: @klass,
            args: args,
            kwargs: kwargs,
            jobs: jobs,
            job_index: job_index,
            step_index: step_index,
            frame: frame
          ).call
        end
      end

      jobs
    end

    # Runs a single step: enqueues one message per part and waits until the
    # distributed job is finished or stopped.
    class StepRunner
      def initialize(klass:, args:, kwargs:, jobs:, job_index:, step_index:, frame:)
        @klass = klass
        @args = args
        @kwargs = kwargs
        @jobs = jobs
        @job_index = job_index
        @job = @jobs[@job_index]
        @step_index = step_index
        @step = @job.steps[@step_index]
        @frame = frame
      end

      # Performs the step unless it already has a frame and returns the
      # step's frame. The frame is memoized on the step object itself.
      #
      # Raises InvalidAction when the step's action is unknown.
      def call
        action = @step.action

        raise(InvalidAction, "Invalid action #{action}") unless Actions::ALL.include?(action)

        @step.frame ||= send(:"perform_#{action}")
      end

      private

      # Enqueues one message per item collected by the step's block.
      def perform_parallelize
        items = Enumerator.new do |yielder|
          @step.block.call(proc { |item| yielder << item })
        end

        with_distributed_job do |distributed_job|
          push_and_wait(distributed_job, items) do |item, part|
            enqueue(token: distributed_job.token, part: part, item: item)
          end

          new_frame(distributed_job)
        end
      end

      def perform_map
        with_distributed_job do |distributed_job|
          push_partitions(distributed_job)

          new_frame(distributed_job)
        end
      end

      def perform_reduce
        with_distributed_job do |distributed_job|
          push_partitions(distributed_job)

          new_frame(distributed_job)
        end
      end

      # Does not produce new data, hence the incoming frame is passed on.
      def perform_each_partition
        with_distributed_job do |distributed_job|
          push_partitions(distributed_job)

          @frame
        end
      end

      # Enqueues one message per partition of the incoming frame and waits
      # for completion.
      def push_partitions(distributed_job)
        push_and_wait(distributed_job, 0...@frame.partitions) do |partition, part|
          enqueue(token: distributed_job.token, part: part, partition: partition)
        end
      end

      # The frame describing the output written under the job's token.
      def new_frame(distributed_job)
        Frame.new(token: distributed_job.token, partitions: @step.args[:partitions])
      end

      # Hands a JSON message describing one part over to the configured
      # enqueuer, together with the step's worker.
      def enqueue(token:, part:, **rest)
        payload = {
          job_index: @job_index,
          step_index: @step_index,
          frame: @frame.to_h,
          token: token,
          part: part,
          klass: @klass,
          args: @args,
          kwargs: @kwargs,
          **rest
        }

        Kraps.enqueuer.call(@step.args[:worker], JSON.generate(payload))
      end

      # Yields a new distributed job with a random token. The job is
      # stopped when the process gets interrupted.
      def with_distributed_job
        distributed_job = Kraps.distributed_job_client.build(token: SecureRandom.hex)

        yield(distributed_job)
      rescue Interrupt
        distributed_job&.stop
        raise
      end

      # Pushes every element of +enum+ as a part of the distributed job,
      # yielding the element and its part, then blocks until the job is
      # done.
      #
      # Raises JobStopped when the job was stopped.
      def push_and_wait(distributed_job, enum)
        progress_bar = build_progress_bar("#{@klass}: job #{@job_index + 1}/#{@jobs.size}, step #{@step_index + 1}/#{@job.steps.size}, token #{distributed_job.token}, %a, %c/%C (%p%) => #{@step.action}")

        begin
          pushed = 0

          # Updates the progress bar total from a background thread
          interval = Interval.new(1) { progress_bar.total = pushed }

          distributed_job.push_each(enum) do |item, part|
            pushed += 1
            interval.fire(timeout: 1)

            yield(item, part)
          end
        ensure
          interval&.stop
        end

        await_completion(distributed_job, progress_bar)
      ensure
        progress_bar&.stop
      end

      # Polls the distributed job once per second, refreshing the progress
      # bar, until it is finished or stopped.
      def await_completion(distributed_job, progress_bar)
        loop do
          progress_bar.total = distributed_job.total
          progress_bar.progress = progress_bar.total - distributed_job.count

          break if distributed_job.finished? || distributed_job.stopped?

          sleep(1)
        end

        raise(JobStopped, "The job was stopped") if distributed_job.stopped?
      end

      # Creates the progress bar; output is discarded unless progress
      # reporting is enabled.
      def build_progress_bar(format)
        options = { format: format, total: 1, autofinish: false }
        options = options.merge(output: ProgressBar::Outputs::Null) unless Kraps.show_progress?

        ProgressBar.create(options)
      end
    end
  end
end
data/lib/kraps/step.rb ADDED
@@ -0,0 +1,3 @@
1
module Kraps
  # Value object with four keyword-initialized members: +action+, +args+,
  # +block+ and +frame+.
  Step = Struct.new(*%i[action args block frame], keyword_init: true)
end
@@ -0,0 +1,29 @@
1
module Kraps
  # A uniquely named, empty temporary file which is removed again via
  # #unlink, at the end of the block passed to .new, or by a finalizer.
  class TempPath
    attr_reader :path

    # Creates the file within Dir.tmpdir.
    #
    # prefix - optional first segment of the file name
    # suffix - optional last segment of the file name
    #
    # When a block is given, it is called (without arguments) and the file
    # is removed afterwards, even if the block raises.
    #
    # Raises Errno::EEXIST if the generated path already exists.
    def initialize(prefix: nil, suffix: nil)
      @path = File.join(Dir.tmpdir, [prefix, SecureRandom.hex[0, 16], Process.pid, suffix].compact.join("."))

      # The temp dir is usually shared between users, so the file is created
      # readable and writable by its owner only, like Tempfile does.
      File.open(@path, File::CREAT | File::EXCL, 0o600) {}

      ObjectSpace.define_finalizer(self, self.class.finalize(@path))

      return unless block_given?

      begin
        yield
      ensure
        unlink
      end
    end

    # Removes the file. A file that is already gone is not an error.
    def unlink
      FileUtils.rm_f(@path)
    end

    # Returns the finalizer proc removing +path+.
    #
    # The proc is a no-op in any process other than the one which built it,
    # such that a forked child being garbage collected or exiting does not
    # delete a file which is still in use by its parent.
    def self.finalize(path)
      owner_pid = Process.pid

      proc { FileUtils.rm_f(path) if Process.pid == owner_pid }
    end
  end
end
@@ -0,0 +1,34 @@
1
module Kraps
  # A synchronized, enumerable collection of TempPath objects.
  class TempPaths
    include MonitorMixin
    include Enumerable

    def initialize
      super

      @temp_paths = []
    end

    # Creates a new TempPath, registers it and returns it.
    def add
      synchronize do
        TempPath.new.tap { |temp_path| @temp_paths << temp_path }
      end
    end

    # Unlinks every registered temp path.
    def unlink
      synchronize do
        @temp_paths.each { |temp_path| temp_path.unlink }
      end
    end

    # Yields every registered temp path in insertion order while holding
    # the monitor. Returns an enumerator when no block is given.
    def each(&block)
      return enum_for(:each) unless block

      synchronize { @temp_paths.each(&block) }
    end
  end
end
@@ -0,0 +1,27 @@
1
module Kraps
  # A minimal thread-safe FIFO queue whose #deq gives up after a timeout.
  class TimeoutQueue
    include MonitorMixin

    def initialize
      super

      @cond = new_cond
      @queue = []
    end

    # Appends +item+ and wakes up one waiting consumer.
    def enq(item)
      synchronize do
        @queue << item
        @cond.signal
      end
    end

    # Removes and returns the oldest item, waiting for one to arrive if
    # the queue is empty.
    #
    # timeout - maximum number of seconds to wait; nil waits indefinitely
    #
    # Returns nil when no item arrived in time. A nil item that was
    # enqueued is returned as nil as well, so callers can not tell the two
    # apart.
    def deq(timeout:)
      deadline = timeout && (monotonic_now + timeout)

      synchronize do
        # Waiting happens in a loop, because a wakeup does not guarantee
        # that an item is available: wakeups can be spurious and another
        # consumer may have taken the item already. The remaining time is
        # recomputed to still honor the overall timeout.
        while @queue.empty?
          remaining = deadline && (deadline - monotonic_now)

          break if remaining && remaining <= 0

          @cond.wait(remaining)
        end

        @queue.shift
      end
    end

    private

    # Monotonic clock reading in seconds; unaffected by wall clock changes.
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Kraps
  # The version of the gem.
  VERSION = "0.1.0"
end
@@ -0,0 +1,183 @@
1
module Kraps
  # Executes a single part of a step, as described by a JSON message.
  class Worker
    # json         - the JSON message describing the part to process. Keys
    #                read by this class: klass, args, kwargs, job_index,
    #                step_index, frame, token, part, item and partition
    # memory_limit - passed on to MapReduce::Mapper
    # chunk_limit  - passed on to MapReduce::Reducer#reduce
    # concurrency  - number of threads used for downloading files
    def initialize(json, memory_limit:, chunk_limit:, concurrency:)
      @args = JSON.parse(json)
      @memory_limit = memory_limit
      @chunk_limit = chunk_limit
      @concurrency = concurrency
    end

    # Performs the step's action for the part and marks the part as done.
    # Does nothing if the distributed job has already been stopped.
    #
    # retries - the maximum number of attempts before the distributed job
    #           is stopped and the error is re-raised
    #
    # Raises InvalidAction when the step's action is unknown.
    def call(retries: 3)
      return if distributed_job.stopped?

      raise(InvalidAction, "Invalid action #{step.action}") unless Actions::ALL.include?(step.action)

      with_retries(retries) do # TODO: allow to use queue based retries
        send(:"perform_#{step.action}")

        distributed_job.done(@args["part"])
      end
    end

    private

    # Maps the single item of the message to the pair (item, nil) and
    # stores the resulting chunk of every partition.
    def perform_parallelize
      implementation = Class.new do
        def map(key)
          yield(key, nil)
        end
      end

      mapper = MapReduce::Mapper.new(implementation.new, partitioner: partitioner, memory_limit: @memory_limit)
      mapper.map(@args["item"])

      store_chunks(mapper)
    end

    # Feeds every pair of the input partition to the step's block and
    # stores the resulting chunk of every partition.
    def perform_map
      temp_paths = TempPaths.new

      download_input_files(temp_paths)

      current_step = step

      implementation = Object.new
      implementation.define_singleton_method(:map) do |key, value, &block|
        current_step.block.call(key, value, block)
      end

      mapper = MapReduce::Mapper.new(implementation, partitioner: partitioner, memory_limit: @memory_limit)

      temp_paths.each do |temp_path|
        File.open(temp_path.path) do |stream|
          stream.each_line do |line|
            key, value = JSON.parse(line)

            mapper.map(key, value)
          end
        end
      end

      store_chunks(mapper)
    ensure
      temp_paths&.unlink
    end

    # Reduces the chunks of the input partition using the step's block and
    # stores the result as a single chunk of the same partition.
    def perform_reduce
      current_step = step

      implementation = Object.new
      implementation.define_singleton_method(:reduce) do |key, value1, value2|
        current_step.block.call(key, value1, value2)
      end

      reducer = MapReduce::Reducer.new(implementation)

      Parallelizer.each(input_files, @concurrency) do |file|
        Kraps.driver.driver.download(file, Kraps.driver.bucket, reducer.add_chunk)
      end

      tempfile = Tempfile.new

      reducer.reduce(chunk_limit: @chunk_limit) do |key, value|
        tempfile.puts(JSON.generate([key, value]))
      end

      Kraps.driver.driver.store(output_key(@args["partition"]), tempfile.tap(&:rewind), Kraps.driver.bucket)
    ensure
      tempfile&.close(true)
    end

    # Calls the step's block with the partition number and a lazy
    # enumerator over the parsed lines of the input partition.
    def perform_each_partition
      temp_paths = TempPaths.new

      download_input_files(temp_paths)

      # The downloaded files are removed when this method returns, so the
      # block has to consume the enumerator while it is running.
      enum = Enumerator::Lazy.new(temp_paths) do |yielder, temp_path|
        File.open(temp_path.path) do |stream|
          stream.each_line do |line|
            yielder << JSON.parse(line)
          end
        end
      end

      step.block.call(@args["partition"], enum)
    ensure
      temp_paths&.unlink
    end

    # Lists the files of the input partition, i.e. the partition of the
    # previous step's frame this message refers to.
    def input_files
      Kraps.driver.driver.list(Kraps.driver.bucket, prefix: Kraps.driver.with_prefix("#{@args["frame"]["token"]}/#{@args["partition"]}/"))
    end

    # Downloads the files of the input partition concurrently, one temp
    # path per file. The temp paths are added to +temp_paths+ in sorted
    # file order before any download starts.
    def download_input_files(temp_paths)
      files = input_files.sort

      temp_paths_index = files.each_with_object({}) do |file, hash|
        hash[file] = temp_paths.add
      end

      Parallelizer.each(files, @concurrency) do |file|
        Kraps.driver.driver.download(file, Kraps.driver.bucket, temp_paths_index[file].path)
      end
    end

    # The key this part's chunk of +partition+ is stored under.
    def output_key(partition)
      Kraps.driver.with_prefix("#{@args["token"]}/#{partition}/chunk.#{@args["part"]}.json")
    end

    # Stores the chunk of every partition written by +mapper+.
    def store_chunks(mapper)
      mapper.shuffle do |partition, tempfile| # TODO: upload in parallel
        Kraps.driver.driver.store(output_key(partition), tempfile, Kraps.driver.bucket)
      end
    end

    # Runs the block, retrying it on StandardError with a pause of five
    # seconds between attempts. After +num_retries+ failed attempts the
    # distributed job is stopped and the last error is re-raised.
    #
    # A Kraps::Error stops the distributed job immediately and is not
    # re-raised.
    def with_retries(num_retries)
      retries = 0

      begin
        yield
      rescue Kraps::Error
        # NOTE(review): the error is swallowed after stopping the job.
        # Presumably intentional, as re-running a stopped job is pointless,
        # but confirm that callers do not need to see it.
        distributed_job.stop
      rescue StandardError
        retries += 1

        # Give up right away instead of sleeping before the final failure,
        # such that the job is stopped without an additional delay.
        if retries >= num_retries
          distributed_job.stop
          raise
        end

        sleep(5)

        retry
      end
    end

    # The jobs, rebuilt by calling the job class with the original args.
    def jobs
      @jobs ||= Array(@args["klass"].constantize.new.call(*@args["args"], **@args["kwargs"].transform_keys(&:to_sym)))
    end

    def job
      @job ||= begin
        job_index = @args["job_index"]

        jobs[job_index] || raise(InvalidJob, "Can't find job #{job_index}")
      end
    end

    def steps
      @steps ||= job.steps
    end

    def step
      @step ||= begin
        step_index = @args["step_index"]

        steps[step_index] || raise(InvalidStep, "Can't find step #{step_index}")
      end
    end

    def partitioner
      @partitioner ||= step.args[:partitioner]
    end

    def distributed_job
      @distributed_job ||= Kraps.distributed_job_client.build(token: @args["token"])
    end
  end
end
data/lib/kraps.rb ADDED
@@ -0,0 +1,48 @@
1
+ require_relative "kraps/version"
2
+ require_relative "kraps/drivers"
3
+ require_relative "kraps/actions"
4
+ require_relative "kraps/parallelizer"
5
+ require_relative "kraps/temp_path"
6
+ require_relative "kraps/temp_paths"
7
+ require_relative "kraps/timeout_queue"
8
+ require_relative "kraps/interval"
9
+ require_relative "kraps/job"
10
+ require_relative "kraps/runner"
11
+ require_relative "kraps/step"
12
+ require_relative "kraps/frame"
13
+ require_relative "kraps/worker"
14
+ require "distributed_job"
15
+ require "ruby-progressbar"
16
+ require "ruby-progressbar/outputs/null"
17
+ require "map_reduce"
18
+ require "redis"
19
+
20
# Top-level namespace holding the error classes and the global
# configuration.
module Kraps
  # Base class of all errors raised by Kraps.
  class Error < StandardError; end
  class InvalidAction < Error; end

  # Raised by Kraps::Worker when the job index of a message does not
  # resolve to a job.
  class InvalidJob < Error; end
  class InvalidStep < Error; end
  class JobStopped < Error; end

  # Configures Kraps. Must be called before running or processing jobs.
  #
  # driver        - the storage driver, see Kraps::Drivers
  # redis         - the redis connection passed to DistributedJob::Client
  # namespace     - the namespace passed to DistributedJob::Client
  # job_ttl       - passed to DistributedJob::Client as default_ttl;
  #                 defaults to the number of seconds of one day
  # show_progress - whether or not progress bars are printed
  # enqueuer      - a callable receiving the worker and the JSON message
  #                 of every part; calls +worker.perform_async(json)+ by
  #                 default
  def self.configure(driver:, redis: Redis.new, namespace: nil, job_ttl: 24 * 60 * 60, show_progress: true, enqueuer: ->(worker, json) { worker.perform_async(json) })
    @driver = driver
    @distributed_job_client = DistributedJob::Client.new(redis: redis, namespace: namespace, default_ttl: job_ttl)
    @show_progress = show_progress
    @enqueuer = enqueuer
  end

  # The configured storage driver.
  def self.driver
    @driver
  end

  # The DistributedJob::Client built by .configure.
  def self.distributed_job_client
    @distributed_job_client
  end

  # Whether or not progress bars are printed.
  def self.show_progress?
    @show_progress
  end

  # The configured enqueuer.
  def self.enqueuer
    @enqueuer
  end
end