rapidflow 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -1
- data/README.md +78 -63
- data/lib/rapidflow/batch.rb +9 -25
- data/lib/rapidflow/batch_builder.rb +16 -0
- data/lib/rapidflow/counter.rb +1 -1
- data/lib/rapidflow/errors.rb +7 -0
- data/lib/rapidflow/pipeline.rb +1 -1
- data/lib/rapidflow/stage.rb +9 -1
- data/lib/rapidflow/version.rb +2 -2
- data/lib/rapidflow/work_item.rb +1 -1
- data/lib/rapidflow.rb +3 -1
- data/scripts/benchmark/benchmark_api_request_process_and_storing.rb +11 -11
- data/scripts/benchmark/benchmark_images.rb +6 -6
- data/scripts/benchmark/simulated_data_processing.rb +6 -6
- data/sig/rapidflow.rbs +1 -1
- data/test/rapidflow/batch/config_error_test.rb +43 -0
- data/test/rapidflow/batch/error_handling_test.rb +211 -0
- data/test/rapidflow/batch_test.rb +71 -222
- data/test/rapidflow/counter_test.rb +1 -1
- data/test/rapidflow/pipeline_test.rb +67 -0
- data/test/rapidflow/stage_test.rb +110 -0
- data/test/rapidflow/work_item_test.rb +1 -1
- metadata +7 -2
- data/.github/workflows/main.yml +0 -35
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 29bd4bb0143b3647bad175cce56fe0a3b519114c5f951dddb853d2ea233f07ce
|
|
4
|
+
data.tar.gz: 95453103f015ff84017a6f9bc1b99f606a7864e6517d523297f544711f2b8bc2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 05ac1a3b0ca195158082ba1abe3b698b80ccbbbc476d330b69167178223f39f93d4c3d16572472cceb7c4a910eecd06d765cbfab7ff7b60220fe1b82e8476eff
|
|
7
|
+
data.tar.gz: 7b7934103e0055302043c5cc3a17000f42f3d6e4fa4d405a3106239ee64c486d6c807963bf9407b442b1106b728993128824372eeece399476f986b668bf3080
|
data/CHANGELOG.md
CHANGED
|
@@ -5,7 +5,16 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
-
## [
|
|
8
|
+
## [0.2.0] - 2025.11.11
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- Validation for `workers` count. Worker count should be a positive integer.
|
|
13
|
+
|
|
14
|
+
### Changed
|
|
15
|
+
|
|
16
|
+
- Rename module `Rapidflow` to `RapidFlow`.
|
|
17
|
+
- Move custom error classes from the `RapidFlow::Batch` class to the `RapidFlow` module.
|
|
9
18
|
|
|
10
19
|
## [0.1.0] - 2025.11.01
|
|
11
20
|
|
data/README.md
CHANGED
|
@@ -1,20 +1,20 @@
|
|
|
1
|
-
# π
|
|
1
|
+
# π RapidFlow
|
|
2
2
|
|
|
3
3
|
βοΈπβ‘οΈπ¦π¨ππ
|
|
4
4
|
> A Ruby library for concurrent batch data processing through lightweight, composable flows.
|
|
5
5
|
|
|
6
|
+
[](https://badge.fury.io/rb/rapidflow)
|
|
6
7
|
[](LICENSE)
|
|
7
8
|
|
|
8
|
-
>
|
|
9
|
-
> may change without backward compatibility guarantees
|
|
9
|
+
> Note: ⚠️ This library is at a very early stage of development. The interfaces and APIs
|
|
10
|
+
> may change without backward compatibility guarantees in minor versions (0.[minor version].[patch]).
|
|
10
11
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
and data processing.
|
|
12
|
+
RapidFlow is a lightweight, concurrent pipeline processor for Ruby that transforms data through multiple stages using Ruby Threads.
|
|
13
|
+
Perfect for I/O-bound operations like web scraping, API calls, and data processing.
|
|
14
14
|
|
|
15
15
|
## Features
|
|
16
16
|
|
|
17
|
-
- π **Concurrent Processing** - Multiple workers per stage process items
|
|
17
|
+
- π **Concurrent Processing** - Multiple workers per stage process items concurrently
|
|
18
18
|
- π **True Pipelining** - Different stages process different items simultaneously
|
|
19
19
|
- π¦ **Order Preservation** - Results returned in the same order items were pushed
|
|
20
20
|
- π‘οΈ **Error Handling** - Captures exceptions without stopping the flow
|
|
@@ -54,7 +54,7 @@ Create a batch instance.
|
|
|
54
54
|
require 'rapidflow'
|
|
55
55
|
|
|
56
56
|
# Create a 3-stage processing batch. Workers can be configured per stage basis or will use the default amount if omitted.
|
|
57
|
-
scraper =
|
|
57
|
+
scraper = RapidFlow::Batch.build do
|
|
58
58
|
stage ->(url) { fetch_html(url) }, workers: 8 # Stage 1: Fetch HTML
|
|
59
59
|
stage ->(html) { parse_data(html) }, workers: 2 # Stage 2: Parse data
|
|
60
60
|
stage ->(data) { save_to_db(data) } # Stage 3: Save to a database
|
|
@@ -64,7 +64,7 @@ end
|
|
|
64
64
|
Alternatively, you can also initialize the batch with the following syntax:
|
|
65
65
|
|
|
66
66
|
```ruby
|
|
67
|
-
batch =
|
|
67
|
+
batch = RapidFlow::Batch.new(
|
|
68
68
|
{ fn: ->(url) { fetch_html(url) }, workers: 8 }, # Stage 1: Fetch HTML.
|
|
69
69
|
{ fn: ->(html) { parse_data(html) }, workers: 2 }, # Stage 2: Parse data
|
|
70
70
|
{ fn: ->(data) { save_to_db(data) } } # Stage 3: Save to database
|
|
@@ -88,8 +88,7 @@ Note that Once you call `Batch#results`, it will block the batch until all proce
|
|
|
88
88
|
longer push items to the batch instance.
|
|
89
89
|
|
|
90
90
|
The results are returned in the same order as the original items were pushed. Each result is an array of
|
|
91
|
-
`[data, error]`.
|
|
92
|
-
Otherwise `data` represent the final data that was successfully processed and `error` will be `nil`.
|
|
91
|
+
`[data, error]`. No error means the item was successfully processed through all the stages.
|
|
93
92
|
|
|
94
93
|
```ruby
|
|
95
94
|
results.each_with_index do |(data, error), index|
|
|
@@ -101,12 +100,58 @@ results.each_with_index do |(data, error), index|
|
|
|
101
100
|
end
|
|
102
101
|
```
|
|
103
102
|
|
|
103
|
+
## Error Handling
|
|
104
|
+
|
|
105
|
+
RapidFlow continues running even when errors occur, instead of stopping the entire pipeline.
|
|
106
|
+
|
|
107
|
+
When an item encounters an error at any stage, RapidFlow captures that error and moves the item to the
|
|
108
|
+
final results—skipping all remaining stages for that particular item.
|
|
109
|
+
|
|
110
|
+
Each result comes as a pair: `[data, error]`.
|
|
111
|
+
- If processing failed: `error` contains the Error instance, and `data` holds whatever transformed data existed
|
|
112
|
+
from the last successful stage (or original input data if the error occurred at the first stage).
|
|
113
|
+
- If processing succeeded: `data` contains the fully processed result, and `error` is `nil`.
|
|
114
|
+
|
|
115
|
+
```ruby
|
|
116
|
+
batch = RapidFlow::Batch.new(
|
|
117
|
+
{ fn: ->(url) { HTTP.get(url).body } }, # May raise network errors
|
|
118
|
+
{ fn: ->(body) { JSON.parse(body) } } # May raise JSON parse errors
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
urls.each { |url| batch.push(url) }
|
|
122
|
+
results = batch.results
|
|
123
|
+
|
|
124
|
+
results.each_with_index do |(data, error), index|
|
|
125
|
+
if error
|
|
126
|
+
# Original input if error happened at first stage. Otherwise, transformed data from the previous stage before the error happened
|
|
127
|
+
# It is preserved in 'data' for debugging if needed.
|
|
128
|
+
puts "Data state before error #{data}"
|
|
129
|
+
|
|
130
|
+
puts "Failed to process #{urls[index]}: #{error.class} - #{error.message}"
|
|
131
|
+
# Log error, retry, or handle gracefully
|
|
132
|
+
|
|
133
|
+
puts "Error backtrace: "
|
|
134
|
+
pp error.backtrace
|
|
135
|
+
# As any Exception contains the backtrace(https://docs.ruby-lang.org/en/master/Exception.html#method-i-backtrace),
|
|
136
|
+
# for further debugging, you can look into backtrace.
|
|
137
|
+
else
|
|
138
|
+
puts "Success: #{data}"
|
|
139
|
+
end
|
|
140
|
+
end
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
**Error behavior:**
|
|
144
|
+
- Exceptions are caught and returned with results
|
|
145
|
+
- The transformed data from the previous stage is preserved when an error occurs
|
|
146
|
+
- Errors in early stages skip remaining stages until they reach the result queue
|
|
147
|
+
- Other items continue processing (errors don't stop the batch)
|
|
148
|
+
|
|
104
149
|
## Usage Examples
|
|
105
150
|
|
|
106
151
|
### Web Scraping Pipeline
|
|
107
152
|
|
|
108
153
|
```ruby
|
|
109
|
-
scraper =
|
|
154
|
+
scraper = RapidFlow::Batch.build do
|
|
110
155
|
stage ->(url) {
|
|
111
156
|
# Fetch HTML (may take 1-2 seconds per URL)
|
|
112
157
|
HTTP.get(url).to_s
|
|
@@ -136,7 +181,7 @@ results = scraper.results
|
|
|
136
181
|
### Image Processing Pipeline
|
|
137
182
|
|
|
138
183
|
```ruby
|
|
139
|
-
processor =
|
|
184
|
+
processor = RapidFlow::Batch.build do
|
|
140
185
|
stage ->(path) { MiniMagick::Image.open(path) }, workers: 4 # Stage 1: Load image
|
|
141
186
|
stage ->(img) { img.resize('800x600'); img }, workers: 4 # Stage 2: Resize
|
|
142
187
|
stage ->(img) { img.colorspace('Gray'); img }, workers: 4 # Stage 3: Convert to grayscale
|
|
@@ -152,7 +197,7 @@ puts "Processed #{results.count { |_, err| err.nil? }} images successfully"
|
|
|
152
197
|
### API Data Enrichment
|
|
153
198
|
|
|
154
199
|
```ruby
|
|
155
|
-
enricher =
|
|
200
|
+
enricher = RapidFlow::Batch.build do
|
|
156
201
|
stage ->(user_id) {
|
|
157
202
|
# Fetch user data from API
|
|
158
203
|
api_client.get("/users/#{user_id}").parse
|
|
@@ -181,7 +226,7 @@ enriched_users = enricher.results
|
|
|
181
226
|
|
|
182
227
|
```ruby
|
|
183
228
|
# Extract, Transform, Load
|
|
184
|
-
etl =
|
|
229
|
+
etl = RapidFlow::Batch.build do
|
|
185
230
|
stage ->(filename) {
|
|
186
231
|
# Extract: Read CSV file
|
|
187
232
|
CSV.read(filename, headers: true).map(&:to_h)
|
|
@@ -211,45 +256,15 @@ puts "Loaded #{total_records} records"
|
|
|
211
256
|
```ruby
|
|
212
257
|
# Sometimes you just need parallel processing without multiple stages
|
|
213
258
|
# Fetch 20 URLs concurrently
|
|
214
|
-
fetcher =
|
|
259
|
+
fetcher = RapidFlow::Batch.new({ fn: ->(url) { HTTP.get(url).body }, workers: 20 })
|
|
215
260
|
|
|
216
261
|
urls.each { |url| fetcher.push(url) }
|
|
217
262
|
pages = fetcher.results
|
|
218
263
|
```
|
|
219
264
|
|
|
220
|
-
## Error Handling
|
|
221
|
-
|
|
222
|
-
Rapidflow captures exceptions without stopping the pipeline:
|
|
223
|
-
|
|
224
|
-
```ruby
|
|
225
|
-
batch = Rapidflow::Batch.new(
|
|
226
|
-
{ fn: ->(url) { HTTP.get(url).body } }, # May raise network errors
|
|
227
|
-
{ fn: ->(body) { JSON.parse(body) } } # May raise JSON parse errors
|
|
228
|
-
)
|
|
229
|
-
|
|
230
|
-
urls.each { |url| batch.push(url) }
|
|
231
|
-
results = batch.results
|
|
232
|
-
|
|
233
|
-
results.each_with_index do |(data, error), index|
|
|
234
|
-
if error
|
|
235
|
-
# Original input is preserved in 'data' for debugging
|
|
236
|
-
puts "Failed to process #{urls[index]}: #{error.class} - #{error.message}"
|
|
237
|
-
# Log error, retry, or handle gracefully
|
|
238
|
-
else
|
|
239
|
-
puts "Success: #{data}"
|
|
240
|
-
end
|
|
241
|
-
end
|
|
242
|
-
```
|
|
243
|
-
|
|
244
|
-
**Error behavior:**
|
|
245
|
-
- Exceptions are caught and returned with results
|
|
246
|
-
- The original data is preserved when an error occurs
|
|
247
|
-
- Errors in early stages passed down to remaining stages until they reach the result queue
|
|
248
|
-
- Other items continue processing (errors don't stop the batch)
|
|
249
|
-
|
|
250
265
|
## Architecture
|
|
251
266
|
|
|
252
|
-
|
|
267
|
+
RapidFlow uses a multi-stage pipeline architecture with concurrent workers at each stage.
|
|
253
268
|
|
|
254
269
|
### Pipeline Flow
|
|
255
270
|
|
|
@@ -353,10 +368,10 @@ Choose based on your workload:
|
|
|
353
368
|
|
|
354
369
|
```ruby
|
|
355
370
|
# High I/O workload - many workers
|
|
356
|
-
|
|
371
|
+
RapidFlow::Batch.new({ fn: lambda1, workers: 100 }, { fn: lambda2, workers: 50 })
|
|
357
372
|
|
|
358
373
|
# CPU-intensive - fewer workers
|
|
359
|
-
|
|
374
|
+
RapidFlow::Batch.new({ fn: lambda1, workers: 2 }, { fn: lambda2, workers: 2 })
|
|
360
375
|
```
|
|
361
376
|
|
|
362
377
|
### Balancing Workers for Stages
|
|
@@ -364,8 +379,8 @@ Rapidflow::Batch.new({ fn: lambda1, workers: 2 }, { fn: lambda2, workers: 2 })
|
|
|
364
379
|
For the best throughput, workers should be assigned based on the I/O-bound workload of each stage:
|
|
365
380
|
|
|
366
381
|
```ruby
|
|
367
|
-
# β Same number of workers even though stages have different I/O
|
|
368
|
-
|
|
382
|
+
# ❌ Same number of workers even though stages have different I/O load
|
|
383
|
+
RapidFlow::Batch.build do
|
|
369
384
|
stage ->(x) { sleep(10); x }, workers: 4 # 10 seconds - SLOW! (Assume a heavy or long-running I/O task)
|
|
370
385
|
stage ->(x) { sleep(0.1); x }, workers: 4 # 0.1 seconds - fast
|
|
371
386
|
stage ->(x) { sleep(0.1); x }, workers: 4 # 0.1 seconds - fast
|
|
@@ -373,7 +388,7 @@ Rapidflow::Batch.build do
|
|
|
373
388
|
end
|
|
374
389
|
|
|
375
390
|
# ✅ Balanced - workers are assigned based on I/O load
|
|
376
|
-
|
|
391
|
+
RapidFlow::Batch.build do
|
|
377
392
|
stage ->(x) { sleep(10); x }, workers: 16 # 10 seconds - SLOW!
|
|
378
393
|
stage ->(x) { sleep(0.1); x }, workers: 2 # 0.1 seconds - fast
|
|
379
394
|
stage ->(x) { sleep(0.1); x }, workers: 2 # 0.1 seconds - fast
|
|
@@ -383,7 +398,7 @@ end
|
|
|
383
398
|
|
|
384
399
|
### Memory Considerations
|
|
385
400
|
|
|
386
|
-
- Each queue can grow unbounded
|
|
401
|
+
- Each queue can grow unbounded—don't push millions of items without consuming results
|
|
387
402
|
- Workers hold items in memory during processing
|
|
388
403
|
- Memory usage ≈ (items in queues + items being processed) × item size
|
|
389
404
|
|
|
@@ -403,11 +418,11 @@ end
|
|
|
403
418
|
- Share mutable state between workers without synchronization
|
|
404
419
|
- Push millions of items without processing results (memory issue)
|
|
405
420
|
- Create dependencies between items (order of execution not guaranteed)
|
|
406
|
-
- Nest
|
|
421
|
+
- Nest RapidFlow instances (use a single multi-stage batch instead)
|
|
407
422
|
|
|
408
423
|
## Comparison with Alternatives
|
|
409
424
|
|
|
410
|
-
| Feature |
|
|
425
|
+
| Feature | RapidFlow | Thread Pool | Sidekiq | Concurrent-Ruby |
|
|
411
426
|
|--------------------------|-----------|---------------|----------------|-----------------|
|
|
412
427
|
| **Multi-stage pipeline** | β
| β | β οΈ (manual) | β |
|
|
413
428
|
| **Order preservation** | β
| β | β | β |
|
|
@@ -424,13 +439,13 @@ The following result is taken from a benchmark run of [./scripts/benchmark/bench
|
|
|
424
439
|
```bash
|
|
425
440
|
/scripts/benchmark$ ruby benchmark_api_request_process_and_storing.rb 40 32
|
|
426
441
|
================================================================================
|
|
427
|
-
|
|
442
|
+
RapidFlow API Request, Process & Store Benchmark
|
|
428
443
|
================================================================================
|
|
429
444
|
|
|
430
445
|
Configuration:
|
|
431
446
|
API: dummyjson.com
|
|
432
447
|
User IDs to process: 1 to 40
|
|
433
|
-
Workers per stage (
|
|
448
|
+
Workers per stage (RapidFlow): 32
|
|
434
449
|
Stages: Fetch User → Fetch Product → Merge Data → Save to File
|
|
435
450
|
|
|
436
451
|
Processing 40 user IDs...
|
|
@@ -447,7 +462,7 @@ Results: 40 successful, 0 failed
|
|
|
447
462
|
2. RAPIDFLOW CONCURRENT PROCESSING
|
|
448
463
|
--------------------------------------------------------------------------------
|
|
449
464
|
user system total real
|
|
450
|
-
|
|
465
|
+
RapidFlow (32 workers): 0.217776 0.084002 0.301778 ( 0.612455)
|
|
451
466
|
|
|
452
467
|
Results: 40 successful, 0 failed
|
|
453
468
|
|
|
@@ -456,7 +471,7 @@ SUMMARY
|
|
|
456
471
|
================================================================================
|
|
457
472
|
|
|
458
473
|
Synchronous time: 13.18s
|
|
459
|
-
|
|
474
|
+
RapidFlow time: 0.61s
|
|
460
475
|
|
|
461
476
|
Speedup: 21.52x faster
|
|
462
477
|
Time saved: 12.57s
|
|
@@ -466,7 +481,7 @@ Performance gain: 2052.1%
|
|
|
466
481
|
FILE VERIFICATION
|
|
467
482
|
--------------------------------------------------------------------------------
|
|
468
483
|
Synchronous output: 40 files created
|
|
469
|
-
|
|
484
|
+
RapidFlow output: 40 files created
|
|
470
485
|
|
|
471
486
|
Sample output file: data_1.json
|
|
472
487
|
User ID: 1
|
|
@@ -481,11 +496,11 @@ PERFORMANCE ANALYSIS
|
|
|
481
496
|
|
|
482
497
|
Average time per item:
|
|
483
498
|
Synchronous: 329.51ms
|
|
484
|
-
|
|
499
|
+
RapidFlow: 15.31ms
|
|
485
500
|
|
|
486
501
|
Throughput (items/second):
|
|
487
502
|
Synchronous: 3.03 items/sec
|
|
488
|
-
|
|
503
|
+
RapidFlow: 65.31 items/sec
|
|
489
504
|
```
|
|
490
505
|
|
|
491
506
|
## Development
|
|
@@ -505,7 +520,7 @@ to be a safe, welcoming space for collaboration, and contributors are expected t
|
|
|
505
520
|
|
|
506
521
|
## Code of Conduct
|
|
507
522
|
|
|
508
|
-
Everyone interacting in the
|
|
523
|
+
Everyone interacting in the RapidFlow project's codebases, issue trackers, chat rooms and mailing lists is expected
|
|
509
524
|
to follow the [code of conduct](https://github.com/sinaru/rapidflow/blob/main/CODE_OF_CONDUCT.md).
|
|
510
525
|
|
|
511
526
|
## License
|
data/lib/rapidflow/batch.rb
CHANGED
|
@@ -1,17 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
module
|
|
3
|
+
module RapidFlow
|
|
4
4
|
class Batch
|
|
5
|
-
class ConfigError < RuntimeError; end
|
|
6
|
-
class RunError < RuntimeError; end
|
|
7
|
-
|
|
8
5
|
# DSL entrypoint
|
|
9
6
|
def self.build(&block)
|
|
10
|
-
builder =
|
|
7
|
+
builder = BatchBuilder.new
|
|
11
8
|
builder.instance_eval(&block) if block
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
9
|
+
batch = new(*builder.stages)
|
|
10
|
+
batch.start
|
|
11
|
+
batch
|
|
15
12
|
end
|
|
16
13
|
|
|
17
14
|
# Initialize with a list of stage configs: { fn: -> (input) { }, workers: Integer }, ...
|
|
@@ -27,13 +24,13 @@ module Rapidflow
|
|
|
27
24
|
@locked = false
|
|
28
25
|
@locked_mutex = Mutex.new
|
|
29
26
|
|
|
30
|
-
# to track if
|
|
27
|
+
# to track if batch is running
|
|
31
28
|
@running = false
|
|
32
29
|
@running_mutex = Mutex.new
|
|
33
30
|
end
|
|
34
31
|
|
|
35
32
|
def start
|
|
36
|
-
raise ConfigError, "Unable to start the
|
|
33
|
+
raise RapidFlow::ConfigError, "Unable to start the batch without any stages" if @stages.empty?
|
|
37
34
|
|
|
38
35
|
@stages.each(&:start)
|
|
39
36
|
mark_run!
|
|
@@ -57,19 +54,6 @@ module Rapidflow
|
|
|
57
54
|
|
|
58
55
|
private
|
|
59
56
|
|
|
60
|
-
# DSL builder
|
|
61
|
-
class Builder
|
|
62
|
-
attr_reader :stages
|
|
63
|
-
|
|
64
|
-
def initialize
|
|
65
|
-
@stages = []
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
def stage(lambda_fn, workers: 4)
|
|
69
|
-
@stages << { fn: lambda_fn, workers: workers }
|
|
70
|
-
end
|
|
71
|
-
end
|
|
72
|
-
|
|
73
57
|
def build_stages
|
|
74
58
|
stages = []
|
|
75
59
|
@lambdas.each_with_index do |lambda_fn, stage_index|
|
|
@@ -99,13 +83,13 @@ module Rapidflow
|
|
|
99
83
|
|
|
100
84
|
def ensure_not_finalized!
|
|
101
85
|
@locked_mutex.synchronize do
|
|
102
|
-
raise RunError, "Cannot push to a locked
|
|
86
|
+
raise RapidFlow::RunError, "Cannot push to a locked batch when results are requested" if @locked
|
|
103
87
|
end
|
|
104
88
|
end
|
|
105
89
|
|
|
106
90
|
def ensure_running!
|
|
107
91
|
@running_mutex.synchronize do
|
|
108
|
-
raise RunError, "Batch has not started" unless @running
|
|
92
|
+
raise RapidFlow::RunError, "Batch has not started" unless @running
|
|
109
93
|
end
|
|
110
94
|
end
|
|
111
95
|
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RapidFlow
|
|
4
|
+
# DSL builder
|
|
5
|
+
class BatchBuilder
|
|
6
|
+
attr_reader :stages
|
|
7
|
+
|
|
8
|
+
def initialize
|
|
9
|
+
@stages = []
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def stage(lambda_fn, workers: 4)
|
|
13
|
+
@stages << { fn: lambda_fn, workers: workers }
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
data/lib/rapidflow/counter.rb
CHANGED
data/lib/rapidflow/pipeline.rb
CHANGED
data/lib/rapidflow/stage.rb
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
module
|
|
3
|
+
module RapidFlow
|
|
4
4
|
# Represents a processing stage in the pipeline
|
|
5
5
|
class Stage
|
|
6
6
|
def initialize(stage_index:, lambda_fn:, workers:, is_final:, pipeline:)
|
|
7
|
+
validate_worker!(workers)
|
|
8
|
+
|
|
7
9
|
@stage_index = stage_index
|
|
8
10
|
@lambda_fn = lambda_fn
|
|
9
11
|
@workers = workers
|
|
@@ -56,5 +58,11 @@ module Rapidflow
|
|
|
56
58
|
@pipeline.enqueue(@stage_index + 1, work_item)
|
|
57
59
|
@pipeline.decrement_active_workers if @is_final
|
|
58
60
|
end
|
|
61
|
+
|
|
62
|
+
def validate_worker!(workers)
|
|
63
|
+
return if workers.kind_of?(Integer) && workers.positive?
|
|
64
|
+
|
|
65
|
+
raise RapidFlow::ConfigError, "Worker count should be a positive number for stage"
|
|
66
|
+
end
|
|
59
67
|
end
|
|
60
68
|
end
|
data/lib/rapidflow/version.rb
CHANGED
data/lib/rapidflow/work_item.rb
CHANGED
data/lib/rapidflow.rb
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "rapidflow/version"
|
|
4
|
+
require_relative "rapidflow/errors"
|
|
4
5
|
require_relative "rapidflow/counter"
|
|
5
6
|
require_relative "rapidflow/pipeline"
|
|
6
7
|
require_relative "rapidflow/work_item"
|
|
7
8
|
require_relative "rapidflow/stage"
|
|
9
|
+
require_relative "rapidflow/batch_builder"
|
|
8
10
|
require_relative "rapidflow/batch"
|
|
9
11
|
|
|
10
|
-
module
|
|
12
|
+
module RapidFlow
|
|
11
13
|
end
|
|
@@ -82,11 +82,11 @@ def process_data_synchronously(user_ids, output_dir)
|
|
|
82
82
|
results
|
|
83
83
|
end
|
|
84
84
|
|
|
85
|
-
# Solution 2:
|
|
85
|
+
# Solution 2: RapidFlow concurrent processing
|
|
86
86
|
def process_data_with_rapidflow(user_ids, output_dir, workers: 8)
|
|
87
87
|
FileUtils.mkdir_p(output_dir)
|
|
88
88
|
|
|
89
|
-
belt =
|
|
89
|
+
belt = RapidFlow::Batch.build do
|
|
90
90
|
# Stage 1: Fetch user data from API
|
|
91
91
|
stage ->(user_id) {
|
|
92
92
|
ApiClient.fetch_user(user_id)
|
|
@@ -119,13 +119,13 @@ end
|
|
|
119
119
|
# Run benchmark
|
|
120
120
|
def run_benchmark(max_user_id: 30, workers: 8)
|
|
121
121
|
puts "=" * 80
|
|
122
|
-
puts "
|
|
122
|
+
puts "RapidFlow API Request, Process & Store Benchmark"
|
|
123
123
|
puts "=" * 80
|
|
124
124
|
puts
|
|
125
125
|
puts "Configuration:"
|
|
126
126
|
puts " API: dummyjson.com"
|
|
127
127
|
puts " User IDs to process: 1 to #{max_user_id}"
|
|
128
|
-
puts " Workers per stage (
|
|
128
|
+
puts " Workers per stage (RapidFlow): #{workers}"
|
|
129
129
|
puts " Stages: Fetch User β Fetch Product β Merge Data β Save to File"
|
|
130
130
|
puts
|
|
131
131
|
|
|
@@ -167,7 +167,7 @@ def run_benchmark(max_user_id: 30, workers: 8)
|
|
|
167
167
|
end
|
|
168
168
|
puts
|
|
169
169
|
|
|
170
|
-
# Benchmark
|
|
170
|
+
# Benchmark RapidFlow
|
|
171
171
|
puts "-" * 80
|
|
172
172
|
puts "2. RAPIDFLOW CONCURRENT PROCESSING"
|
|
173
173
|
puts "-" * 80
|
|
@@ -176,7 +176,7 @@ def run_benchmark(max_user_id: 30, workers: 8)
|
|
|
176
176
|
rapidflow_results = nil
|
|
177
177
|
|
|
178
178
|
Benchmark.bm(30) do |x|
|
|
179
|
-
rapidflow_time = x.report("
|
|
179
|
+
rapidflow_time = x.report("RapidFlow (#{workers} workers):") do
|
|
180
180
|
rapidflow_results = process_data_with_rapidflow(user_ids, "tmp/output_rapidflow", workers: workers)
|
|
181
181
|
end
|
|
182
182
|
end
|
|
@@ -210,7 +210,7 @@ def run_benchmark(max_user_id: 30, workers: 8)
|
|
|
210
210
|
puts "=" * 80
|
|
211
211
|
puts
|
|
212
212
|
puts "Synchronous time: #{sync_real_time.round(2)}s"
|
|
213
|
-
puts "
|
|
213
|
+
puts "RapidFlow time: #{rapidflow_real_time.round(2)}s"
|
|
214
214
|
puts
|
|
215
215
|
puts "Speedup: #{speedup.round(2)}x faster"
|
|
216
216
|
puts "Time saved: #{time_saved.round(2)}s"
|
|
@@ -226,7 +226,7 @@ def run_benchmark(max_user_id: 30, workers: 8)
|
|
|
226
226
|
rapidflow_files = Dir.glob("tmp/output_rapidflow/data_*.json").length
|
|
227
227
|
|
|
228
228
|
puts "Synchronous output: #{sync_files} files created"
|
|
229
|
-
puts "
|
|
229
|
+
puts "RapidFlow output: #{rapidflow_files} files created"
|
|
230
230
|
puts
|
|
231
231
|
|
|
232
232
|
# Sample file content verification
|
|
@@ -256,7 +256,7 @@ def run_benchmark(max_user_id: 30, workers: 8)
|
|
|
256
256
|
|
|
257
257
|
puts "Average time per item:"
|
|
258
258
|
puts " Synchronous: #{(avg_time_per_item_sync * 1000).round(2)}ms"
|
|
259
|
-
puts "
|
|
259
|
+
puts " RapidFlow: #{(avg_time_per_item_rapid * 1000).round(2)}ms"
|
|
260
260
|
puts
|
|
261
261
|
|
|
262
262
|
throughput_sync = max_user_id / sync_real_time
|
|
@@ -264,7 +264,7 @@ def run_benchmark(max_user_id: 30, workers: 8)
|
|
|
264
264
|
|
|
265
265
|
puts "Throughput (items/second):"
|
|
266
266
|
puts " Synchronous: #{throughput_sync.round(2)} items/sec"
|
|
267
|
-
puts "
|
|
267
|
+
puts " RapidFlow: #{throughput_rapid.round(2)} items/sec"
|
|
268
268
|
puts
|
|
269
269
|
|
|
270
270
|
# Cleanup prompt
|
|
@@ -273,7 +273,7 @@ def run_benchmark(max_user_id: 30, workers: 8)
|
|
|
273
273
|
puts "-" * 80
|
|
274
274
|
puts
|
|
275
275
|
puts "Synchronous files: tmp/output_sync/"
|
|
276
|
-
puts "
|
|
276
|
+
puts "RapidFlow files: tmp/output_rapidflow/"
|
|
277
277
|
puts
|
|
278
278
|
puts "To clean up output directories, run:"
|
|
279
279
|
puts " rm -rf tmp/output_sync tmp/output_rapidflow"
|
|
@@ -89,11 +89,11 @@ def process_images_synchronously(image_paths, output_dir)
|
|
|
89
89
|
results
|
|
90
90
|
end
|
|
91
91
|
|
|
92
|
-
# Solution 2:
|
|
92
|
+
# Solution 2: RapidFlow concurrent processing
|
|
93
93
|
def process_images_with_rapidflow(image_paths, output_dir, workers: 4)
|
|
94
94
|
FileUtils.mkdir_p(output_dir)
|
|
95
95
|
|
|
96
|
-
belt =
|
|
96
|
+
belt = RapidFlow::Batch.build do
|
|
97
97
|
# Stage 1: Load image
|
|
98
98
|
stage ->(path) { ImageProcessor.load_image(path) }, workers: workers
|
|
99
99
|
|
|
@@ -114,7 +114,7 @@ end
|
|
|
114
114
|
# Run benchmark
|
|
115
115
|
def run_benchmark(sample_image_path, image_count: 50, workers: 4)
|
|
116
116
|
puts "=" * 80
|
|
117
|
-
puts "
|
|
117
|
+
puts "RapidFlow Image Processing Benchmark"
|
|
118
118
|
puts "=" * 80
|
|
119
119
|
puts
|
|
120
120
|
puts "Configuration:"
|
|
@@ -157,7 +157,7 @@ def run_benchmark(sample_image_path, image_count: 50, workers: 4)
|
|
|
157
157
|
end
|
|
158
158
|
puts
|
|
159
159
|
|
|
160
|
-
# Benchmark
|
|
160
|
+
# Benchmark RapidFlow
|
|
161
161
|
puts "-" * 80
|
|
162
162
|
puts "2. RAPIDFLOW CONCURRENT PROCESSING"
|
|
163
163
|
puts "-" * 80
|
|
@@ -166,7 +166,7 @@ def run_benchmark(sample_image_path, image_count: 50, workers: 4)
|
|
|
166
166
|
rapidflow_results = nil
|
|
167
167
|
|
|
168
168
|
Benchmark.bm(30) do |x|
|
|
169
|
-
rapidflow_time = x.report("
|
|
169
|
+
rapidflow_time = x.report("RapidFlow (#{workers} workers):") do
|
|
170
170
|
rapidflow_results = process_images_with_rapidflow(image_paths, "tmp/output_rapidflow", workers: workers)
|
|
171
171
|
end
|
|
172
172
|
end
|
|
@@ -191,7 +191,7 @@ def run_benchmark(sample_image_path, image_count: 50, workers: 4)
|
|
|
191
191
|
puts "=" * 80
|
|
192
192
|
puts
|
|
193
193
|
puts "Synchronous time: #{sync_real_time.round(2)}s"
|
|
194
|
-
puts "
|
|
194
|
+
puts "RapidFlow time: #{rapidflow_real_time.round(2)}s"
|
|
195
195
|
puts
|
|
196
196
|
puts "Speedup: #{speedup.round(2)}x faster"
|
|
197
197
|
puts "Time saved: #{time_saved.round(2)}s"
|