fractor 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +227 -102
- data/README.adoc +113 -1940
- data/docs/.lycheeignore +16 -0
- data/docs/Gemfile +24 -0
- data/docs/README.md +157 -0
- data/docs/_config.yml +151 -0
- data/docs/_features/error-handling.adoc +1192 -0
- data/docs/_features/index.adoc +80 -0
- data/docs/_features/monitoring.adoc +589 -0
- data/docs/_features/signal-handling.adoc +202 -0
- data/docs/_features/workflows.adoc +1235 -0
- data/docs/_guides/continuous-mode.adoc +736 -0
- data/docs/_guides/cookbook.adoc +1133 -0
- data/docs/_guides/index.adoc +55 -0
- data/docs/_guides/pipeline-mode.adoc +730 -0
- data/docs/_guides/troubleshooting.adoc +358 -0
- data/docs/_pages/architecture.adoc +1390 -0
- data/docs/_pages/core-concepts.adoc +1392 -0
- data/docs/_pages/design-principles.adoc +862 -0
- data/docs/_pages/getting-started.adoc +290 -0
- data/docs/_pages/installation.adoc +143 -0
- data/docs/_reference/api.adoc +1080 -0
- data/docs/_reference/error-reporting.adoc +670 -0
- data/docs/_reference/examples.adoc +181 -0
- data/docs/_reference/index.adoc +96 -0
- data/docs/_reference/troubleshooting.adoc +862 -0
- data/docs/_tutorials/complex-workflows.adoc +1022 -0
- data/docs/_tutorials/data-processing-pipeline.adoc +740 -0
- data/docs/_tutorials/first-application.adoc +384 -0
- data/docs/_tutorials/index.adoc +48 -0
- data/docs/_tutorials/long-running-services.adoc +931 -0
- data/docs/assets/images/favicon-16.png +0 -0
- data/docs/assets/images/favicon-32.png +0 -0
- data/docs/assets/images/favicon-48.png +0 -0
- data/docs/assets/images/favicon.ico +0 -0
- data/docs/assets/images/favicon.png +0 -0
- data/docs/assets/images/favicon.svg +45 -0
- data/docs/assets/images/fractor-icon.svg +49 -0
- data/docs/assets/images/fractor-logo.svg +61 -0
- data/docs/index.adoc +131 -0
- data/docs/lychee.toml +39 -0
- data/examples/api_aggregator/README.adoc +627 -0
- data/examples/api_aggregator/api_aggregator.rb +376 -0
- data/examples/auto_detection/README.adoc +407 -29
- data/examples/continuous_chat_common/message_protocol.rb +1 -1
- data/examples/error_reporting.rb +207 -0
- data/examples/file_processor/README.adoc +170 -0
- data/examples/file_processor/file_processor.rb +615 -0
- data/examples/file_processor/sample_files/invalid.csv +1 -0
- data/examples/file_processor/sample_files/orders.xml +24 -0
- data/examples/file_processor/sample_files/products.json +23 -0
- data/examples/file_processor/sample_files/users.csv +6 -0
- data/examples/hierarchical_hasher/README.adoc +629 -41
- data/examples/image_processor/README.adoc +610 -0
- data/examples/image_processor/image_processor.rb +349 -0
- data/examples/image_processor/processed_images/sample_10_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_1_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_2_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_3_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_4_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_5_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_6_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_7_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_8_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_9_processed.jpg.json +12 -0
- data/examples/image_processor/test_images/sample_1.png +1 -0
- data/examples/image_processor/test_images/sample_10.png +1 -0
- data/examples/image_processor/test_images/sample_2.png +1 -0
- data/examples/image_processor/test_images/sample_3.png +1 -0
- data/examples/image_processor/test_images/sample_4.png +1 -0
- data/examples/image_processor/test_images/sample_5.png +1 -0
- data/examples/image_processor/test_images/sample_6.png +1 -0
- data/examples/image_processor/test_images/sample_7.png +1 -0
- data/examples/image_processor/test_images/sample_8.png +1 -0
- data/examples/image_processor/test_images/sample_9.png +1 -0
- data/examples/log_analyzer/README.adoc +662 -0
- data/examples/log_analyzer/log_analyzer.rb +579 -0
- data/examples/log_analyzer/sample_logs/apache.log +20 -0
- data/examples/log_analyzer/sample_logs/json.log +15 -0
- data/examples/log_analyzer/sample_logs/nginx.log +15 -0
- data/examples/log_analyzer/sample_logs/rails.log +29 -0
- data/examples/multi_work_type/README.adoc +576 -26
- data/examples/performance_monitoring.rb +120 -0
- data/examples/pipeline_processing/README.adoc +740 -26
- data/examples/pipeline_processing/pipeline_processing.rb +2 -2
- data/examples/priority_work_example.rb +155 -0
- data/examples/producer_subscriber/README.adoc +889 -46
- data/examples/scatter_gather/README.adoc +829 -27
- data/examples/simple/README.adoc +347 -0
- data/examples/specialized_workers/README.adoc +622 -26
- data/examples/specialized_workers/specialized_workers.rb +44 -8
- data/examples/stream_processor/README.adoc +206 -0
- data/examples/stream_processor/stream_processor.rb +284 -0
- data/examples/web_scraper/README.adoc +625 -0
- data/examples/web_scraper/web_scraper.rb +285 -0
- data/examples/workflow/README.adoc +406 -0
- data/examples/workflow/circuit_breaker/README.adoc +360 -0
- data/examples/workflow/circuit_breaker/circuit_breaker_workflow.rb +225 -0
- data/examples/workflow/conditional/README.adoc +483 -0
- data/examples/workflow/conditional/conditional_workflow.rb +215 -0
- data/examples/workflow/dead_letter_queue/README.adoc +374 -0
- data/examples/workflow/dead_letter_queue/dead_letter_queue_workflow.rb +217 -0
- data/examples/workflow/fan_out/README.adoc +381 -0
- data/examples/workflow/fan_out/fan_out_workflow.rb +202 -0
- data/examples/workflow/retry/README.adoc +248 -0
- data/examples/workflow/retry/retry_workflow.rb +195 -0
- data/examples/workflow/simple_linear/README.adoc +267 -0
- data/examples/workflow/simple_linear/simple_linear_workflow.rb +175 -0
- data/examples/workflow/simplified/README.adoc +329 -0
- data/examples/workflow/simplified/simplified_workflow.rb +222 -0
- data/exe/fractor +10 -0
- data/lib/fractor/cli.rb +288 -0
- data/lib/fractor/configuration.rb +307 -0
- data/lib/fractor/continuous_server.rb +60 -65
- data/lib/fractor/error_formatter.rb +72 -0
- data/lib/fractor/error_report_generator.rb +152 -0
- data/lib/fractor/error_reporter.rb +244 -0
- data/lib/fractor/error_statistics.rb +147 -0
- data/lib/fractor/execution_tracer.rb +162 -0
- data/lib/fractor/logger.rb +230 -0
- data/lib/fractor/main_loop_handler.rb +406 -0
- data/lib/fractor/main_loop_handler3.rb +135 -0
- data/lib/fractor/main_loop_handler4.rb +299 -0
- data/lib/fractor/performance_metrics_collector.rb +181 -0
- data/lib/fractor/performance_monitor.rb +215 -0
- data/lib/fractor/performance_report_generator.rb +202 -0
- data/lib/fractor/priority_work.rb +93 -0
- data/lib/fractor/priority_work_queue.rb +189 -0
- data/lib/fractor/result_aggregator.rb +32 -0
- data/lib/fractor/shutdown_handler.rb +168 -0
- data/lib/fractor/signal_handler.rb +80 -0
- data/lib/fractor/supervisor.rb +382 -269
- data/lib/fractor/supervisor_logger.rb +88 -0
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/work.rb +12 -0
- data/lib/fractor/work_distribution_manager.rb +151 -0
- data/lib/fractor/work_queue.rb +20 -0
- data/lib/fractor/work_result.rb +181 -9
- data/lib/fractor/worker.rb +73 -0
- data/lib/fractor/workflow/builder.rb +210 -0
- data/lib/fractor/workflow/chain_builder.rb +169 -0
- data/lib/fractor/workflow/circuit_breaker.rb +183 -0
- data/lib/fractor/workflow/circuit_breaker_orchestrator.rb +208 -0
- data/lib/fractor/workflow/circuit_breaker_registry.rb +112 -0
- data/lib/fractor/workflow/dead_letter_queue.rb +334 -0
- data/lib/fractor/workflow/execution_hooks.rb +39 -0
- data/lib/fractor/workflow/execution_strategy.rb +225 -0
- data/lib/fractor/workflow/execution_trace.rb +134 -0
- data/lib/fractor/workflow/helpers.rb +191 -0
- data/lib/fractor/workflow/job.rb +290 -0
- data/lib/fractor/workflow/job_dependency_validator.rb +120 -0
- data/lib/fractor/workflow/logger.rb +110 -0
- data/lib/fractor/workflow/pre_execution_context.rb +193 -0
- data/lib/fractor/workflow/retry_config.rb +156 -0
- data/lib/fractor/workflow/retry_orchestrator.rb +184 -0
- data/lib/fractor/workflow/retry_strategy.rb +93 -0
- data/lib/fractor/workflow/structured_logger.rb +30 -0
- data/lib/fractor/workflow/type_compatibility_validator.rb +222 -0
- data/lib/fractor/workflow/visualizer.rb +211 -0
- data/lib/fractor/workflow/workflow_context.rb +132 -0
- data/lib/fractor/workflow/workflow_executor.rb +669 -0
- data/lib/fractor/workflow/workflow_result.rb +55 -0
- data/lib/fractor/workflow/workflow_validator.rb +295 -0
- data/lib/fractor/workflow.rb +333 -0
- data/lib/fractor/wrapped_ractor.rb +66 -101
- data/lib/fractor/wrapped_ractor3.rb +161 -0
- data/lib/fractor/wrapped_ractor4.rb +242 -0
- data/lib/fractor.rb +92 -4
- metadata +179 -6
- data/tests/sample.rb.bak +0 -309
- data/tests/sample_working.rb.bak +0 -209
|
@@ -5,71 +5,659 @@
|
|
|
5
5
|
|
|
6
6
|
toc::[]
|
|
7
7
|
|
|
8
|
-
==
|
|
8
|
+
== Purpose
|
|
9
9
|
|
|
10
|
-
The Hierarchical Hasher example demonstrates
|
|
10
|
+
The Hierarchical Hasher example demonstrates parallel file processing using a map-reduce pattern with Fractor. It showcases how to break large files into chunks, process them concurrently, and aggregate results while preserving order. This is a fundamental pattern for processing large datasets efficiently using parallel workers.
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
== Focus
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
* Demonstrating parallel data chunking patterns
|
|
16
|
-
* Showcasing result aggregation techniques
|
|
14
|
+
This example demonstrates:
|
|
17
15
|
|
|
18
|
-
|
|
16
|
+
* **Chunking patterns** for parallel data processing
|
|
17
|
+
* **Position-aware processing** to maintain data order
|
|
18
|
+
* **Result aggregation** with sorting and combining
|
|
19
|
+
* **Map-reduce architecture** in Fractor
|
|
20
|
+
* **Parallel I/O processing** for large files
|
|
21
|
+
* **Worker pool utilization** for CPU-bound tasks
|
|
19
22
|
|
|
20
|
-
|
|
23
|
+
== Architecture
|
|
21
24
|
|
|
22
|
-
===
|
|
25
|
+
=== Data Flow Overview
|
|
23
26
|
|
|
24
|
-
|
|
27
|
+
[source]
|
|
28
|
+
----
|
|
29
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
30
|
+
│ Input File │
|
|
31
|
+
│ "Lorem ipsum dolor sit amet consectetur adipiscing..." │
|
|
32
|
+
└─────────────────────────────────────────────────────────────┘
|
|
33
|
+
│
|
|
34
|
+
│ File.read(chunk_size)
|
|
35
|
+
▼
|
|
36
|
+
┌───────────────┐
|
|
37
|
+
│ File Chunking │
|
|
38
|
+
└───────────────┘
|
|
39
|
+
│
|
|
40
|
+
┌───────────────┼───────────────┐
|
|
41
|
+
│ │ │
|
|
42
|
+
▼ ▼ ▼
|
|
43
|
+
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
|
44
|
+
│ ChunkWork │ │ ChunkWork │ │ ChunkWork │
|
|
45
|
+
│ start=0 │ │ start=1024 │ │ start=2048 │
|
|
46
|
+
│ length=1024 │ │ length=1024 │ │ length=1024 │
|
|
47
|
+
│ data=[...] │ │ data=[...] │ │ data=[...] │
|
|
48
|
+
└──────────────┘ └──────────────┘ └──────────────┘
|
|
49
|
+
│ │ │
|
|
50
|
+
│ Parallel │ Processing │
|
|
51
|
+
│ │ │
|
|
52
|
+
▼ ▼ ▼
|
|
53
|
+
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
|
54
|
+
│ HashWorker 1 │ │ HashWorker 2 │ │ HashWorker 3 │
|
|
55
|
+
│ SHA256(...) │ │ SHA256(...) │ │ SHA256(...) │
|
|
56
|
+
└──────────────┘ └──────────────┘ └──────────────┘
|
|
57
|
+
│ │ │
|
|
58
|
+
│ │ │
|
|
59
|
+
▼ ▼ ▼
|
|
60
|
+
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
|
61
|
+
│ WorkResult │ │ WorkResult │ │ WorkResult │
|
|
62
|
+
│ start=0 │ │ start=1024 │ │ start=2048 │
|
|
63
|
+
│ hash=a3f2... │ │ hash=7b9c... │ │ hash=e5d1... │
|
|
64
|
+
└──────────────┘ └──────────────┘ └──────────────┘
|
|
65
|
+
│ │ │
|
|
66
|
+
└───────────────┼───────────────┘
|
|
67
|
+
│ Sort by start position
|
|
68
|
+
▼
|
|
69
|
+
┌───────────────┐
|
|
70
|
+
│ Aggregation │
|
|
71
|
+
│ a3f2...\n │
|
|
72
|
+
│ 7b9c...\n │
|
|
73
|
+
│ e5d1... │
|
|
74
|
+
└───────────────┘
|
|
75
|
+
│
|
|
76
|
+
│ SHA256(combined)
|
|
77
|
+
▼
|
|
78
|
+
┌───────────────┐
|
|
79
|
+
│ Final Hash │
|
|
80
|
+
│ c4e8a9b2f... │
|
|
81
|
+
└───────────────┘
|
|
82
|
+
----
|
|
83
|
+
|
|
84
|
+
=== Map-Reduce Pattern
|
|
85
|
+
|
|
86
|
+
[source]
|
|
87
|
+
----
|
|
88
|
+
Map Phase (Parallel Processing)
|
|
89
|
+
┌─────────────────────────────────────────────────────────┐
|
|
90
|
+
│ File Chunk 0 → HashWorker → hash_0 (a3f2...) │
|
|
91
|
+
│ File Chunk 1 → HashWorker → hash_1 (7b9c...) │
|
|
92
|
+
│ File Chunk 2 → HashWorker → hash_2 (e5d1...) │
|
|
93
|
+
│ File Chunk 3 → HashWorker → hash_3 (f1a8...) │
|
|
94
|
+
│ ... │
|
|
95
|
+
│ File Chunk N → HashWorker → hash_N (d9c4...) │
|
|
96
|
+
└─────────────────────────────────────────────────────────┘
|
|
97
|
+
│
|
|
98
|
+
▼
|
|
99
|
+
Reduce Phase (Sequential Aggregation)
|
|
100
|
+
┌─────────────────────────────────────────────────────────┐
|
|
101
|
+
│ 1. Sort results by chunk position │
|
|
102
|
+
│ 2. Concatenate: hash_0\nhash_1\nhash_2\n...hash_N │
|
|
103
|
+
│ 3. Final hash: SHA256(concatenated_hashes) │
|
|
104
|
+
└─────────────────────────────────────────────────────────┘
|
|
105
|
+
----
|
|
106
|
+
|
|
107
|
+
=== Performance Comparison
|
|
108
|
+
|
|
109
|
+
[source]
|
|
110
|
+
----
|
|
111
|
+
Sequential Processing:
|
|
112
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
113
|
+
│ Chunk 0 │ Chunk 1 │ Chunk 2 │ Chunk 3 │ ... │
|
|
114
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
115
|
+
Time: 100 units
|
|
116
|
+
|
|
117
|
+
Parallel Processing (4 workers):
|
|
118
|
+
━━━━━━━━━━━━━━━━━━━━━
|
|
119
|
+
│ Chunk 0 │ (Worker 0)
|
|
120
|
+
│ Chunk 1 │ (Worker 1)
|
|
121
|
+
│ Chunk 2 │ (Worker 2)
|
|
122
|
+
│ Chunk 3 │ (Worker 3)
|
|
123
|
+
━━━━━━━━━━━━━━━━━━━━━
|
|
124
|
+
Time: ~25 units (4x speedup)
|
|
125
|
+
|
|
126
|
+
Actual speedup depends on:
|
|
127
|
+
- Number of workers vs cores
|
|
128
|
+
- Chunk size vs overhead ratio
|
|
129
|
+
- I/O vs CPU bottleneck
|
|
130
|
+
----
|
|
131
|
+
|
|
132
|
+
== Key Components
|
|
133
|
+
|
|
134
|
+
=== ChunkWork: Position-Aware Work Unit
|
|
135
|
+
|
|
136
|
+
The `ChunkWork` class represents a chunk of the file with position metadata:
|
|
137
|
+
|
|
138
|
+
[source,ruby]
|
|
139
|
+
----
|
|
140
|
+
class ChunkWork < Fractor::Work
|
|
141
|
+
def initialize(data, start = 0, length = nil)
|
|
142
|
+
super({
|
|
143
|
+
data: data, # <1>
|
|
144
|
+
start: start, # <2>
|
|
145
|
+
length: length || data.bytesize, # <3>
|
|
146
|
+
})
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
def data
|
|
150
|
+
input[:data]
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def start
|
|
154
|
+
input[:start]
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def length
|
|
158
|
+
input[:length]
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
----
|
|
162
|
+
<1> The actual chunk data to be hashed
|
|
163
|
+
<2> Starting byte position in the original file (for ordering)
|
|
164
|
+
<3> Length of this chunk in bytes
|
|
165
|
+
|
|
166
|
+
Why position tracking matters:
|
|
167
|
+
|
|
168
|
+
* Enables **correct result ordering** during aggregation
|
|
169
|
+
* Supports **resumable processing** for interrupted jobs
|
|
170
|
+
* Allows **parallel validation** against sequential processing
|
|
171
|
+
* Facilitates **chunk-level debugging** and error tracking
|
|
25
172
|
|
|
26
|
-
|
|
27
|
-
* The starting position within the file
|
|
28
|
-
* The length of the chunk
|
|
173
|
+
=== HashWorker: Independent Chunk Processor
|
|
29
174
|
|
|
30
|
-
|
|
175
|
+
The `HashWorker` processes chunks without dependencies:
|
|
31
176
|
|
|
32
|
-
|
|
177
|
+
[source,ruby]
|
|
178
|
+
----
|
|
179
|
+
class HashWorker < Fractor::Worker
|
|
180
|
+
def process(work)
|
|
181
|
+
sleep(rand(0.01..0.05)) # <1>
|
|
182
|
+
|
|
183
|
+
hash = Digest::SHA256.hexdigest(work.data) # <2>
|
|
184
|
+
|
|
185
|
+
Fractor::WorkResult.new(
|
|
186
|
+
result: {
|
|
187
|
+
start: work.start, # <3>
|
|
188
|
+
length: work.length,
|
|
189
|
+
hash: hash,
|
|
190
|
+
},
|
|
191
|
+
work: work,
|
|
192
|
+
)
|
|
193
|
+
end
|
|
194
|
+
end
|
|
195
|
+
----
|
|
196
|
+
<1> Simulates variable processing time (real work would be I/O or computation)
|
|
197
|
+
<2> Calculates SHA-256 hash for the chunk (Ractor-safe)
|
|
198
|
+
<3> Preserves position metadata for correct aggregation
|
|
199
|
+
|
|
200
|
+
Key design aspects:
|
|
201
|
+
|
|
202
|
+
* **Stateless processing**: Each chunk is independent
|
|
203
|
+
* **Position preservation**: Results include original position
|
|
204
|
+
* **Error handling**: Wrapped in begin/rescue for robustness (omitted from the listing above for brevity; see "Error Handling" under Learning Points)
|
|
205
|
+
* **Ractor compatibility**: Uses SHA-256 instead of SHA-3
|
|
206
|
+
|
|
207
|
+
=== FileHasher: Orchestration and Aggregation
|
|
208
|
+
|
|
209
|
+
The `FileHasher` orchestrates the entire process:
|
|
210
|
+
|
|
211
|
+
[source,ruby]
|
|
212
|
+
----
|
|
213
|
+
class FileHasher
|
|
214
|
+
def hash_file
|
|
215
|
+
supervisor = Fractor::Supervisor.new(
|
|
216
|
+
worker_pools: [
|
|
217
|
+
{ worker_class: HashWorker, num_workers: @worker_count }, # <1>
|
|
218
|
+
],
|
|
219
|
+
)
|
|
33
220
|
|
|
34
|
-
|
|
35
|
-
|
|
221
|
+
load_file_chunks(supervisor) # <2>
|
|
222
|
+
supervisor.run # <3>
|
|
36
223
|
|
|
37
|
-
|
|
224
|
+
@final_hash = finalize_hash(supervisor.results) # <4>
|
|
225
|
+
end
|
|
38
226
|
|
|
39
|
-
|
|
227
|
+
private
|
|
40
228
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
229
|
+
def load_file_chunks(supervisor)
|
|
230
|
+
File.open(@file_path, "rb") do |file|
|
|
231
|
+
start_pos = 0
|
|
232
|
+
while (chunk = file.read(@chunk_size)) # <5>
|
|
233
|
+
work_items << ChunkWork.new(chunk, start_pos, chunk.length)
|
|
234
|
+
start_pos += chunk.length
|
|
235
|
+
end
|
|
236
|
+
end
|
|
237
|
+
supervisor.add_work_items(work_items)
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def finalize_hash(results_aggregator)
|
|
241
|
+
sorted_results = results_aggregator.results.sort_by do |result|
|
|
242
|
+
result.result[:start] # <6>
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
combined_hash_string = sorted_results.map do |result|
|
|
246
|
+
result.result[:hash]
|
|
247
|
+
end.join("\n") # <7>
|
|
248
|
+
|
|
249
|
+
Digest::SHA256.hexdigest(combined_hash_string) # <8>
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
----
|
|
253
|
+
<1> Create worker pool with configurable size
|
|
254
|
+
<2> Break file into chunks with position tracking
|
|
255
|
+
<3> Execute parallel processing
|
|
256
|
+
<4> Aggregate results into final hash
|
|
257
|
+
<5> Read file in chunks (default 1KB)
|
|
258
|
+
<6> Sort by original position to maintain order
|
|
259
|
+
<7> Combine chunk hashes with newline separator
|
|
260
|
+
<8> Create final hash from combined hashes
|
|
261
|
+
|
|
262
|
+
Orchestration phases:
|
|
263
|
+
|
|
264
|
+
1. **Chunking**: Break file into fixed-size pieces
|
|
265
|
+
2. **Distribution**: Send chunks to worker pool
|
|
266
|
+
3. **Parallel Execution**: Workers process chunks concurrently
|
|
267
|
+
4. **Collection**: Gather all results
|
|
268
|
+
5. **Aggregation**: Sort, combine, and finalize
|
|
45
269
|
|
|
46
270
|
== Usage
|
|
47
271
|
|
|
272
|
+
.Basic usage
|
|
273
|
+
[example]
|
|
274
|
+
====
|
|
275
|
+
[source,bash]
|
|
276
|
+
----
|
|
277
|
+
# Use default 4 workers
|
|
278
|
+
ruby hierarchical_hasher.rb sample.txt
|
|
279
|
+
|
|
280
|
+
# Use 8 workers for better parallelization
|
|
281
|
+
ruby hierarchical_hasher.rb large_file.dat 8
|
|
282
|
+
|
|
283
|
+
# Process a large log file
|
|
284
|
+
ruby hierarchical_hasher.rb /var/log/system.log 16
|
|
285
|
+
----
|
|
286
|
+
====
|
|
287
|
+
|
|
288
|
+
.Programmatic usage
|
|
289
|
+
[example]
|
|
290
|
+
====
|
|
291
|
+
[source,ruby]
|
|
292
|
+
----
|
|
293
|
+
require_relative "hierarchical_hasher"
|
|
294
|
+
|
|
295
|
+
# Create hasher with custom chunk size
|
|
296
|
+
hasher = HierarchicalHasher::FileHasher.new(
|
|
297
|
+
"large_file.dat",
|
|
298
|
+
chunk_size: 4096, # 4KB chunks
|
|
299
|
+
worker_count: 8 # 8 parallel workers
|
|
300
|
+
)
|
|
301
|
+
|
|
302
|
+
# Process the file
|
|
303
|
+
final_hash = hasher.hash_file
|
|
304
|
+
|
|
305
|
+
puts "Final hash: #{final_hash}"
|
|
306
|
+
----
|
|
307
|
+
====
|
|
308
|
+
|
|
309
|
+
== Expected Output
|
|
310
|
+
|
|
311
|
+
[source,text]
|
|
312
|
+
----
|
|
313
|
+
Starting hierarchical hasher with 4 workers...
|
|
314
|
+
Processing file: sample.txt
|
|
315
|
+
Final SHA-256 hash: c4e8a9b2f1d3e5a7c9b1f3d5e7a9c1b3d5e7a9b1c3d5e7a9b1c3d5e7a9b1c3d5
|
|
316
|
+
Processing completed in 0.234567 seconds
|
|
317
|
+
----
|
|
318
|
+
|
|
319
|
+
== Learning Points
|
|
320
|
+
|
|
321
|
+
=== 1. Chunking Strategy
|
|
322
|
+
|
|
323
|
+
The example uses fixed-size chunking:
|
|
324
|
+
|
|
325
|
+
[source,ruby]
|
|
326
|
+
----
|
|
327
|
+
chunk_size = 1024 # 1KB chunks
|
|
328
|
+
|
|
329
|
+
File.open(file_path, "rb") do |file|
|
|
330
|
+
while (chunk = file.read(chunk_size))
|
|
331
|
+
# Process chunk
|
|
332
|
+
end
|
|
333
|
+
end
|
|
334
|
+
----
|
|
335
|
+
|
|
336
|
+
**Considerations**:
|
|
337
|
+
|
|
338
|
+
* **Small chunks** (< 1KB): Higher overhead, more parallelism
|
|
339
|
+
* **Medium chunks** (1-100KB): Balanced overhead and parallelism
|
|
340
|
+
* **Large chunks** (> 100KB): Lower overhead, less parallelism
|
|
341
|
+
|
|
342
|
+
**Rule of thumb**: Choose a chunk size large enough that processing one chunk takes 10-100x longer than the per-chunk dispatch overhead, so that the overhead is amortized.
|
|
343
|
+
|
|
344
|
+
=== 2. Position-Aware Processing
|
|
345
|
+
|
|
346
|
+
Position tracking enables correct ordering:
|
|
347
|
+
|
|
348
|
+
[source,ruby]
|
|
349
|
+
----
|
|
350
|
+
# Without position tracking (WRONG for ordered results)
|
|
351
|
+
results.map { |r| r.hash }.join("\n")
|
|
352
|
+
|
|
353
|
+
# With position tracking (CORRECT)
|
|
354
|
+
results.sort_by { |r| r.start }.map { |r| r.hash }.join("\n")
|
|
355
|
+
----
|
|
356
|
+
|
|
357
|
+
**Why it matters**:
|
|
358
|
+
|
|
359
|
+
* Workers complete in **non-deterministic order**
|
|
360
|
+
* Results must be **reassembled in file order**
|
|
361
|
+
* Position metadata is **minimal overhead** (two integers per chunk: `start` and `length`)
|
|
362
|
+
|
|
363
|
+
=== 3. Map-Reduce Pattern
|
|
364
|
+
|
|
365
|
+
The example implements a classic map-reduce:
|
|
366
|
+
|
|
367
|
+
**Map phase** (parallel):
|
|
368
|
+
[source,ruby]
|
|
369
|
+
----
|
|
370
|
+
chunks.map do |chunk|
|
|
371
|
+
Digest::SHA256.hexdigest(chunk.data)
|
|
372
|
+
end
|
|
373
|
+
----
|
|
374
|
+
|
|
375
|
+
**Reduce phase** (sequential):
|
|
48
376
|
[source,ruby]
|
|
49
377
|
----
|
|
50
|
-
|
|
51
|
-
|
|
378
|
+
hashes.sort_by { |r| r.start }
|
|
379
|
+
.map { |r| r.hash }
|
|
380
|
+
.join("\n")
|
|
381
|
+
.then { |combined| Digest::SHA256.hexdigest(combined) }
|
|
382
|
+
----
|
|
383
|
+
|
|
384
|
+
**Key insight**: The map phase is parallelizable; the reduce phase requires the results to be ordered first.
|
|
385
|
+
|
|
386
|
+
=== 4. Performance Characteristics
|
|
52
387
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
ruby hierarchical_hasher.rb large_file.dat 8 # Use 8 workers
|
|
388
|
+
**Speedup formula**:
|
|
389
|
+
[source]
|
|
56
390
|
----
|
|
391
|
+
Speedup = T_sequential / T_parallel
|
|
392
|
+
≈ N_workers (ideal)
|
|
393
|
+
< N_workers (actual, due to overhead)
|
|
394
|
+
|
|
395
|
+
Actual speedup = N_workers × η
|
|
396
|
+
where η = efficiency factor (0.6-0.9 typical)
|
|
397
|
+
----
|
|
398
|
+
|
|
399
|
+
**Bottlenecks**:
|
|
57
400
|
|
|
58
|
-
|
|
401
|
+
* **I/O bound**: Limited by disk read speed
|
|
402
|
+
* **CPU bound**: Limited by hashing computation
|
|
403
|
+
* **Overhead**: Ractor creation, communication, synchronization
|
|
59
404
|
|
|
60
|
-
|
|
61
|
-
2. Each chunk is assigned to a worker for processing
|
|
62
|
-
3. Workers calculate SHA-256 hashes for their assigned chunks
|
|
63
|
-
4. Results are collected and sorted by their original position in the file
|
|
64
|
-
5. The individual chunk hashes are concatenated with newlines
|
|
65
|
-
6. A final SHA-256 hash is calculated on the combined hash string
|
|
405
|
+
**Optimization strategies**:
|
|
66
406
|
|
|
67
|
-
|
|
407
|
+
* Increase chunk size to reduce overhead
|
|
408
|
+
* Match worker count to available cores
|
|
409
|
+
* Use buffered I/O for faster reading
|
|
410
|
+
* Consider memory constraints for large files
|
|
68
411
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
412
|
+
=== 5. Ractor Compatibility
|
|
413
|
+
|
|
414
|
+
The example uses SHA-256 instead of SHA-3:
|
|
415
|
+
|
|
416
|
+
[source,ruby]
|
|
417
|
+
----
|
|
418
|
+
# Ractor-safe
|
|
419
|
+
Digest::SHA256.hexdigest(data)
|
|
420
|
+
|
|
421
|
+
# Not Ractor-safe in some Ruby versions
|
|
422
|
+
# Digest::SHA3.hexdigest(data)
|
|
423
|
+
----
|
|
72
424
|
|
|
73
|
-
|
|
425
|
+
**Ractor requirements**:
|
|
74
426
|
|
|
75
|
-
|
|
427
|
+
* All data passed between Ractors must be **immutable** (shareable), **copied**, or **moved**
|
|
428
|
+
* Libraries must be **Ractor-safe**, which is a stricter requirement than being thread-safe
|
|
429
|
+
* No shared mutable state
|
|
430
|
+
|
|
431
|
+
=== 6. Error Handling
|
|
432
|
+
|
|
433
|
+
The worker includes error handling:
|
|
434
|
+
|
|
435
|
+
[source,ruby]
|
|
436
|
+
----
|
|
437
|
+
begin
|
|
438
|
+
hash = Digest::SHA256.hexdigest(work.data)
|
|
439
|
+
Fractor::WorkResult.new(result: { hash: hash }, work: work)
|
|
440
|
+
rescue StandardError => e
|
|
441
|
+
Fractor::WorkResult.new(error: e.message, work: work)
|
|
442
|
+
end
|
|
443
|
+
----
|
|
444
|
+
|
|
445
|
+
**Best practices**:
|
|
446
|
+
|
|
447
|
+
* Wrap processing in `begin/rescue`
|
|
448
|
+
* Include context in error messages
|
|
449
|
+
* Return a `WorkResult` carrying the error instead of raising
|
|
450
|
+
* Allow supervisor to handle failures
|
|
451
|
+
|
|
452
|
+
== Use Cases and Patterns
|
|
453
|
+
|
|
454
|
+
=== Large File Processing
|
|
455
|
+
|
|
456
|
+
Process files too large for memory:
|
|
457
|
+
|
|
458
|
+
[source,ruby]
|
|
459
|
+
----
|
|
460
|
+
# Process a 10GB log file
|
|
461
|
+
hasher = FileHasher.new(
|
|
462
|
+
"huge.log",
|
|
463
|
+
chunk_size: 1_048_576, # 1MB chunks
|
|
464
|
+
worker_count: 16
|
|
465
|
+
)
|
|
466
|
+
hasher.hash_file
|
|
467
|
+
----
|
|
468
|
+
|
|
469
|
+
**Benefits**:
|
|
470
|
+
|
|
471
|
+
* **Streaming processing**: No need to load entire file
|
|
472
|
+
* **Parallel speedup**: 10-15x faster on 16 cores
|
|
473
|
+
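* **Memory efficient**: Only the in-flight chunks need to be in memory, provided chunks are submitted incrementally (the `load_file_chunks` listing shown earlier queues every chunk before `supervisor.run`, so as written it holds the whole file)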
|
|
474
|
+
|
|
475
|
+
=== Content-Addressable Storage
|
|
476
|
+
|
|
477
|
+
Create unique identifiers for files:
|
|
478
|
+
|
|
479
|
+
[source,ruby]
|
|
480
|
+
----
|
|
481
|
+
# Store file by its hash
|
|
482
|
+
file_hash = hasher.hash_file
|
|
483
|
+
storage_path = "store/#{file_hash[0..2]}/#{file_hash}"
|
|
484
|
+
FileUtils.cp(file_path, storage_path)
|
|
485
|
+
----
|
|
486
|
+
|
|
487
|
+
**Use cases**:
|
|
488
|
+
|
|
489
|
+
* Deduplication systems
|
|
490
|
+
* Content-addressable storage
|
|
491
|
+
* Distributed file systems
|
|
492
|
+
|
|
493
|
+
=== Data Integrity Verification
|
|
494
|
+
|
|
495
|
+
Verify file integrity after transfer:
|
|
496
|
+
|
|
497
|
+
[source,ruby]
|
|
498
|
+
----
|
|
499
|
+
# Before transfer
|
|
500
|
+
original_hash = FileHasher.new(source_file).hash_file
|
|
501
|
+
|
|
502
|
+
# After transfer
|
|
503
|
+
transferred_hash = FileHasher.new(dest_file).hash_file
|
|
504
|
+
|
|
505
|
+
if original_hash == transferred_hash
|
|
506
|
+
puts "Transfer verified"
|
|
507
|
+
else
|
|
508
|
+
puts "Corruption detected"
|
|
509
|
+
end
|
|
510
|
+
----
|
|
511
|
+
|
|
512
|
+
=== Parallel Checksum Validation
|
|
513
|
+
|
|
514
|
+
Validate multiple files concurrently:
|
|
515
|
+
|
|
516
|
+
[source,ruby]
|
|
517
|
+
----
|
|
518
|
+
files.each do |file|
|
|
519
|
+
supervisor.add_work_item(
|
|
520
|
+
FileHashWork.new(file, expected_hash: checksums[file])
|
|
521
|
+
)
|
|
522
|
+
end
|
|
523
|
+
----
|
|
524
|
+
|
|
525
|
+
=== Pattern: Hierarchical Reduction
|
|
526
|
+
|
|
527
|
+
Extend to multi-level hierarchies:
|
|
528
|
+
|
|
529
|
+
[source]
|
|
530
|
+
----
|
|
531
|
+
Level 0: Individual chunks → chunk hashes
|
|
532
|
+
[c0, c1, c2, c3, c4, c5, c6, c7]
|
|
533
|
+
↓
|
|
534
|
+
Level 1: Group into blocks → block hashes
|
|
535
|
+
[b0={c0,c1}, b1={c2,c3}, b2={c4,c5}, b3={c6,c7}]
|
|
536
|
+
↓
|
|
537
|
+
Level 2: Group blocks → section hashes
|
|
538
|
+
[s0={b0,b1}, s1={b2,b3}]
|
|
539
|
+
↓
|
|
540
|
+
Level 3: Final hash
|
|
541
|
+
final={s0,s1}
|
|
542
|
+
----
|
|
543
|
+
|
|
544
|
+
**Benefits**:
|
|
545
|
+
|
|
546
|
+
* Allows **incremental verification**
|
|
547
|
+
* Supports **partial updates**
|
|
548
|
+
* Enables **Merkle tree construction**
|
|
549
|
+
|
|
550
|
+
== Performance Tuning
|
|
551
|
+
|
|
552
|
+
=== Chunk Size Selection
|
|
553
|
+
|
|
554
|
+
[source,ruby]
|
|
555
|
+
----
|
|
556
|
+
# For CPU-bound hashing
|
|
557
|
+
chunk_size = 4096 # 4KB - many small chunks
|
|
558
|
+
|
|
559
|
+
# For I/O-bound processing
|
|
560
|
+
chunk_size = 1_048_576 # 1MB - fewer large chunks
|
|
561
|
+
|
|
562
|
+
# Adaptive sizing
|
|
563
|
+
chunk_size = [
|
|
564
|
+
file_size / (worker_count * 100), # Target ~100 chunks per worker
|
|
565
|
+
4096 # Minimum chunk size
|
|
566
|
+
].max
|
|
567
|
+
----
|
|
568
|
+
|
|
569
|
+
=== Worker Count Optimization
|
|
570
|
+
|
|
571
|
+
[source,ruby]
|
|
572
|
+
----
|
|
573
|
+
# CPU-bound: Match core count
|
|
574
|
+
worker_count = Etc.nprocessors
|
|
575
|
+
|
|
576
|
+
# I/O-bound: Can exceed core count
|
|
577
|
+
worker_count = Etc.nprocessors * 2
|
|
578
|
+
|
|
579
|
+
# Mixed workload: Use 1.5x cores
|
|
580
|
+
worker_count = (Etc.nprocessors * 1.5).to_i
|
|
581
|
+
----
|
|
582
|
+
|
|
583
|
+
=== Memory Considerations
|
|
584
|
+
|
|
585
|
+
[source,ruby]
|
|
586
|
+
----
|
|
587
|
+
# Memory usage ≈ chunk_size × worker_count × 2
|
|
588
|
+
# (2x for input chunk + output result)
|
|
589
|
+
|
|
590
|
+
max_memory = 512 * 1024 * 1024 # 512MB
|
|
591
|
+
chunk_size = max_memory / (worker_count * 2)
|
|
592
|
+
----
|
|
593
|
+
|
|
594
|
+
== Next Steps
|
|
595
|
+
|
|
596
|
+
After understanding hierarchical hashing, explore:
|
|
597
|
+
|
|
598
|
+
* **link:../pipeline_processing/README.adoc[Pipeline Processing]**: Multi-stage transformations
|
|
599
|
+
* **link:../scatter_gather/README.adoc[Scatter-Gather]**: Dynamic work distribution
|
|
600
|
+
* **link:../producer_subscriber/README.adoc[Producer-Subscriber]**: Streaming data patterns
|
|
601
|
+
* **link:../workflow/README.adoc[Workflow System]**: Complex multi-step pipelines
|
|
602
|
+
|
|
603
|
+
== Advanced Topics
|
|
604
|
+
|
|
605
|
+
=== Resumable Processing
|
|
606
|
+
|
|
607
|
+
Add checkpointing for large files:
|
|
608
|
+
|
|
609
|
+
[source,ruby]
|
|
610
|
+
----
|
|
611
|
+
def hash_file_resumable(checkpoint_file = nil)
|
|
612
|
+
completed = load_checkpoint(checkpoint_file) || []
|
|
613
|
+
|
|
614
|
+
chunks.each_with_index do |chunk, i|
|
|
615
|
+
next if completed.include?(i)
|
|
616
|
+
|
|
617
|
+
process_chunk(chunk)
|
|
618
|
+
save_checkpoint(checkpoint_file, completed << i)
|
|
619
|
+
end
|
|
620
|
+
end
|
|
621
|
+
----
|
|
622
|
+
|
|
623
|
+
=== Progress Tracking
|
|
624
|
+
|
|
625
|
+
Monitor processing progress:
|
|
626
|
+
|
|
627
|
+
[source,ruby]
|
|
628
|
+
----
|
|
629
|
+
def hash_file_with_progress
|
|
630
|
+
total_chunks = (file_size / chunk_size.to_f).ceil
|
|
631
|
+
|
|
632
|
+
supervisor.on_result do |result|
|
|
633
|
+
completed = supervisor.results.size
|
|
634
|
+
progress = (completed / total_chunks.to_f * 100).round(2)
|
|
635
|
+
puts "Progress: #{progress}% (#{completed}/#{total_chunks})"
|
|
636
|
+
end
|
|
637
|
+
|
|
638
|
+
supervisor.run
|
|
639
|
+
end
|
|
640
|
+
----
|
|
641
|
+
|
|
642
|
+
=== Merkle Tree Construction
|
|
643
|
+
|
|
644
|
+
Build a Merkle tree for verification:
|
|
645
|
+
|
|
646
|
+
[source,ruby]
|
|
647
|
+
----
|
|
648
|
+
def build_merkle_tree
|
|
649
|
+
# Level 0: Leaf hashes (chunks)
|
|
650
|
+
leaves = hash_all_chunks
|
|
651
|
+
|
|
652
|
+
# Build tree bottom-up
|
|
653
|
+
tree = [leaves]
|
|
654
|
+
while tree.last.size > 1
|
|
655
|
+
parent_level = tree.last.each_slice(2).map do |pair|
|
|
656
|
+
Digest::SHA256.hexdigest(pair.join)
|
|
657
|
+
end
|
|
658
|
+
tree << parent_level
|
|
659
|
+
end
|
|
660
|
+
|
|
661
|
+
tree.last.first # Root hash
|
|
662
|
+
end
|
|
663
|
+
----
|