fractor 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +227 -102
  3. data/README.adoc +113 -1940
  4. data/docs/.lycheeignore +16 -0
  5. data/docs/Gemfile +24 -0
  6. data/docs/README.md +157 -0
  7. data/docs/_config.yml +151 -0
  8. data/docs/_features/error-handling.adoc +1192 -0
  9. data/docs/_features/index.adoc +80 -0
  10. data/docs/_features/monitoring.adoc +589 -0
  11. data/docs/_features/signal-handling.adoc +202 -0
  12. data/docs/_features/workflows.adoc +1235 -0
  13. data/docs/_guides/continuous-mode.adoc +736 -0
  14. data/docs/_guides/cookbook.adoc +1133 -0
  15. data/docs/_guides/index.adoc +55 -0
  16. data/docs/_guides/pipeline-mode.adoc +730 -0
  17. data/docs/_guides/troubleshooting.adoc +358 -0
  18. data/docs/_pages/architecture.adoc +1390 -0
  19. data/docs/_pages/core-concepts.adoc +1392 -0
  20. data/docs/_pages/design-principles.adoc +862 -0
  21. data/docs/_pages/getting-started.adoc +290 -0
  22. data/docs/_pages/installation.adoc +143 -0
  23. data/docs/_reference/api.adoc +1080 -0
  24. data/docs/_reference/error-reporting.adoc +670 -0
  25. data/docs/_reference/examples.adoc +181 -0
  26. data/docs/_reference/index.adoc +96 -0
  27. data/docs/_reference/troubleshooting.adoc +862 -0
  28. data/docs/_tutorials/complex-workflows.adoc +1022 -0
  29. data/docs/_tutorials/data-processing-pipeline.adoc +740 -0
  30. data/docs/_tutorials/first-application.adoc +384 -0
  31. data/docs/_tutorials/index.adoc +48 -0
  32. data/docs/_tutorials/long-running-services.adoc +931 -0
  33. data/docs/assets/images/favicon-16.png +0 -0
  34. data/docs/assets/images/favicon-32.png +0 -0
  35. data/docs/assets/images/favicon-48.png +0 -0
  36. data/docs/assets/images/favicon.ico +0 -0
  37. data/docs/assets/images/favicon.png +0 -0
  38. data/docs/assets/images/favicon.svg +45 -0
  39. data/docs/assets/images/fractor-icon.svg +49 -0
  40. data/docs/assets/images/fractor-logo.svg +61 -0
  41. data/docs/index.adoc +131 -0
  42. data/docs/lychee.toml +39 -0
  43. data/examples/api_aggregator/README.adoc +627 -0
  44. data/examples/api_aggregator/api_aggregator.rb +376 -0
  45. data/examples/auto_detection/README.adoc +407 -29
  46. data/examples/continuous_chat_common/message_protocol.rb +1 -1
  47. data/examples/error_reporting.rb +207 -0
  48. data/examples/file_processor/README.adoc +170 -0
  49. data/examples/file_processor/file_processor.rb +615 -0
  50. data/examples/file_processor/sample_files/invalid.csv +1 -0
  51. data/examples/file_processor/sample_files/orders.xml +24 -0
  52. data/examples/file_processor/sample_files/products.json +23 -0
  53. data/examples/file_processor/sample_files/users.csv +6 -0
  54. data/examples/hierarchical_hasher/README.adoc +629 -41
  55. data/examples/image_processor/README.adoc +610 -0
  56. data/examples/image_processor/image_processor.rb +349 -0
  57. data/examples/image_processor/processed_images/sample_10_processed.jpg.json +12 -0
  58. data/examples/image_processor/processed_images/sample_1_processed.jpg.json +12 -0
  59. data/examples/image_processor/processed_images/sample_2_processed.jpg.json +12 -0
  60. data/examples/image_processor/processed_images/sample_3_processed.jpg.json +12 -0
  61. data/examples/image_processor/processed_images/sample_4_processed.jpg.json +12 -0
  62. data/examples/image_processor/processed_images/sample_5_processed.jpg.json +12 -0
  63. data/examples/image_processor/processed_images/sample_6_processed.jpg.json +12 -0
  64. data/examples/image_processor/processed_images/sample_7_processed.jpg.json +12 -0
  65. data/examples/image_processor/processed_images/sample_8_processed.jpg.json +12 -0
  66. data/examples/image_processor/processed_images/sample_9_processed.jpg.json +12 -0
  67. data/examples/image_processor/test_images/sample_1.png +1 -0
  68. data/examples/image_processor/test_images/sample_10.png +1 -0
  69. data/examples/image_processor/test_images/sample_2.png +1 -0
  70. data/examples/image_processor/test_images/sample_3.png +1 -0
  71. data/examples/image_processor/test_images/sample_4.png +1 -0
  72. data/examples/image_processor/test_images/sample_5.png +1 -0
  73. data/examples/image_processor/test_images/sample_6.png +1 -0
  74. data/examples/image_processor/test_images/sample_7.png +1 -0
  75. data/examples/image_processor/test_images/sample_8.png +1 -0
  76. data/examples/image_processor/test_images/sample_9.png +1 -0
  77. data/examples/log_analyzer/README.adoc +662 -0
  78. data/examples/log_analyzer/log_analyzer.rb +579 -0
  79. data/examples/log_analyzer/sample_logs/apache.log +20 -0
  80. data/examples/log_analyzer/sample_logs/json.log +15 -0
  81. data/examples/log_analyzer/sample_logs/nginx.log +15 -0
  82. data/examples/log_analyzer/sample_logs/rails.log +29 -0
  83. data/examples/multi_work_type/README.adoc +576 -26
  84. data/examples/performance_monitoring.rb +120 -0
  85. data/examples/pipeline_processing/README.adoc +740 -26
  86. data/examples/pipeline_processing/pipeline_processing.rb +2 -2
  87. data/examples/priority_work_example.rb +155 -0
  88. data/examples/producer_subscriber/README.adoc +889 -46
  89. data/examples/scatter_gather/README.adoc +829 -27
  90. data/examples/simple/README.adoc +347 -0
  91. data/examples/specialized_workers/README.adoc +622 -26
  92. data/examples/specialized_workers/specialized_workers.rb +44 -8
  93. data/examples/stream_processor/README.adoc +206 -0
  94. data/examples/stream_processor/stream_processor.rb +284 -0
  95. data/examples/web_scraper/README.adoc +625 -0
  96. data/examples/web_scraper/web_scraper.rb +285 -0
  97. data/examples/workflow/README.adoc +406 -0
  98. data/examples/workflow/circuit_breaker/README.adoc +360 -0
  99. data/examples/workflow/circuit_breaker/circuit_breaker_workflow.rb +225 -0
  100. data/examples/workflow/conditional/README.adoc +483 -0
  101. data/examples/workflow/conditional/conditional_workflow.rb +215 -0
  102. data/examples/workflow/dead_letter_queue/README.adoc +374 -0
  103. data/examples/workflow/dead_letter_queue/dead_letter_queue_workflow.rb +217 -0
  104. data/examples/workflow/fan_out/README.adoc +381 -0
  105. data/examples/workflow/fan_out/fan_out_workflow.rb +202 -0
  106. data/examples/workflow/retry/README.adoc +248 -0
  107. data/examples/workflow/retry/retry_workflow.rb +195 -0
  108. data/examples/workflow/simple_linear/README.adoc +267 -0
  109. data/examples/workflow/simple_linear/simple_linear_workflow.rb +175 -0
  110. data/examples/workflow/simplified/README.adoc +329 -0
  111. data/examples/workflow/simplified/simplified_workflow.rb +222 -0
  112. data/exe/fractor +10 -0
  113. data/lib/fractor/cli.rb +288 -0
  114. data/lib/fractor/configuration.rb +307 -0
  115. data/lib/fractor/continuous_server.rb +60 -65
  116. data/lib/fractor/error_formatter.rb +72 -0
  117. data/lib/fractor/error_report_generator.rb +152 -0
  118. data/lib/fractor/error_reporter.rb +244 -0
  119. data/lib/fractor/error_statistics.rb +147 -0
  120. data/lib/fractor/execution_tracer.rb +162 -0
  121. data/lib/fractor/logger.rb +230 -0
  122. data/lib/fractor/main_loop_handler.rb +406 -0
  123. data/lib/fractor/main_loop_handler3.rb +135 -0
  124. data/lib/fractor/main_loop_handler4.rb +299 -0
  125. data/lib/fractor/performance_metrics_collector.rb +181 -0
  126. data/lib/fractor/performance_monitor.rb +215 -0
  127. data/lib/fractor/performance_report_generator.rb +202 -0
  128. data/lib/fractor/priority_work.rb +93 -0
  129. data/lib/fractor/priority_work_queue.rb +189 -0
  130. data/lib/fractor/result_aggregator.rb +32 -0
  131. data/lib/fractor/shutdown_handler.rb +168 -0
  132. data/lib/fractor/signal_handler.rb +80 -0
  133. data/lib/fractor/supervisor.rb +382 -269
  134. data/lib/fractor/supervisor_logger.rb +88 -0
  135. data/lib/fractor/version.rb +1 -1
  136. data/lib/fractor/work.rb +12 -0
  137. data/lib/fractor/work_distribution_manager.rb +151 -0
  138. data/lib/fractor/work_queue.rb +20 -0
  139. data/lib/fractor/work_result.rb +181 -9
  140. data/lib/fractor/worker.rb +73 -0
  141. data/lib/fractor/workflow/builder.rb +210 -0
  142. data/lib/fractor/workflow/chain_builder.rb +169 -0
  143. data/lib/fractor/workflow/circuit_breaker.rb +183 -0
  144. data/lib/fractor/workflow/circuit_breaker_orchestrator.rb +208 -0
  145. data/lib/fractor/workflow/circuit_breaker_registry.rb +112 -0
  146. data/lib/fractor/workflow/dead_letter_queue.rb +334 -0
  147. data/lib/fractor/workflow/execution_hooks.rb +39 -0
  148. data/lib/fractor/workflow/execution_strategy.rb +225 -0
  149. data/lib/fractor/workflow/execution_trace.rb +134 -0
  150. data/lib/fractor/workflow/helpers.rb +191 -0
  151. data/lib/fractor/workflow/job.rb +290 -0
  152. data/lib/fractor/workflow/job_dependency_validator.rb +120 -0
  153. data/lib/fractor/workflow/logger.rb +110 -0
  154. data/lib/fractor/workflow/pre_execution_context.rb +193 -0
  155. data/lib/fractor/workflow/retry_config.rb +156 -0
  156. data/lib/fractor/workflow/retry_orchestrator.rb +184 -0
  157. data/lib/fractor/workflow/retry_strategy.rb +93 -0
  158. data/lib/fractor/workflow/structured_logger.rb +30 -0
  159. data/lib/fractor/workflow/type_compatibility_validator.rb +222 -0
  160. data/lib/fractor/workflow/visualizer.rb +211 -0
  161. data/lib/fractor/workflow/workflow_context.rb +132 -0
  162. data/lib/fractor/workflow/workflow_executor.rb +669 -0
  163. data/lib/fractor/workflow/workflow_result.rb +55 -0
  164. data/lib/fractor/workflow/workflow_validator.rb +295 -0
  165. data/lib/fractor/workflow.rb +333 -0
  166. data/lib/fractor/wrapped_ractor.rb +66 -101
  167. data/lib/fractor/wrapped_ractor3.rb +161 -0
  168. data/lib/fractor/wrapped_ractor4.rb +242 -0
  169. data/lib/fractor.rb +92 -4
  170. metadata +179 -6
  171. data/tests/sample.rb.bak +0 -309
  172. data/tests/sample_working.rb.bak +0 -209
@@ -5,71 +5,659 @@
 
  toc::[]
 
- == Overview
+ == Purpose
 
- The Hierarchical Hasher example demonstrates how to use the Fractor framework to process a file in parallel by breaking it into chunks, hashing each chunk independently, and then combining the results into a final hash.
+ The Hierarchical Hasher example demonstrates parallel file processing using a map-reduce pattern with Fractor. It showcases how to break large files into chunks, process them concurrently, and aggregate results while preserving order. This is a fundamental pattern for processing large datasets efficiently using parallel workers.
 
- This example is particularly useful for:
+ == Focus
 
- * Processing large files efficiently
- * Demonstrating parallel data chunking patterns
- * Showcasing result aggregation techniques
+ This example demonstrates:
 
- == Implementation Details
+ * **Chunking patterns** for parallel data processing
+ * **Position-aware processing** to maintain data order
+ * **Result aggregation** with sorting and combining
+ * **Map-reduce architecture** in Fractor
+ * **Parallel I/O processing** for large files
+ * **Worker pool utilization** for CPU-bound tasks
 
- The example consists of the following key components:
+ == Architecture
 
- === ChunkWork
+ === Data Flow Overview
 
- A subclass of `Fractor::Work` that represents a chunk of a file to be hashed. Each `ChunkWork` instance contains:
+ [source]
+ ----
+ ┌─────────────────────────────────────────────────────────────┐
+ │ Input File │
+ │ "Lorem ipsum dolor sit amet consectetur adipiscing..." │
+ └─────────────────────────────────────────────────────────────┘
+
+ │ File.read(chunk_size)
+
+ ┌───────────────┐
+ │ File Chunking │
+ └───────────────┘
+
+ ┌───────────────┼───────────────┐
+ │ │ │
+ ▼ ▼ ▼
+ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
+ │ ChunkWork │ │ ChunkWork │ │ ChunkWork │
+ │ start=0 │ │ start=1024 │ │ start=2048 │
+ │ length=1024 │ │ length=1024 │ │ length=1024 │
+ │ data=[...] │ │ data=[...] │ │ data=[...] │
+ └──────────────┘ └──────────────┘ └──────────────┘
+ │ │ │
+ │ Parallel │ Processing │
+ │ │ │
+ ▼ ▼ ▼
+ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
+ │ HashWorker 1 │ │ HashWorker 2 │ │ HashWorker 3 │
+ │ SHA256(...) │ │ SHA256(...) │ │ SHA256(...) │
+ └──────────────┘ └──────────────┘ └──────────────┘
+ │ │ │
+ │ │ │
+ ▼ ▼ ▼
+ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
+ │ WorkResult │ │ WorkResult │ │ WorkResult │
+ │ start=0 │ │ start=1024 │ │ start=2048 │
+ │ hash=a3f2... │ │ hash=7b9c... │ │ hash=e5d1... │
+ └──────────────┘ └──────────────┘ └──────────────┘
+ │ │ │
+ └───────────────┼───────────────┘
+ │ Sort by start position
+
+ ┌───────────────┐
+ │ Aggregation │
+ │ a3f2...\n │
+ │ 7b9c...\n │
+ │ e5d1... │
+ └───────────────┘
+
+ │ SHA256(combined)
+
+ ┌───────────────┐
+ │ Final Hash │
+ │ c4e8a9b2f... │
+ └───────────────┘
+ ----
+
+ === Map-Reduce Pattern
+
+ [source]
+ ----
+ Map Phase (Parallel Processing)
+ ┌─────────────────────────────────────────────────────────┐
+ │ File Chunk 0 → HashWorker → hash_0 (a3f2...) │
+ │ File Chunk 1 → HashWorker → hash_1 (7b9c...) │
+ │ File Chunk 2 → HashWorker → hash_2 (e5d1...) │
+ │ File Chunk 3 → HashWorker → hash_3 (f1a8...) │
+ │ ... │
+ │ File Chunk N → HashWorker → hash_N (d9c4...) │
+ └─────────────────────────────────────────────────────────┘
+
+
+ Reduce Phase (Sequential Aggregation)
+ ┌─────────────────────────────────────────────────────────┐
+ │ 1. Sort results by chunk position │
+ │ 2. Concatenate: hash_0\nhash_1\nhash_2\n...hash_N │
+ │ 3. Final hash: SHA256(concatenated_hashes) │
+ └─────────────────────────────────────────────────────────┘
+ ----
+
+ === Performance Comparison
+
+ [source]
+ ----
+ Sequential Processing:
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+ │ Chunk 0 │ Chunk 1 │ Chunk 2 │ Chunk 3 │ ... │
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+ Time: 100 units
+
+ Parallel Processing (4 workers):
+ ━━━━━━━━━━━━━━━━━━━━━
+ │ Chunk 0 │
+ │ Chunk 1 │ (Worker 1)
+ │ Chunk 2 │ (Worker 2)
+ │ Chunk 3 │ (Worker 3)
+ ━━━━━━━━━━━━━━━━━━━━━
+ Time: ~25 units (4x speedup)
+
+ Actual speedup depends on:
+ - Number of workers vs cores
+ - Chunk size vs overhead ratio
+ - I/O vs CPU bottleneck
+ ----
+
+ == Key Components
+
+ === ChunkWork: Position-Aware Work Unit
+
+ The `ChunkWork` class represents a chunk of the file with position metadata:
+
+ [source,ruby]
+ ----
+ class ChunkWork < Fractor::Work
+   def initialize(data, start = 0, length = nil)
+     super({
+       data: data, # <1>
+       start: start, # <2>
+       length: length || data.bytesize, # <3>
+     })
+   end
+
+   def data
+     input[:data]
+   end
+
+   def start
+     input[:start]
+   end
+
+   def length
+     input[:length]
+   end
+ end
+ ----
+ <1> The actual chunk data to be hashed
+ <2> Starting byte position in the original file (for ordering)
+ <3> Length of this chunk in bytes
+
+ Why position tracking matters:
+
+ * Enables **correct result ordering** during aggregation
+ * Supports **resumable processing** for interrupted jobs
+ * Allows **parallel validation** against sequential processing
+ * Facilitates **chunk-level debugging** and error tracking
 
- * The chunk data
- * The starting position within the file
- * The length of the chunk
+ === HashWorker: Independent Chunk Processor
 
- === HashWorker
+ The `HashWorker` processes chunks without dependencies:
 
- A subclass of `Fractor::Worker` that processes `ChunkWork` instances by:
+ [source,ruby]
+ ----
+ class HashWorker < Fractor::Worker
+   def process(work)
+     sleep(rand(0.01..0.05)) # <1>
+
+     hash = Digest::SHA256.hexdigest(work.data) # <2>
+
+     Fractor::WorkResult.new(
+       result: {
+         start: work.start, # <3>
+         length: work.length,
+         hash: hash,
+       },
+       work: work,
+     )
+   end
+ end
+ ----
+ <1> Simulates variable processing time (real work would be I/O or computation)
+ <2> Calculates SHA-256 hash for the chunk (Ractor-safe)
+ <3> Preserves position metadata for correct aggregation
+
+ Key design aspects:
+
+ * **Stateless processing**: Each chunk is independent
+ * **Position preservation**: Results include original position
+ * **Error handling**: Wrapped in begin/rescue for robustness
+ * **Ractor compatibility**: Uses SHA-256 instead of SHA-3
+
+ === FileHasher: Orchestration and Aggregation
+
+ The `FileHasher` orchestrates the entire process:
+
+ [source,ruby]
+ ----
+ class FileHasher
+   def hash_file
+     supervisor = Fractor::Supervisor.new(
+       worker_pools: [
+         { worker_class: HashWorker, num_workers: @worker_count }, # <1>
+       ],
+     )
 
- 1. Calculating a SHA-256 hash for the chunk
- 2. Returning a work result containing the hash, start position, and length
+     load_file_chunks(supervisor) # <2>
+     supervisor.run # <3>
 
- === FileHasher
+     @final_hash = finalize_hash(supervisor.results) # <4>
+   end
 
- The main orchestration class that:
+   private
 
- 1. Breaks a file into chunks of a specified size
- 2. Creates a `Fractor::Supervisor` with the `HashWorker` and `ChunkWork` classes
- 3. Processes all chunks in parallel
- 4. Aggregates the results to create a final hash by combining all chunk hashes
+   def load_file_chunks(supervisor)
+     File.open(@file_path, "rb") do |file|
+       start_pos = 0
+       while (chunk = file.read(@chunk_size)) # <5>
+         work_items << ChunkWork.new(chunk, start_pos, chunk.length)
+         start_pos += chunk.length
+       end
+     end
+     supervisor.add_work_items(work_items)
+   end
+
+   def finalize_hash(results_aggregator)
+     sorted_results = results_aggregator.results.sort_by do |result|
+       result.result[:start] # <6>
+     end
+
+     combined_hash_string = sorted_results.map do |result|
+       result.result[:hash]
+     end.join("\n") # <7>
+
+     Digest::SHA256.hexdigest(combined_hash_string) # <8>
+   end
+ end
+ ----
+ <1> Create worker pool with configurable size
+ <2> Break file into chunks with position tracking
+ <3> Execute parallel processing
+ <4> Aggregate results into final hash
+ <5> Read file in chunks (default 1KB)
+ <6> Sort by original position to maintain order
+ <7> Combine chunk hashes with newline separator
+ <8> Create final hash from combined hashes
+
+ Orchestration phases:
+
+ 1. **Chunking**: Break file into fixed-size pieces
+ 2. **Distribution**: Send chunks to worker pool
+ 3. **Parallel Execution**: Workers process chunks concurrently
+ 4. **Collection**: Gather all results
+ 5. **Aggregation**: Sort, combine, and finalize
 
  == Usage
 
+ .Basic usage
+ [example]
+ ====
+ [source,bash]
+ ----
+ # Use default 4 workers
+ ruby hierarchical_hasher.rb sample.txt
+
+ # Use 8 workers for better parallelization
+ ruby hierarchical_hasher.rb large_file.dat 8
+
+ # Process a large log file
+ ruby hierarchical_hasher.rb /var/log/system.log 16
+ ----
+ ====
+
+ .Programmatic usage
+ [example]
+ ====
+ [source,ruby]
+ ----
+ require_relative "hierarchical_hasher"
+
+ # Create hasher with custom chunk size
+ hasher = HierarchicalHasher::FileHasher.new(
+   "large_file.dat",
+   chunk_size: 4096, # 4KB chunks
+   worker_count: 8   # 8 parallel workers
+ )
+
+ # Process the file
+ final_hash = hasher.hash_file
+
+ puts "Final hash: #{final_hash}"
+ ----
+ ====
+
+ == Expected Output
+
+ [source,text]
+ ----
+ Starting hierarchical hasher with 4 workers...
+ Processing file: sample.txt
+ Final SHA-256 hash: c4e8a9b2f1d3e5a7c9b1f3d5e7a9c1b3d5e7a9b1c3d5e7a9b1c3d5e7a9b1c3d5
+ Processing completed in 0.234567 seconds
+ ----
+
+ == Learning Points
+
+ === 1. Chunking Strategy
+
+ The example uses fixed-size chunking:
+
+ [source,ruby]
+ ----
+ chunk_size = 1024 # 1KB chunks
+
+ File.open(file_path, "rb") do |file|
+   while (chunk = file.read(chunk_size))
+     # Process chunk
+   end
+ end
+ ----
+
+ **Considerations**:
+
+ * **Small chunks** (< 1KB): Higher overhead, more parallelism
+ * **Medium chunks** (1-10KB): Balanced overhead and parallelism
+ * **Large chunks** (> 100KB): Lower overhead, less parallelism
+
+ **Rule of thumb**: Chunk size should be 10-100x the processing time to amortize overhead.
+
+ === 2. Position-Aware Processing
+
+ Position tracking enables correct ordering:
+
+ [source,ruby]
+ ----
+ # Without position tracking (WRONG for ordered results)
+ results.map { |r| r.hash }.join("\n")
+
+ # With position tracking (CORRECT)
+ results.sort_by { |r| r.start }.map { |r| r.hash }.join("\n")
+ ----
+
+ **Why it matters**:
+
+ * Workers complete in **non-deterministic order**
+ * Results must be **reassembled in file order**
+ * Position metadata is **minimal overhead** (8 bytes per chunk)
+
+ === 3. Map-Reduce Pattern
+
+ The example implements a classic map-reduce:
+
+ **Map phase** (parallel):
+ [source,ruby]
+ ----
+ chunks.map do |chunk|
+   Digest::SHA256.hexdigest(chunk.data)
+ end
+ ----
+
+ **Reduce phase** (sequential):
  [source,ruby]
  ----
- # Basic usage
- ruby hierarchical_hasher.rb <file_path> [worker_count]
+ hashes.sort_by { |r| r.start }
+       .map { |r| r.hash }
+       .join("\n")
+       .then { |combined| Digest::SHA256.hexdigest(combined) }
+ ----
+
+ **Key insight**: Map is parallelizable, reduce requires ordering.
+
+ === 4. Performance Characteristics
 
- # Examples
- ruby hierarchical_hasher.rb sample.txt # Use default 4 workers
- ruby hierarchical_hasher.rb large_file.dat 8 # Use 8 workers
+ **Speedup formula**:
+ [source]
  ----
+ Speedup = T_sequential / T_parallel
+         ≈ N_workers (ideal)
+         < N_workers (actual, due to overhead)
+
+ Actual speedup = N_workers × η
+ where η = efficiency factor (0.6-0.9 typical)
+ ----
+
+ **Bottlenecks**:
 
- == How It Works
+ * **I/O bound**: Limited by disk read speed
+ * **CPU bound**: Limited by hashing computation
+ * **Overhead**: Ractor creation, communication, synchronization
 
- 1. The file is divided into 1KB chunks (configurable)
- 2. Each chunk is assigned to a worker for processing
- 3. Workers calculate SHA-256 hashes for their assigned chunks
- 4. Results are collected and sorted by their original position in the file
- 5. The individual chunk hashes are concatenated with newlines
- 6. A final SHA-256 hash is calculated on the combined hash string
+ **Optimization strategies**:
 
- == Performance Considerations
+ * Increase chunk size to reduce overhead
+ * Match worker count to available cores
+ * Use buffered I/O for faster reading
+ * Consider memory constraints for large files
 
- * The chunk size can be adjusted to optimize performance for different file types
- * The number of workers can be increased for better parallelization on multi-core systems
- * Very small files may not benefit from parallelization due to the overhead
+ === 5. Ractor Compatibility
+
+ The example uses SHA-256 instead of SHA-3:
+
+ [source,ruby]
+ ----
+ # Ractor-safe
+ Digest::SHA256.hexdigest(data)
+
+ # Not Ractor-safe in some Ruby versions
+ # Digest::SHA3.hexdigest(data)
+ ----
 
- == Ractor Compatibility Note
+ **Ractor requirements**:
 
- This example uses SHA-256 instead of SHA3 because the SHA3 implementation in some Ruby versions is not Ractor-compatible.
+ * All data must be **immutable** or **copied**
+ * Libraries must be **thread-safe**
+ * No shared mutable state
+
+ === 6. Error Handling
+
+ The worker includes error handling:
+
+ [source,ruby]
+ ----
+ begin
+   hash = Digest::SHA256.hexdigest(work.data)
+   Fractor::WorkResult.new(result: { hash: hash }, work: work)
+ rescue StandardError => e
+   Fractor::WorkResult.new(error: e.message, work: work)
+ end
+ ----
+
+ **Best practices**:
+
+ * Wrap processing in `begin/rescue`
+ * Include context in error messages
+ * Return `WorkResult` with error, not raise
+ * Allow supervisor to handle failures
+
+ == Use Cases and Patterns
+
+ === Large File Processing
+
+ Process files too large for memory:
+
+ [source,ruby]
+ ----
+ # Process a 10GB log file
+ hasher = FileHasher.new(
+   "huge.log",
+   chunk_size: 1_048_576, # 1MB chunks
+   worker_count: 16
+ )
+ hasher.hash_file
+ ----
+
+ **Benefits**:
+
+ * **Streaming processing**: No need to load entire file
+ * **Parallel speedup**: 10-15x faster on 16 cores
+ * **Memory efficient**: Only chunks in memory
+
+ === Content-Addressable Storage
+
+ Create unique identifiers for files:
+
+ [source,ruby]
+ ----
+ # Store file by its hash
+ file_hash = hasher.hash_file
+ storage_path = "store/#{file_hash[0..2]}/#{file_hash}"
+ FileUtils.cp(file_path, storage_path)
+ ----
+
+ **Use cases**:
+
+ * Deduplication systems
+ * Content-addressable storage
+ * Distributed file systems
+
+ === Data Integrity Verification
+
+ Verify file integrity after transfer:
+
+ [source,ruby]
+ ----
+ # Before transfer
+ original_hash = FileHasher.new(source_file).hash_file
+
+ # After transfer
+ transferred_hash = FileHasher.new(dest_file).hash_file
+
+ if original_hash == transferred_hash
+   puts "Transfer verified"
+ else
+   puts "Corruption detected"
+ end
+ ----
+
+ === Parallel Checksum Validation
+
+ Validate multiple files concurrently:
+
+ [source,ruby]
+ ----
+ files.each do |file|
+   supervisor.add_work_item(
+     FileHashWork.new(file, expected_hash: checksums[file])
+   )
+ end
+ ----
+
+ === Pattern: Hierarchical Reduction
+
+ Extend to multi-level hierarchies:
+
+ [source]
+ ----
+ Level 0: Individual chunks → chunk hashes
+ [c0, c1, c2, c3, c4, c5, c6, c7]
+
+ Level 1: Group into blocks → block hashes
+ [b0={c0,c1}, b1={c2,c3}, b2={c4,c5}, b3={c6,c7}]
+
+ Level 2: Group blocks → section hashes
+ [s0={b0,b1}, s1={b2,b3}]
+
+ Level 3: Final hash
+ final={s0,s1}
+ ----
+
+ **Benefits**:
+
+ * Allows **incremental verification**
+ * Supports **partial updates**
+ * Enables **merkle tree construction**
+
+ == Performance Tuning
+
+ === Chunk Size Selection
+
+ [source,ruby]
+ ----
+ # For CPU-bound hashing
+ chunk_size = 4096 # 4KB - many small chunks
+
+ # For I/O-bound processing
+ chunk_size = 1_048_576 # 1MB - fewer large chunks
+
+ # Adaptive sizing
+ chunk_size = [
+   file_size / (worker_count * 100), # Target ~100 chunks per worker
+   4096 # Minimum chunk size
+ ].max
+ ----
+
+ === Worker Count Optimization
+
+ [source,ruby]
+ ----
+ # CPU-bound: Match core count
+ worker_count = Etc.nprocessors
+
+ # I/O-bound: Can exceed core count
+ worker_count = Etc.nprocessors * 2
+
+ # Mixed workload: Use 1.5x cores
+ worker_count = (Etc.nprocessors * 1.5).to_i
+ ----
+
+ === Memory Considerations
+
+ [source,ruby]
+ ----
+ # Memory usage ≈ chunk_size × worker_count × 2
+ # (2x for input chunk + output result)
+
+ max_memory = 512 * 1024 * 1024 # 512MB
+ chunk_size = max_memory / (worker_count * 2)
+ ----
+
+ == Next Steps
+
+ After understanding hierarchical hashing, explore:
+
+ * **link:../pipeline_processing/README.adoc[Pipeline Processing]**: Multi-stage transformations
+ * **link:../scatter_gather/README.adoc[Scatter-Gather]**: Dynamic work distribution
+ * **link:../producer_subscriber/README.adoc[Producer-Subscriber]**: Streaming data patterns
+ * **link:../workflow/README.adoc[Workflow System]**: Complex multi-step pipelines
+
+ == Advanced Topics
+
+ === Resumable Processing
+
+ Add checkpointing for large files:
+
+ [source,ruby]
+ ----
+ def hash_file_resumable(checkpoint_file = nil)
+   completed = load_checkpoint(checkpoint_file) || []
+
+   chunks.each_with_index do |chunk, i|
+     next if completed.include?(i)
+
+     process_chunk(chunk)
+     save_checkpoint(checkpoint_file, completed << i)
+   end
+ end
+ ----
+
+ === Progress Tracking
+
+ Monitor processing progress:
+
+ [source,ruby]
+ ----
+ def hash_file_with_progress
+   total_chunks = (file_size / chunk_size.to_f).ceil
+
+   supervisor.on_result do |result|
+     completed = supervisor.results.size
+     progress = (completed / total_chunks.to_f * 100).round(2)
+     puts "Progress: #{progress}% (#{completed}/#{total_chunks})"
+   end
+
+   supervisor.run
+ end
+ ----
+
+ === Merkle Tree Construction
+
+ Build a merkle tree for verification:
+
+ [source,ruby]
+ ----
+ def build_merkle_tree
+   # Level 0: Leaf hashes (chunks)
+   leaves = hash_all_chunks
+
+   # Build tree bottom-up
+   tree = [leaves]
+   while tree.last.size > 1
+     parent_level = tree.last.each_slice(2).map do |pair|
+       Digest::SHA256.hexdigest(pair.join)
+     end
+     tree << parent_level
+   end
+
+   tree.last.first # Root hash
+ end
+ ----