fractor 0.1.6 → 0.1.7

Files changed (172)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +227 -102
  3. data/README.adoc +113 -1940
  4. data/docs/.lycheeignore +16 -0
  5. data/docs/Gemfile +24 -0
  6. data/docs/README.md +157 -0
  7. data/docs/_config.yml +151 -0
  8. data/docs/_features/error-handling.adoc +1192 -0
  9. data/docs/_features/index.adoc +80 -0
  10. data/docs/_features/monitoring.adoc +589 -0
  11. data/docs/_features/signal-handling.adoc +202 -0
  12. data/docs/_features/workflows.adoc +1235 -0
  13. data/docs/_guides/continuous-mode.adoc +736 -0
  14. data/docs/_guides/cookbook.adoc +1133 -0
  15. data/docs/_guides/index.adoc +55 -0
  16. data/docs/_guides/pipeline-mode.adoc +730 -0
  17. data/docs/_guides/troubleshooting.adoc +358 -0
  18. data/docs/_pages/architecture.adoc +1390 -0
  19. data/docs/_pages/core-concepts.adoc +1392 -0
  20. data/docs/_pages/design-principles.adoc +862 -0
  21. data/docs/_pages/getting-started.adoc +290 -0
  22. data/docs/_pages/installation.adoc +143 -0
  23. data/docs/_reference/api.adoc +1080 -0
  24. data/docs/_reference/error-reporting.adoc +670 -0
  25. data/docs/_reference/examples.adoc +181 -0
  26. data/docs/_reference/index.adoc +96 -0
  27. data/docs/_reference/troubleshooting.adoc +862 -0
  28. data/docs/_tutorials/complex-workflows.adoc +1022 -0
  29. data/docs/_tutorials/data-processing-pipeline.adoc +740 -0
  30. data/docs/_tutorials/first-application.adoc +384 -0
  31. data/docs/_tutorials/index.adoc +48 -0
  32. data/docs/_tutorials/long-running-services.adoc +931 -0
  33. data/docs/assets/images/favicon-16.png +0 -0
  34. data/docs/assets/images/favicon-32.png +0 -0
  35. data/docs/assets/images/favicon-48.png +0 -0
  36. data/docs/assets/images/favicon.ico +0 -0
  37. data/docs/assets/images/favicon.png +0 -0
  38. data/docs/assets/images/favicon.svg +45 -0
  39. data/docs/assets/images/fractor-icon.svg +49 -0
  40. data/docs/assets/images/fractor-logo.svg +61 -0
  41. data/docs/index.adoc +131 -0
  42. data/docs/lychee.toml +39 -0
  43. data/examples/api_aggregator/README.adoc +627 -0
  44. data/examples/api_aggregator/api_aggregator.rb +376 -0
  45. data/examples/auto_detection/README.adoc +407 -29
  46. data/examples/continuous_chat_common/message_protocol.rb +1 -1
  47. data/examples/error_reporting.rb +207 -0
  48. data/examples/file_processor/README.adoc +170 -0
  49. data/examples/file_processor/file_processor.rb +615 -0
  50. data/examples/file_processor/sample_files/invalid.csv +1 -0
  51. data/examples/file_processor/sample_files/orders.xml +24 -0
  52. data/examples/file_processor/sample_files/products.json +23 -0
  53. data/examples/file_processor/sample_files/users.csv +6 -0
  54. data/examples/hierarchical_hasher/README.adoc +629 -41
  55. data/examples/image_processor/README.adoc +610 -0
  56. data/examples/image_processor/image_processor.rb +349 -0
  57. data/examples/image_processor/processed_images/sample_10_processed.jpg.json +12 -0
  58. data/examples/image_processor/processed_images/sample_1_processed.jpg.json +12 -0
  59. data/examples/image_processor/processed_images/sample_2_processed.jpg.json +12 -0
  60. data/examples/image_processor/processed_images/sample_3_processed.jpg.json +12 -0
  61. data/examples/image_processor/processed_images/sample_4_processed.jpg.json +12 -0
  62. data/examples/image_processor/processed_images/sample_5_processed.jpg.json +12 -0
  63. data/examples/image_processor/processed_images/sample_6_processed.jpg.json +12 -0
  64. data/examples/image_processor/processed_images/sample_7_processed.jpg.json +12 -0
  65. data/examples/image_processor/processed_images/sample_8_processed.jpg.json +12 -0
  66. data/examples/image_processor/processed_images/sample_9_processed.jpg.json +12 -0
  67. data/examples/image_processor/test_images/sample_1.png +1 -0
  68. data/examples/image_processor/test_images/sample_10.png +1 -0
  69. data/examples/image_processor/test_images/sample_2.png +1 -0
  70. data/examples/image_processor/test_images/sample_3.png +1 -0
  71. data/examples/image_processor/test_images/sample_4.png +1 -0
  72. data/examples/image_processor/test_images/sample_5.png +1 -0
  73. data/examples/image_processor/test_images/sample_6.png +1 -0
  74. data/examples/image_processor/test_images/sample_7.png +1 -0
  75. data/examples/image_processor/test_images/sample_8.png +1 -0
  76. data/examples/image_processor/test_images/sample_9.png +1 -0
  77. data/examples/log_analyzer/README.adoc +662 -0
  78. data/examples/log_analyzer/log_analyzer.rb +579 -0
  79. data/examples/log_analyzer/sample_logs/apache.log +20 -0
  80. data/examples/log_analyzer/sample_logs/json.log +15 -0
  81. data/examples/log_analyzer/sample_logs/nginx.log +15 -0
  82. data/examples/log_analyzer/sample_logs/rails.log +29 -0
  83. data/examples/multi_work_type/README.adoc +576 -26
  84. data/examples/performance_monitoring.rb +120 -0
  85. data/examples/pipeline_processing/README.adoc +740 -26
  86. data/examples/pipeline_processing/pipeline_processing.rb +2 -2
  87. data/examples/priority_work_example.rb +155 -0
  88. data/examples/producer_subscriber/README.adoc +889 -46
  89. data/examples/scatter_gather/README.adoc +829 -27
  90. data/examples/simple/README.adoc +347 -0
  91. data/examples/specialized_workers/README.adoc +622 -26
  92. data/examples/specialized_workers/specialized_workers.rb +44 -8
  93. data/examples/stream_processor/README.adoc +206 -0
  94. data/examples/stream_processor/stream_processor.rb +284 -0
  95. data/examples/web_scraper/README.adoc +625 -0
  96. data/examples/web_scraper/web_scraper.rb +285 -0
  97. data/examples/workflow/README.adoc +406 -0
  98. data/examples/workflow/circuit_breaker/README.adoc +360 -0
  99. data/examples/workflow/circuit_breaker/circuit_breaker_workflow.rb +225 -0
  100. data/examples/workflow/conditional/README.adoc +483 -0
  101. data/examples/workflow/conditional/conditional_workflow.rb +215 -0
  102. data/examples/workflow/dead_letter_queue/README.adoc +374 -0
  103. data/examples/workflow/dead_letter_queue/dead_letter_queue_workflow.rb +217 -0
  104. data/examples/workflow/fan_out/README.adoc +381 -0
  105. data/examples/workflow/fan_out/fan_out_workflow.rb +202 -0
  106. data/examples/workflow/retry/README.adoc +248 -0
  107. data/examples/workflow/retry/retry_workflow.rb +195 -0
  108. data/examples/workflow/simple_linear/README.adoc +267 -0
  109. data/examples/workflow/simple_linear/simple_linear_workflow.rb +175 -0
  110. data/examples/workflow/simplified/README.adoc +329 -0
  111. data/examples/workflow/simplified/simplified_workflow.rb +222 -0
  112. data/exe/fractor +10 -0
  113. data/lib/fractor/cli.rb +288 -0
  114. data/lib/fractor/configuration.rb +307 -0
  115. data/lib/fractor/continuous_server.rb +60 -65
  116. data/lib/fractor/error_formatter.rb +72 -0
  117. data/lib/fractor/error_report_generator.rb +152 -0
  118. data/lib/fractor/error_reporter.rb +244 -0
  119. data/lib/fractor/error_statistics.rb +147 -0
  120. data/lib/fractor/execution_tracer.rb +162 -0
  121. data/lib/fractor/logger.rb +230 -0
  122. data/lib/fractor/main_loop_handler.rb +406 -0
  123. data/lib/fractor/main_loop_handler3.rb +135 -0
  124. data/lib/fractor/main_loop_handler4.rb +299 -0
  125. data/lib/fractor/performance_metrics_collector.rb +181 -0
  126. data/lib/fractor/performance_monitor.rb +215 -0
  127. data/lib/fractor/performance_report_generator.rb +202 -0
  128. data/lib/fractor/priority_work.rb +93 -0
  129. data/lib/fractor/priority_work_queue.rb +189 -0
  130. data/lib/fractor/result_aggregator.rb +32 -0
  131. data/lib/fractor/shutdown_handler.rb +168 -0
  132. data/lib/fractor/signal_handler.rb +80 -0
  133. data/lib/fractor/supervisor.rb +382 -269
  134. data/lib/fractor/supervisor_logger.rb +88 -0
  135. data/lib/fractor/version.rb +1 -1
  136. data/lib/fractor/work.rb +12 -0
  137. data/lib/fractor/work_distribution_manager.rb +151 -0
  138. data/lib/fractor/work_queue.rb +20 -0
  139. data/lib/fractor/work_result.rb +181 -9
  140. data/lib/fractor/worker.rb +73 -0
  141. data/lib/fractor/workflow/builder.rb +210 -0
  142. data/lib/fractor/workflow/chain_builder.rb +169 -0
  143. data/lib/fractor/workflow/circuit_breaker.rb +183 -0
  144. data/lib/fractor/workflow/circuit_breaker_orchestrator.rb +208 -0
  145. data/lib/fractor/workflow/circuit_breaker_registry.rb +112 -0
  146. data/lib/fractor/workflow/dead_letter_queue.rb +334 -0
  147. data/lib/fractor/workflow/execution_hooks.rb +39 -0
  148. data/lib/fractor/workflow/execution_strategy.rb +225 -0
  149. data/lib/fractor/workflow/execution_trace.rb +134 -0
  150. data/lib/fractor/workflow/helpers.rb +191 -0
  151. data/lib/fractor/workflow/job.rb +290 -0
  152. data/lib/fractor/workflow/job_dependency_validator.rb +120 -0
  153. data/lib/fractor/workflow/logger.rb +110 -0
  154. data/lib/fractor/workflow/pre_execution_context.rb +193 -0
  155. data/lib/fractor/workflow/retry_config.rb +156 -0
  156. data/lib/fractor/workflow/retry_orchestrator.rb +184 -0
  157. data/lib/fractor/workflow/retry_strategy.rb +93 -0
  158. data/lib/fractor/workflow/structured_logger.rb +30 -0
  159. data/lib/fractor/workflow/type_compatibility_validator.rb +222 -0
  160. data/lib/fractor/workflow/visualizer.rb +211 -0
  161. data/lib/fractor/workflow/workflow_context.rb +132 -0
  162. data/lib/fractor/workflow/workflow_executor.rb +669 -0
  163. data/lib/fractor/workflow/workflow_result.rb +55 -0
  164. data/lib/fractor/workflow/workflow_validator.rb +295 -0
  165. data/lib/fractor/workflow.rb +333 -0
  166. data/lib/fractor/wrapped_ractor.rb +66 -101
  167. data/lib/fractor/wrapped_ractor3.rb +161 -0
  168. data/lib/fractor/wrapped_ractor4.rb +242 -0
  169. data/lib/fractor.rb +92 -4
  170. metadata +179 -6
  171. data/tests/sample.rb.bak +0 -309
  172. data/tests/sample_working.rb.bak +0 -209
@@ -1,43 +1,845 @@
- = Scatter Gather Example
+ = Scatter-Gather Example
+ :toc: macro
+ :toc-title: Table of Contents
+ :toclevels: 3
 
- == Overview
+ toc::[]
 
- This example demonstrates the Scatter-Gather pattern with Fractor. In this pattern, work is scattered (distributed) across multiple workers for parallel processing, and then the results are gathered and combined.
+ == Purpose
 
- == Key Concepts
+ The Scatter-Gather example demonstrates parallel query execution across multiple heterogeneous data sources with intelligent result merging. It showcases how to distribute a single query to multiple backends simultaneously, collect responses, and aggregate them into a unified result set. This is a fundamental pattern for federated search, parallel database queries, and multi-source data aggregation systems.
 
- * *Scatter*: A primary task is broken down into multiple subtasks
- * *Parallel Processing*: Each subtask is processed concurrently by different workers
- * *Gather*: Results from all subtasks are collected and aggregated
- * *Final Processing*: The aggregated results are combined to form the final output
+ == Focus
 
- == Example Explanation
+ This example demonstrates:
 
- This example processes a large dataset by:
+ * **Parallel query distribution** to multiple data sources
+ * **Heterogeneous source handling** (database, API, cache, filesystem)
+ * **Concurrent execution** with independent source timing
+ * **Result aggregation** with source-weighted ranking
+ * **Source-specific optimization** strategies
+ * **Unified result merging** from diverse formats
 
- 1. Breaking it down into smaller chunks (scatter)
- 2. Processing each chunk in parallel using Fractor workers
- 3. Collecting the processed chunks (gather)
- 4. Combining the results for the final output
+ == Architecture
 
- == Features Demonstrated
+ === Scatter-Gather Flow Overview
 
- * Effective workload distribution
- * Parallel processing for improved performance
- * Result aggregation from multiple workers
- * Error handling in a distributed computation context
+ [source]
+ ----
+ ┌────────────────────────────────────────────────────────┐
+ │            User Query: "ruby concurrency"              │
+ └────────────────────────────────────────────────────────┘
+                           │
+                           │ Scatter phase
+                           │ Create work items for all sources
+                           ▼
+         ┌───────────────────────────────────┐
+         │   MultiSourceSearch Controller    │
+         │    Creates 4 SearchWork items     │
+         └───────────────────────────────────┘
+                           │
+      ┌─────────────┬──────┴──────┬─────────────┐
+      │             │             │             │
+      ▼             ▼             ▼             ▼
+ ┌──────────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐
+ │ Worker 1 │  │ Worker 2 │  │ Worker 3 │  │ Worker 4 │
+ │ Database │  │ API      │  │ Cache    │  │Filesystem│
+ │ Query    │  │ Query    │  │ Lookup   │  │ Search   │
+ └──────────┘  └──────────┘  └──────────┘  └──────────┘
+      │             │             │             │
+      │ ~150ms      │ ~250ms      │ ~20ms       │ ~120ms
+      │             │             │             │
+      ▼             ▼             ▼             ▼
+ ┌──────────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐
+ │ 7 hits   │  │ 5 hits   │  │ 3 hits   │  │ 9 hits   │
+ │DB Results│  │API Res.  │  │Cache Res.│  │File Res. │
+ └──────────┘  └──────────┘  └──────────┘  └──────────┘
+      │             │             │             │
+      └─────────────┴──────┬──────┴─────────────┘
+                           │
+                           │ Gather phase
+                           │ Collect all results
+                           ▼
+         ┌───────────────────────────────────┐
+         │        Result Aggregation         │
+         │  - Group by source                │
+         │  - Apply source weights           │
+         │  - Rank by weighted relevance     │
+         └───────────────────────────────────┘
+                           │
+                           ▼
+         ┌───────────────────────────────────┐
+         │   Unified Result Set (24 hits)    │
+         │   Sorted by weighted relevance    │
+         └───────────────────────────────────┘
+ ----
+
+ === Parallel Execution Timeline
+
+ [source]
+ ----
+ Time →
+ ┌──────────────────────────────────────────────────────┐
+ │ Cache     : ████                                     │ ~20ms  (fastest)
+ │ Filesystem: ████████████                             │ ~120ms
+ │ Database  : ███████████████                          │ ~150ms
+ │ API       : █████████████████████████                │ ~250ms (slowest)
+ └──────────────────────────────────────────────────────┘
+                  ▲                    ▲
+                  │                    │
+               Fastest              Slowest
+               completes            completes
+
+ Total Time: ~250ms (limited by slowest source)
+
+ Sequential execution would take: 20 + 120 + 150 + 250 = 540ms
+ Parallel speedup: 540ms / 250ms = 2.16x faster
+ ----
+
+ === Source Weighting and Ranking
+
+ [source]
+ ----
+ ┌─────────────────────────────────────────────────────┐
+ │              Raw Results from Sources               │
+ ├─────────────────────────────────────────────────────┤
+ │ Cache:      [0.9, 0.7, 0.6]        weight: 1.2      │
+ │ Database:   [0.8, 0.7, 0.5, ...]   weight: 1.0      │
+ │ API:        [0.9, 0.6, 0.4, ...]   weight: 0.8      │
+ │ Filesystem: [0.8, 0.7, 0.6, ...]   weight: 0.9      │
+ └─────────────────────────────────────────────────────┘
+                           │
+                           │ Apply weights
+                           ▼
+ ┌─────────────────────────────────────────────────────┐
+ │             Weighted Relevance Scores               │
+ ├─────────────────────────────────────────────────────┤
+ │ Cache[0]:      0.9 × 1.2 = 1.08  (highest)          │
+ │ Cache[1]:      0.7 × 1.2 = 0.84                     │
+ │ API[0]:        0.9 × 0.8 = 0.72                     │
+ │ Database[0]:   0.8 × 1.0 = 0.80                     │
+ │ Filesystem[0]: 0.8 × 0.9 = 0.72                     │
+ │ ...                                                 │
+ └─────────────────────────────────────────────────────┘
+                           │
+                           │ Sort descending
+                           ▼
+ ┌─────────────────────────────────────────────────────┐
+ │               Final Ranked Results                  │
+ ├─────────────────────────────────────────────────────┤
+ │ 1. Cache[0]:      1.08                              │
+ │ 2. Cache[1]:      0.84                              │
+ │ 3. Database[0]:   0.80                              │
+ │ 4. API[0]:        0.72                              │
+ │ 5. Filesystem[0]: 0.72                              │
+ │ ...                                                 │
+ └─────────────────────────────────────────────────────┘
+ ----
+
+ == Key Components
+
+ === SearchWork: Source-Specific Work Unit
+
+ The `SearchWork` class carries query and source information:
+
+ [source,ruby]
+ ----
+ class SearchWork < Fractor::Work
+   def initialize(query, source = :default, query_params = {})
+     super({
+       query: query,               # <1>
+       source: source,             # <2>
+       query_params: query_params  # <3>
+     })
+   end
+
+   def source
+     input[:source]
+   end
+ end
+ ----
+ <1> The search query string
+ <2> Target data source (`:database`, `:api`, `:cache`, `:filesystem`)
+ <3> Source-specific query parameters
+
+ Purpose:
+
+ * **Source routing**: Direct work to appropriate handler
+ * **Parameter customization**: Each source has specific options
+ * **Parallel execution**: All sources queried simultaneously
+
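A quick sketch of how such work items behave, using only the constructor and `source` accessor defined above (the query and parameter values are illustrative, not taken from the example):

[source,ruby]
----
# Illustrative values; one work item per target source.
work_items = [
  SearchWork.new("ruby concurrency", :database, { max_results: 50 }),
  SearchWork.new("ruby concurrency", :cache, { max_age: 3600 })
]

work_items.map(&:source) # => [:database, :cache]
----
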
+ === SearchWorker: Polymorphic Source Handler
+
+ The `SearchWorker` routes to source-specific search logic:
+
+ [source,ruby]
+ ----
+ class SearchWorker < Fractor::Worker
+   def process(work)
+     setup_source(work.source) # <1>
+
+     result = case work.source # <2>
+              when :database then search_database(work)
+              when :api then search_api(work)
+              when :cache then search_cache(work)
+              when :filesystem then search_filesystem(work)
+              else
+                return Fractor::WorkResult.new(
+                  error: ArgumentError.new("Unknown source: #{work.source}"),
+                  work: work
+                )
+              end
+
+     Fractor::WorkResult.new(
+       result: {
+         source: work.source,         # <3>
+         query: work.query,
+         hits: result[:hits],         # <4>
+         metadata: result[:metadata], # <5>
+         timing: result[:timing]      # <6>
+       },
+       work: work
+     )
+   end
+
+   private
+
+   def search_database(work)
+     sleep(rand(0.05..0.2)) # Simulate query time
+
+     record_count = rand(3..10)
+     hits = Array.new(record_count) do |i|
+       {
+         id: "db-#{i + 1}",
+         title: "Database Result #{i + 1} for '#{work.query}'",
+         content: "This is database content for #{work.query}",
+         relevance: rand(0.1..1.0).round(2) # <7>
+       }
+     end
+
+     {
+       hits: hits,
+       metadata: {
+         source_type: "PostgreSQL Database",
+         total_available: record_count + rand(10..50)
+       },
+       timing: rand(0.01..0.3).round(3)
+     }
+   end
+
+   def search_cache(work)
+     sleep(rand(0.01..0.1)) # Fast cache lookup
+
+     cache_hit = [true, true, false].sample # <8>
+
+     if cache_hit
+       # Return cached results
+       { hits: [...], metadata: { cache_hit: true } }
+     else
+       # Cache miss
+       { hits: [], metadata: { cache_hit: false } }
+     end
+   end
+ end
+ ----
+ <1> Initialize connection to data source
+ <2> Route to appropriate search method
+ <3> Preserve source identifier for merging
+ <4> Search results with relevance scores
+ <5> Source-specific metadata
+ <6> Execution timing for performance analysis
+ <7> Intrinsic relevance score (0.0-1.0)
+ <8> Simulate cache hit/miss scenario
+
+ Design benefits:
+
+ * **Unified interface**: All sources handled by one worker type
+ * **Source isolation**: Each source has independent logic
+ * **Timing capture**: Enables performance profiling
+ * **Flexible results**: Source-specific metadata preserved
+
+ === MultiSourceSearch: Scatter-Gather Orchestrator
+
+ The `MultiSourceSearch` coordinates the entire process:
+
+ [source,ruby]
+ ----
+ class MultiSourceSearch
+   def search(query, sources = nil)
+     sources ||= [ # <1>
+       { source: :database, params: { max_results: 50 } },
+       { source: :api, params: { format: "json" } },
+       { source: :cache, params: { max_age: 3600 } },
+       { source: :filesystem, params: { extensions: %w[txt md] } }
+     ]
+
+     start_time = Time.now
+
+     # Scatter: Create work items
+     search_work_items = sources.map do |source|
+       SearchWork.new(query, source[:source], source[:params]) # <2>
+     end
+
+     @supervisor.add_work_items(search_work_items)
+     @supervisor.run # <3>
+
+     end_time = Time.now
+     total_time = end_time - start_time
+
+     # Gather: Merge results
+     @merged_results = merge_results(@supervisor.results, total_time) # <4>
+   end
+
+   private
+
+   def merge_results(results_aggregator, total_time)
+     results_by_source = {}
+     total_hits = 0
+
+     # Group by source
+     results_aggregator.results.each do |result|
+       source = result.result[:source]
+       results_by_source[source] = result.result # <5>
+       total_hits += result.result[:hits].size
+     end
+
+     # Apply source weights
+     all_hits = []
+     results_by_source.each do |source, result|
+       source_weight = case source # <6>
+                       when :database then 1.0
+                       when :api then 0.8
+                       when :cache then 1.2 # Prioritize cache
+                       when :filesystem then 0.9
+                       end
+
+       result[:hits].each do |hit|
+         all_hits << {
+           id: hit[:id],
+           title: hit[:title],
+           source: source,
+           original_relevance: hit[:relevance],
+           weighted_relevance: hit[:relevance] * source_weight # <7>
+         }
+       end
+     end
 
- == Running the Example
+     # Rank by weighted relevance
+     ranked_hits = all_hits.sort_by { |hit| -hit[:weighted_relevance] } # <8>
 
- [source,sh]
+     {
+       query: query,
+       total_hits: total_hits,
+       execution_time: total_time,
+       sources: results_by_source.keys,
+       ranked_results: ranked_hits, # <9>
+       source_details: results_by_source
+     }
+   end
+ end
  ----
- ruby examples/scatter_gather/scatter_gather.rb
+ <1> Define all data sources to query
+ <2> Create parallel work items (scatter)
+ <3> Execute all queries concurrently
+ <4> Aggregate and rank results (gather)
+ <5> Group results by originating source
+ <6> Define source-specific trust weights
+ <7> Calculate weighted relevance score
+ <8> Sort by weighted relevance (descending)
+ <9> Return unified, ranked result set
+
+ Orchestration features:
+
+ * **Parallel dispatch**: All sources queried at once
+ * **Wait-for-all**: Collects all results before merging
+ * **Source weighting**: Prioritizes trusted sources
+ * **Unified ranking**: Single sorted result list
+
+ == Usage
+
+ .Basic usage
+ [example]
+ ====
+ [source,bash]
+ ----
+ # Run with default query
+ ruby scatter_gather.rb
+
+ # Search with custom query
+ ruby scatter_gather.rb "ruby concurrency patterns"
+
+ # Use more workers
+ ruby scatter_gather.rb "database optimization" 8
+ ----
+ ====
+
+ .Programmatic usage
+ [example]
+ ====
+ [source,ruby]
+ ----
+ require_relative "scatter_gather"
+
+ # Create multi-source search
+ search = ScatterGather::MultiSourceSearch.new(4)
+
+ # Execute parallel search
+ results = search.search("machine learning")
+
+ # Access results
+ puts "Total hits: #{results[:total_hits]}"
+ puts "Execution time: #{results[:execution_time]}s"
+
+ # Display top results
+ results[:ranked_results].take(10).each do |hit|
+   puts "#{hit[:title]} (#{hit[:source]}, score: #{hit[:weighted_relevance]})"
+ end
  ----
+ ====
 
  == Expected Output
 
- The example will show:
- * The input data being broken into chunks
- * Workers processing the chunks in parallel
- * Results being gathered from workers
- * The final aggregated results
+ [source,text]
+ ----
+ Starting Scatter-Gather Search Example
+ ======================================
+ This example demonstrates searching multiple data sources concurrently:
+ 1. Database - Simulates SQL database searches
+ 2. API - Simulates external REST API calls
+ 3. Cache - Simulates in-memory cache lookups
+ 4. Filesystem - Simulates searching through files
+
+ Search Results Summary:
+ ----------------------
+ Query: ruby concurrency patterns
+ Total hits: 24
+ Total execution time: 0.253 seconds
+ Sources searched: database, api, cache, filesystem
+
+ Top 5 Results (by relevance):
+ 1. Cached Result 1 for 'ruby concurrency patterns' (Source: cache, Relevance: 1.08)
+    This is cached content for ruby concurrency patterns...
+
+ 2. Cached Result 2 for 'ruby concurrency patterns' (Source: cache, Relevance: 0.96)
+    This is cached content for ruby concurrency patterns...
+
+ 3. Database Result 1 for 'ruby concurrency patterns' (Source: database, Relevance: 0.85)
+    This is database content for ruby concurrency patterns...
+
+ 4. File Result 1 for 'ruby concurrency patterns' (Source: filesystem, Relevance: 0.81)
+    This is file content matching ruby concurrency patterns...
+
+ 5. API Result 1 for 'ruby concurrency patterns' (Source: api, Relevance: 0.72)
+    This is API content for ruby concurrency patterns...
+
+ Source Details:
+ - Database (7 results, 0.152 sec)
+   Metadata: {:source_type=>"PostgreSQL Database", :total_available=>53}
+ - Api (5 results, 0.245 sec)
+   Metadata: {:source_type=>"External REST API", :provider=>"Google"}
+ - Cache (3 results, 0.018 sec)
+   Metadata: {:source_type=>"In-memory Cache", :cache_hit=>true}
+ - Filesystem (9 results, 0.128 sec)
+   Metadata: {:source_type=>"File System", :files_scanned=>342}
+ ----
+
+ == Learning Points
+
+ === 1. Scatter-Gather Pattern
+
+ The pattern has two distinct phases:
+
+ **Scatter phase**:
+ [source,ruby]
+ ----
+ # Distribute work to all sources
+ sources.each do |source|
+   supervisor.add_work_item(SearchWork.new(query, source))
+ end
+ supervisor.run # All execute in parallel
+ ----
+
+ **Gather phase**:
+ [source,ruby]
+ ----
+ # Collect and merge results
+ all_results = supervisor.results.results
+ merged = aggregate_results(all_results)
+ ----
+
+ **Key characteristics**:
+
+ * **Fork-join parallelism**: All work starts together, results combined at end
+ * **Independent execution**: Sources don't communicate with each other
+ * **Synchronization point**: Gather waits for all sources to complete
+ * **Result aggregation**: Combine heterogeneous formats into unified view
+
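The fork-join shape described by these characteristics can be illustrated with plain Ruby threads, independent of Fractor's worker machinery (an editorial sketch with simulated latencies, not code from the package):

[source,ruby]
----
# Scatter: one thread per source stands in for one worker per source.
threads = %i[database api cache filesystem].map do |source|
  Thread.new do
    sleep(rand(0.02..0.25))              # simulated source latency
    { source: source, hits: rand(3..9) } # simulated result
  end
end

# Gather: Thread#value joins each thread, so this map is the
# synchronization point; it returns only when every source is done.
results = threads.map(&:value)
puts "total hits: #{results.sum { |r| r[:hits] }}"
----
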
+ === 2. Source-Weighted Ranking
+
+ Different sources have different trust levels:
+
+ [source,ruby]
+ ----
+ source_weights = {
+   cache: 1.2,      # Most trusted (already validated)
+   database: 1.0,   # Baseline trust
+   filesystem: 0.9, # Slightly lower trust
+   api: 0.8         # External, less trusted
+ }
+
+ weighted_score = intrinsic_relevance × source_weight
+ ----
+
+ **Rationale**:
+
+ * **Cache**: Previously validated results, highest trust
+ * **Database**: Internal, controlled data, baseline
+ * **API**: External data, may be stale or inaccurate
+ * **Filesystem**: Unstructured, harder to validate
+
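The `weighted_score` line above is pseudocode (note the `×` sign). In runnable Ruby it is a plain multiplication; the values below are taken from the ranking diagram earlier in this document:

[source,ruby]
----
source_weights = { cache: 1.2, database: 1.0, filesystem: 0.9, api: 0.8 }

hit = { source: :cache, relevance: 0.9 } # Cache[0] from the diagram
weighted_score = hit[:relevance] * source_weights[hit[:source]]
# ≈ 1.08, the top-ranked score in the example
----
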
+ === 3. Performance Analysis
+
+ **Total time = max(source_times)**:
+
+ [source]
+ ----
+ Source times: [150ms, 250ms, 20ms, 120ms]
+ Total time:   250ms (limited by slowest)
+
+ Speedup = Σ(source_times) / max(source_times)
+         = (150 + 250 + 20 + 120) / 250
+         = 540 / 250
+         = 2.16x
+
+ Parallel efficiency = Speedup / num_sources
+                     = 2.16 / 4
+                     = 54%
+ ----
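
The same arithmetic can be checked with a few lines of Ruby, using the timings from the timeline section:

[source,ruby]
----
source_times = { database: 150, api: 250, cache: 20, filesystem: 120 } # ms

sequential = source_times.values.sum     # => 540
parallel   = source_times.values.max     # => 250 (slowest source)
speedup    = sequential.fdiv(parallel)   # => 2.16
efficiency = speedup / source_times.size # => 0.54, i.e. 54%
----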
+
+ **Efficiency factors**:
+
+ * **Load imbalance**: Slow sources dominate total time
+ * **Overhead**: Ractor creation, synchronization
+ * **I/O bound**: Network/disk latency, not CPU
+
+ === 4. Cache Miss Handling
+
+ The cache may not have results:
+
+ [source,ruby]
+ ----
+ def search_cache(work)
+   if cache_hit?
+     return cached_results
+   else
+     return { hits: [], metadata: { cache_hit: false } }
+   end
+ end
+
+ # In merge_results
+ if source_result[:hits].empty?
+   # Don't penalize total score for cache miss
+   # Other sources provide results
+ end
+ ----
+
+ **Strategy**:
+
+ * Cache misses return empty results, not errors
+ * Merge phase handles varying result counts
+ * Total result count not affected by misses
+
+ === 5. Heterogeneous Result Formats
+
+ Different sources return different structures:
+
+ [source,ruby]
+ ----
+ # Database results
+ {
+   id: "db-123",
+   title: "...",
+   content: "...",
+   relevance: 0.85
+ }
+
+ # API results
+ {
+   id: "api-456",
+   title: "...",
+   content: "...",
+   relevance: 0.72,
+   provider: "Google" # Extra field
+ }
+
+ # Filesystem results
+ {
+   id: "file-789",
+   title: "...",
+   path: "/path/to/file", # Different structure
+   content: "...",
+   relevance: 0.91
+ }
+ ----
+
+ **Normalization**:
+
+ * Extract common fields (id, title, content, relevance)
+ * Preserve source-specific metadata separately
+ * Unified ranking uses normalized fields
+
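One way to express this normalization is a small mapping step. The sketch below assumes the three hit shapes shown above; `normalize_hit` is a hypothetical helper, not an API of the package:

[source,ruby]
----
# Keep the fields every source shares; stash the rest separately.
COMMON_FIELDS = %i[id title content relevance].freeze

def normalize_hit(hit, source)
  {
    **hit.slice(*COMMON_FIELDS), # unified fields used for ranking
    source: source,
    extra: hit.reject { |k, _| COMMON_FIELDS.include?(k) } # e.g. :path
  }
end

normalize_hit(
  { id: "file-789", title: "...", path: "/path/to/file",
    content: "...", relevance: 0.91 },
  :filesystem
)
# => { id: "file-789", title: "...", content: "...", relevance: 0.91,
#      source: :filesystem, extra: { path: "/path/to/file" } }
----
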
+ === 6. Error Handling
+
+ Individual source failures don't stop other sources:
+
+ [source,ruby]
+ ----
+ def process(work)
+   begin
+     result = search_source(work.source, work.query)
+     Fractor::WorkResult.new(result: result, work: work)
+   rescue StandardError => e
+     # Return error for this source, others continue
+     Fractor::WorkResult.new(
+       error: "#{work.source} failed: #{e.message}",
+       work: work
+     )
+   end
+ end
+
+ # In merge_results
+ results.select { |r| r.success? }.each do |result|
+   # Only process successful results
+   merge_into_final_set(result)
+ end
+ ----
+
+ == Use Cases and Patterns
+
+ === Federated Search
+
+ Search across multiple databases:
+
+ [source,ruby]
+ ----
+ sources = [
+   { source: :postgres, params: { schema: "public" } },
+   { source: :elasticsearch, params: { index: "documents" } },
+   { source: :redis, params: { pattern: "*" } },
+   { source: :mongodb, params: { collection: "items" } }
+ ]
+
+ search.search("user query", sources)
+ ----
+
+ === Multi-Cloud Query
+
+ Query services across cloud providers:
+
+ [source,ruby]
+ ----
+ sources = [
+   { source: :aws_s3, params: { bucket: "data" } },
+   { source: :gcp_storage, params: { bucket: "archive" } },
+   { source: :azure_blob, params: { container: "files" } }
+ ]
+
+ search.search("document.pdf", sources)
+ ----
+
+ === Aggregated Pricing
+
+ Compare prices from multiple vendors:
+
+ [source,ruby]
+ ----
+ def search_vendor(work)
+   prices = fetch_prices(work.query, work.source)
+
+   {
+     hits: prices.map { |p| { price: p, vendor: work.source } },
+     metadata: { currency: "USD", last_updated: Time.now }
+   }
+ end
+
+ # Merge sorts by price instead of relevance
+ def merge_results(results)
+   all_prices = results.flat_map { |r| r[:hits] }
+   all_prices.sort_by { |p| p[:price] } # Lowest first
+ end
+ ----
+
+ === Monitoring Dashboard
+
+ Query multiple monitoring sources:
+
+ [source,ruby]
+ ----
+ sources = [
+   { source: :prometheus, params: { metric: "cpu_usage" } },
+   { source: :cloudwatch, params: { namespace: "AWS/EC2" } },
+   { source: :datadog, params: { query: "avg:system.cpu.usage" } },
+   { source: :newrelic, params: { metric: "CPU/User Time" } }
+ ]
+
+ # Aggregate metrics
+ metrics = search.search("cpu_usage", sources)
+ average_cpu = metrics[:ranked_results].map { |m| m[:value] }.sum / metrics[:total_hits]
+ ----
+
+ == Advanced Patterns
+
+ === Timeout Handling
+
+ Set per-source timeouts:
+
+ [source,ruby]
+ ----
+ def process(work)
+   Timeout.timeout(work.query_params[:timeout] || 5) do
+     search_source(work.source, work.query)
+   end
+ rescue Timeout::Error
+   Fractor::WorkResult.new(
+     result: { hits: [], metadata: { timeout: true } },
+     work: work
+   )
+ end
+ ----
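
`Timeout` lives in Ruby's standard library and is not loaded automatically, so a sketch like the one above also needs a require at the top of the file:

[source,ruby]
----
require "timeout" # stdlib; provides Timeout.timeout used above
----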
+
+ === Fallback Sources
+
+ Use backup sources if primary fails:
+
+ [source,ruby]
+ ----
+ def search(query)
+   primary_sources = [:cache, :database]
+   fallback_sources = [:api, :filesystem]
+
+   # Try primary sources first
+   results = scatter_gather(query, primary_sources)
+
+   # If insufficient results, try fallbacks
+   if results[:total_hits] < MIN_RESULTS
+     fallback_results = scatter_gather(query, fallback_sources)
+     results = merge(results, fallback_results)
+   end
+
+   results
+ end
+ ----
+
+ === Progressive Results
+
+ Return fast results immediately, slower later:
+
+ [source,ruby]
+ ----
+ def search_progressive(query)
+   fast_sources = [:cache]
+   slow_sources = [:database, :api, :filesystem]
+
+   # Return cache results immediately
+   fast_results = scatter_gather(query, fast_sources)
+   yield fast_results if block_given?
+
+   # Add slow results as they arrive
+   slow_results = scatter_gather(query, slow_sources)
+   yield merge(fast_results, slow_results) if block_given?
+ end
+ ----
+
+ === Result Deduplication
+
+ Remove duplicate results across sources:
+
+ [source,ruby]
+ ----
+ def merge_results(results)
+   all_hits = results.flat_map { |r| r[:hits] }
+
+   # Deduplicate by content similarity
+   unique_hits = []
+   all_hits.each do |hit|
+     unless unique_hits.any? { |h| similar?(h, hit) }
+       unique_hits << hit
+     end
+   end
+
+   unique_hits.sort_by { |h| -h[:weighted_relevance] }
+ end
+
+ def similar?(hit1, hit2)
+   # Simple deduplication by title similarity
+   hit1[:title].downcase == hit2[:title].downcase
+ end
+ ----
+
+ == Performance Tuning
+
+ === Worker Pool Sizing
+
+ Match workers to data sources:
+
+ [source,ruby]
+ ----
+ # Option 1: One worker per source
+ worker_count = sources.size
+
+ # Option 2: More workers than sources (for queueing)
+ worker_count = sources.size * 2
+
+ # Option 3: Match to available cores
+ worker_count = [sources.size, Etc.nprocessors].min
+ ----
+
+ === Source Prioritization
+
+ Query fast sources first:
+
+ [source,ruby]
+ ----
+ sources_by_speed = [
+   { source: :cache, expected_time: 0.02 },
+   { source: :database, expected_time: 0.15 },
+   { source: :filesystem, expected_time: 0.12 },
+   { source: :api, expected_time: 0.25 }
+ ].sort_by { |s| s[:expected_time] }
+
+ # Start fast sources first for early results
+ sources_by_speed.each do |source|
+   supervisor.add_work_item(SearchWork.new(query, source[:source]))
+ end
+ ----
+
+ === Connection Pooling
+
+ Reuse connections across searches:
+
+ [source,ruby]
+ ----
+ class SearchWorker < Fractor::Worker
+   def initialize
+     super
+     @connections = {
+       database: connect_to_database,
+       api: initialize_api_client
+     }
+   end
+
+   def process(work)
+     conn = @connections[work.source]
+     search_with_connection(conn, work.query)
+   end
+ end
+ ----
+
+ == Next Steps
+
+ After understanding scatter-gather, explore:
+
+ * **link:../producer_subscriber/README.adoc[Producer-Subscriber]**: Hierarchical work decomposition
+ * **link:../pipeline_processing/README.adoc[Pipeline Processing]**: Sequential transformations
+ * **link:../hierarchical_hasher/README.adoc[Hierarchical Hasher]**: Map-reduce patterns
+ * **link:../workflow/README.adoc[Workflow System]**: Complex orchestration with dependencies