fractor 0.1.6 → 0.1.7
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +227 -102
- data/README.adoc +113 -1940
- data/docs/.lycheeignore +16 -0
- data/docs/Gemfile +24 -0
- data/docs/README.md +157 -0
- data/docs/_config.yml +151 -0
- data/docs/_features/error-handling.adoc +1192 -0
- data/docs/_features/index.adoc +80 -0
- data/docs/_features/monitoring.adoc +589 -0
- data/docs/_features/signal-handling.adoc +202 -0
- data/docs/_features/workflows.adoc +1235 -0
- data/docs/_guides/continuous-mode.adoc +736 -0
- data/docs/_guides/cookbook.adoc +1133 -0
- data/docs/_guides/index.adoc +55 -0
- data/docs/_guides/pipeline-mode.adoc +730 -0
- data/docs/_guides/troubleshooting.adoc +358 -0
- data/docs/_pages/architecture.adoc +1390 -0
- data/docs/_pages/core-concepts.adoc +1392 -0
- data/docs/_pages/design-principles.adoc +862 -0
- data/docs/_pages/getting-started.adoc +290 -0
- data/docs/_pages/installation.adoc +143 -0
- data/docs/_reference/api.adoc +1080 -0
- data/docs/_reference/error-reporting.adoc +670 -0
- data/docs/_reference/examples.adoc +181 -0
- data/docs/_reference/index.adoc +96 -0
- data/docs/_reference/troubleshooting.adoc +862 -0
- data/docs/_tutorials/complex-workflows.adoc +1022 -0
- data/docs/_tutorials/data-processing-pipeline.adoc +740 -0
- data/docs/_tutorials/first-application.adoc +384 -0
- data/docs/_tutorials/index.adoc +48 -0
- data/docs/_tutorials/long-running-services.adoc +931 -0
- data/docs/assets/images/favicon-16.png +0 -0
- data/docs/assets/images/favicon-32.png +0 -0
- data/docs/assets/images/favicon-48.png +0 -0
- data/docs/assets/images/favicon.ico +0 -0
- data/docs/assets/images/favicon.png +0 -0
- data/docs/assets/images/favicon.svg +45 -0
- data/docs/assets/images/fractor-icon.svg +49 -0
- data/docs/assets/images/fractor-logo.svg +61 -0
- data/docs/index.adoc +131 -0
- data/docs/lychee.toml +39 -0
- data/examples/api_aggregator/README.adoc +627 -0
- data/examples/api_aggregator/api_aggregator.rb +376 -0
- data/examples/auto_detection/README.adoc +407 -29
- data/examples/continuous_chat_common/message_protocol.rb +1 -1
- data/examples/error_reporting.rb +207 -0
- data/examples/file_processor/README.adoc +170 -0
- data/examples/file_processor/file_processor.rb +615 -0
- data/examples/file_processor/sample_files/invalid.csv +1 -0
- data/examples/file_processor/sample_files/orders.xml +24 -0
- data/examples/file_processor/sample_files/products.json +23 -0
- data/examples/file_processor/sample_files/users.csv +6 -0
- data/examples/hierarchical_hasher/README.adoc +629 -41
- data/examples/image_processor/README.adoc +610 -0
- data/examples/image_processor/image_processor.rb +349 -0
- data/examples/image_processor/processed_images/sample_10_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_1_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_2_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_3_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_4_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_5_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_6_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_7_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_8_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_9_processed.jpg.json +12 -0
- data/examples/image_processor/test_images/sample_1.png +1 -0
- data/examples/image_processor/test_images/sample_10.png +1 -0
- data/examples/image_processor/test_images/sample_2.png +1 -0
- data/examples/image_processor/test_images/sample_3.png +1 -0
- data/examples/image_processor/test_images/sample_4.png +1 -0
- data/examples/image_processor/test_images/sample_5.png +1 -0
- data/examples/image_processor/test_images/sample_6.png +1 -0
- data/examples/image_processor/test_images/sample_7.png +1 -0
- data/examples/image_processor/test_images/sample_8.png +1 -0
- data/examples/image_processor/test_images/sample_9.png +1 -0
- data/examples/log_analyzer/README.adoc +662 -0
- data/examples/log_analyzer/log_analyzer.rb +579 -0
- data/examples/log_analyzer/sample_logs/apache.log +20 -0
- data/examples/log_analyzer/sample_logs/json.log +15 -0
- data/examples/log_analyzer/sample_logs/nginx.log +15 -0
- data/examples/log_analyzer/sample_logs/rails.log +29 -0
- data/examples/multi_work_type/README.adoc +576 -26
- data/examples/performance_monitoring.rb +120 -0
- data/examples/pipeline_processing/README.adoc +740 -26
- data/examples/pipeline_processing/pipeline_processing.rb +2 -2
- data/examples/priority_work_example.rb +155 -0
- data/examples/producer_subscriber/README.adoc +889 -46
- data/examples/scatter_gather/README.adoc +829 -27
- data/examples/simple/README.adoc +347 -0
- data/examples/specialized_workers/README.adoc +622 -26
- data/examples/specialized_workers/specialized_workers.rb +44 -8
- data/examples/stream_processor/README.adoc +206 -0
- data/examples/stream_processor/stream_processor.rb +284 -0
- data/examples/web_scraper/README.adoc +625 -0
- data/examples/web_scraper/web_scraper.rb +285 -0
- data/examples/workflow/README.adoc +406 -0
- data/examples/workflow/circuit_breaker/README.adoc +360 -0
- data/examples/workflow/circuit_breaker/circuit_breaker_workflow.rb +225 -0
- data/examples/workflow/conditional/README.adoc +483 -0
- data/examples/workflow/conditional/conditional_workflow.rb +215 -0
- data/examples/workflow/dead_letter_queue/README.adoc +374 -0
- data/examples/workflow/dead_letter_queue/dead_letter_queue_workflow.rb +217 -0
- data/examples/workflow/fan_out/README.adoc +381 -0
- data/examples/workflow/fan_out/fan_out_workflow.rb +202 -0
- data/examples/workflow/retry/README.adoc +248 -0
- data/examples/workflow/retry/retry_workflow.rb +195 -0
- data/examples/workflow/simple_linear/README.adoc +267 -0
- data/examples/workflow/simple_linear/simple_linear_workflow.rb +175 -0
- data/examples/workflow/simplified/README.adoc +329 -0
- data/examples/workflow/simplified/simplified_workflow.rb +222 -0
- data/exe/fractor +10 -0
- data/lib/fractor/cli.rb +288 -0
- data/lib/fractor/configuration.rb +307 -0
- data/lib/fractor/continuous_server.rb +60 -65
- data/lib/fractor/error_formatter.rb +72 -0
- data/lib/fractor/error_report_generator.rb +152 -0
- data/lib/fractor/error_reporter.rb +244 -0
- data/lib/fractor/error_statistics.rb +147 -0
- data/lib/fractor/execution_tracer.rb +162 -0
- data/lib/fractor/logger.rb +230 -0
- data/lib/fractor/main_loop_handler.rb +406 -0
- data/lib/fractor/main_loop_handler3.rb +135 -0
- data/lib/fractor/main_loop_handler4.rb +299 -0
- data/lib/fractor/performance_metrics_collector.rb +181 -0
- data/lib/fractor/performance_monitor.rb +215 -0
- data/lib/fractor/performance_report_generator.rb +202 -0
- data/lib/fractor/priority_work.rb +93 -0
- data/lib/fractor/priority_work_queue.rb +189 -0
- data/lib/fractor/result_aggregator.rb +32 -0
- data/lib/fractor/shutdown_handler.rb +168 -0
- data/lib/fractor/signal_handler.rb +80 -0
- data/lib/fractor/supervisor.rb +382 -269
- data/lib/fractor/supervisor_logger.rb +88 -0
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/work.rb +12 -0
- data/lib/fractor/work_distribution_manager.rb +151 -0
- data/lib/fractor/work_queue.rb +20 -0
- data/lib/fractor/work_result.rb +181 -9
- data/lib/fractor/worker.rb +73 -0
- data/lib/fractor/workflow/builder.rb +210 -0
- data/lib/fractor/workflow/chain_builder.rb +169 -0
- data/lib/fractor/workflow/circuit_breaker.rb +183 -0
- data/lib/fractor/workflow/circuit_breaker_orchestrator.rb +208 -0
- data/lib/fractor/workflow/circuit_breaker_registry.rb +112 -0
- data/lib/fractor/workflow/dead_letter_queue.rb +334 -0
- data/lib/fractor/workflow/execution_hooks.rb +39 -0
- data/lib/fractor/workflow/execution_strategy.rb +225 -0
- data/lib/fractor/workflow/execution_trace.rb +134 -0
- data/lib/fractor/workflow/helpers.rb +191 -0
- data/lib/fractor/workflow/job.rb +290 -0
- data/lib/fractor/workflow/job_dependency_validator.rb +120 -0
- data/lib/fractor/workflow/logger.rb +110 -0
- data/lib/fractor/workflow/pre_execution_context.rb +193 -0
- data/lib/fractor/workflow/retry_config.rb +156 -0
- data/lib/fractor/workflow/retry_orchestrator.rb +184 -0
- data/lib/fractor/workflow/retry_strategy.rb +93 -0
- data/lib/fractor/workflow/structured_logger.rb +30 -0
- data/lib/fractor/workflow/type_compatibility_validator.rb +222 -0
- data/lib/fractor/workflow/visualizer.rb +211 -0
- data/lib/fractor/workflow/workflow_context.rb +132 -0
- data/lib/fractor/workflow/workflow_executor.rb +669 -0
- data/lib/fractor/workflow/workflow_result.rb +55 -0
- data/lib/fractor/workflow/workflow_validator.rb +295 -0
- data/lib/fractor/workflow.rb +333 -0
- data/lib/fractor/wrapped_ractor.rb +66 -101
- data/lib/fractor/wrapped_ractor3.rb +161 -0
- data/lib/fractor/wrapped_ractor4.rb +242 -0
- data/lib/fractor.rb +92 -4
- metadata +179 -6
- data/tests/sample.rb.bak +0 -309
- data/tests/sample_working.rb.bak +0 -209
@@ -1,43 +1,845 @@
-= Scatter
+= Scatter-Gather Example
+:toc: macro
+:toc-title: Table of Contents
+:toclevels: 3
 
-
+toc::[]
 
-
+== Purpose
 
-
+The Scatter-Gather example demonstrates parallel query execution across multiple heterogeneous data sources with intelligent result merging. It showcases how to distribute a single query to multiple backends simultaneously, collect responses, and aggregate them into a unified result set. This is a fundamental pattern for federated search, parallel database queries, and multi-source data aggregation systems.
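For orientation, the same scatter/gather flow can be sketched with plain Ruby threads before looking at the Fractor version in the diff below. This is an illustrative aside only, not part of the packaged example; the stand-in backends here are lambdas:

[source,ruby]
----
# Minimal, framework-free sketch of scatter-gather using plain threads.
# The backends are stand-in lambdas; the packaged example uses Fractor.
backends = {
  database: ->(q) { [{ title: "db hit for #{q}", relevance: 0.8 }] },
  api:      ->(q) { [{ title: "api hit for #{q}", relevance: 0.6 }] },
  cache:    ->(q) { [{ title: "cache hit for #{q}", relevance: 0.9 }] }
}

query = "ruby concurrency"

# Scatter: one thread per backend
threads = backends.map do |name, search|
  Thread.new { [name, search.call(query)] }
end

# Gather: wait for all threads, tag hits with their source, rank
hits = threads.map(&:value).flat_map do |name, results|
  results.map { |hit| hit.merge(source: name) }
end
pp hits.sort_by { |hit| -hit[:relevance] }
----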
 
-
-* *Parallel Processing*: Each subtask is processed concurrently by different workers
-* *Gather*: Results from all subtasks are collected and aggregated
-* *Final Processing*: The aggregated results are combined to form the final output
+== Focus
 
-
+This example demonstrates:
 
-
+* **Parallel query distribution** to multiple data sources
+* **Heterogeneous source handling** (database, API, cache, filesystem)
+* **Concurrent execution** with independent source timing
+* **Result aggregation** with source-weighted ranking
+* **Source-specific optimization** strategies
+* **Unified result merging** from diverse formats
 
-
-2. Processing each chunk in parallel using Fractor workers
-3. Collecting the processed chunks (gather)
-4. Combining the results for the final output
+== Architecture
 
-
+=== Scatter-Gather Flow Overview
 
-
-
-
-
+[source]
+----
+┌────────────────────────────────────────────────────────┐
+│              User Query: "ruby concurrency"            │
+└────────────────────────────────────────────────────────┘
+                           │
+                           │ Scatter phase
+                           │ Create work items for all sources
+                           ▼
+         ┌───────────────────────────────────┐
+         │   MultiSourceSearch Controller    │
+         │    Creates 4 SearchWork items     │
+         └───────────────────────────────────┘
+                           │
+       ┌───────────────────┼───────────────────┬───────────┐
+       │                   │                   │           │
+       ▼                   ▼                   ▼           ▼
+ ┌──────────┐        ┌──────────┐        ┌──────────┐ ┌──────────┐
+ │ Worker 1 │        │ Worker 2 │        │ Worker 3 │ │ Worker 4 │
+ │ Database │        │   API    │        │  Cache   │ │Filesystem│
+ │  Query   │        │  Query   │        │  Lookup  │ │  Search  │
+ └──────────┘        └──────────┘        └──────────┘ └──────────┘
+       │                   │                   │           │
+       │ ~150ms            │ ~250ms            │ ~20ms     │ ~120ms
+       │                   │                   │           │
+       ▼                   ▼                   ▼           ▼
+ ┌──────────┐        ┌──────────┐        ┌──────────┐ ┌──────────┐
+ │  7 hits  │        │  5 hits  │        │  3 hits  │ │  9 hits  │
+ │DB Results│        │ API Res. │        │Cache Res.│ │File Res. │
+ └──────────┘        └──────────┘        └──────────┘ └──────────┘
+       │                   │                   │           │
+       └───────────────────┴───────────────────┴───────────┘
+                           │
+                           │ Gather phase
+                           │ Collect all results
+                           ▼
+         ┌───────────────────────────────────┐
+         │        Result Aggregation         │
+         │  - Group by source                │
+         │  - Apply source weights           │
+         │  - Rank by weighted relevance     │
+         └───────────────────────────────────┘
+                           │
+                           ▼
+         ┌───────────────────────────────────┐
+         │   Unified Result Set (24 hits)    │
+         │   Sorted by weighted relevance    │
+         └───────────────────────────────────┘
+----
+
+=== Parallel Execution Timeline
+
+[source]
+----
+Time →
+┌──────────────────────────────────────────────────────┐
+│ Cache     : ████                                     │ ~20ms  (fastest)
+│ Filesystem: ████████████                             │ ~120ms
+│ Database  : ███████████████                          │ ~150ms
+│ API       : █████████████████████████                │ ~250ms (slowest)
+└──────────────────────────────────────────────────────┘
+                 ▲                    ▲
+                 │                    │
+              Fastest              Slowest
+             completes            completes
+
+Total Time: ~250ms (limited by slowest source)
+
+Sequential execution would take: 20 + 120 + 150 + 250 = 540ms
+Parallel speedup: 540ms / 250ms = 2.16x faster
+----
+
+=== Source Weighting and Ranking
+
+[source]
+----
+┌─────────────────────────────────────────────────────┐
+│              Raw Results from Sources               │
+├─────────────────────────────────────────────────────┤
+│ Cache:      [0.9, 0.7, 0.6]        weight: 1.2      │
+│ Database:   [0.8, 0.7, 0.5, ...]   weight: 1.0      │
+│ API:        [0.9, 0.6, 0.4, ...]   weight: 0.8      │
+│ Filesystem: [0.8, 0.7, 0.6, ...]   weight: 0.9      │
+└─────────────────────────────────────────────────────┘
+                          │
+                          │ Apply weights
+                          ▼
+┌─────────────────────────────────────────────────────┐
+│             Weighted Relevance Scores               │
+├─────────────────────────────────────────────────────┤
+│ Cache[0]:      0.9 × 1.2 = 1.08  (highest)          │
+│ Cache[1]:      0.7 × 1.2 = 0.84                     │
+│ API[0]:        0.9 × 0.8 = 0.72                     │
+│ Database[0]:   0.8 × 1.0 = 0.80                     │
+│ Filesystem[0]: 0.8 × 0.9 = 0.72                     │
+│ ...                                                 │
+└─────────────────────────────────────────────────────┘
+                          │
+                          │ Sort descending
+                          ▼
+┌─────────────────────────────────────────────────────┐
+│               Final Ranked Results                  │
+├─────────────────────────────────────────────────────┤
+│ 1. Cache[0]:      1.08                              │
+│ 2. Cache[1]:      0.84                              │
+│ 3. Database[0]:   0.80                              │
+│ 4. API[0]:        0.72                              │
+│ 5. Filesystem[0]: 0.72                              │
+│ ...                                                 │
+└─────────────────────────────────────────────────────┘
+----
+
+== Key Components
+
+=== SearchWork: Source-Specific Work Unit
+
+The `SearchWork` class carries query and source information:
+
+[source,ruby]
+----
+class SearchWork < Fractor::Work
+  def initialize(query, source = :default, query_params = {})
+    super({
+      query: query,               # <1>
+      source: source,             # <2>
+      query_params: query_params  # <3>
+    })
+  end
+
+  def source
+    input[:source]
+  end
+end
+----
+<1> The search query string
+<2> Target data source (`:database`, `:api`, `:cache`, `:filesystem`)
+<3> Source-specific query parameters
+
+Purpose:
+
+* **Source routing**: Direct work to appropriate handler
+* **Parameter customization**: Each source has specific options
+* **Parallel execution**: All sources queried simultaneously
+
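As a quick usage sketch (an illustrative aside assuming the `SearchWork` class above; the `max_results` parameter is hypothetical), the scatter phase builds one work item per backend for the same query:

[source,ruby]
----
# Build one SearchWork per backend for the same query (scatter phase).
query = "ruby concurrency"

work_items = %i[database api cache filesystem].map do |source|
  SearchWork.new(query, source, { max_results: 10 })
end

work_items.each { |work| puts "queued #{work.source} search" }
----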
+=== SearchWorker: Polymorphic Source Handler
+
+The `SearchWorker` routes to source-specific search logic:
+
+[source,ruby]
+----
+class SearchWorker < Fractor::Worker
+  def process(work)
+    setup_source(work.source) # <1>
+
+    result = case work.source # <2>
+             when :database then search_database(work)
+             when :api then search_api(work)
+             when :cache then search_cache(work)
+             when :filesystem then search_filesystem(work)
+             else
+               return Fractor::WorkResult.new(
+                 error: ArgumentError.new("Unknown source: #{work.source}"),
+                 work: work
+               )
+             end
+
+    Fractor::WorkResult.new(
+      result: {
+        source: work.source,         # <3>
+        query: work.query,
+        hits: result[:hits],         # <4>
+        metadata: result[:metadata], # <5>
+        timing: result[:timing]      # <6>
+      },
+      work: work
+    )
+  end
+
+  private
+
+  def search_database(work)
+    sleep(rand(0.05..0.2)) # Simulate query time
+
+    record_count = rand(3..10)
+    hits = Array.new(record_count) do |i|
+      {
+        id: "db-#{i + 1}",
+        title: "Database Result #{i + 1} for '#{work.query}'",
+        content: "This is database content for #{work.query}",
+        relevance: rand(0.1..1.0).round(2) # <7>
+      }
+    end
+
+    {
+      hits: hits,
+      metadata: {
+        source_type: "PostgreSQL Database",
+        total_available: record_count + rand(10..50)
+      },
+      timing: rand(0.01..0.3).round(3)
+    }
+  end
+
+  def search_cache(work)
+    sleep(rand(0.01..0.1)) # Fast cache lookup
+
+    cache_hit = [true, true, false].sample # <8>
+
+    if cache_hit
+      # Return cached results
+      { hits: [...], metadata: { cache_hit: true } }
+    else
+      # Cache miss
+      { hits: [], metadata: { cache_hit: false } }
+    end
+  end
+end
+----
+<1> Initialize connection to data source
+<2> Route to appropriate search method
+<3> Preserve source identifier for merging
+<4> Search results with relevance scores
+<5> Source-specific metadata
+<6> Execution timing for performance analysis
+<7> Intrinsic relevance score (0.0-1.0)
+<8> Simulate cache hit/miss scenario
+
+Design benefits:
+
+* **Unified interface**: All sources handled by one worker type
+* **Source isolation**: Each source has independent logic
+* **Timing capture**: Enables performance profiling
+* **Flexible results**: Source-specific metadata preserved
+
+=== MultiSourceSearch: Scatter-Gather Orchestrator
+
+The `MultiSourceSearch` coordinates the entire process:
+
+[source,ruby]
+----
+class MultiSourceSearch
+  def search(query, sources = nil)
+    sources ||= [ # <1>
+      { source: :database, params: { max_results: 50 } },
+      { source: :api, params: { format: "json" } },
+      { source: :cache, params: { max_age: 3600 } },
+      { source: :filesystem, params: { extensions: %w[txt md] } }
+    ]
+
+    start_time = Time.now
+
+    # Scatter: Create work items
+    search_work_items = sources.map do |source|
+      SearchWork.new(query, source[:source], source[:params]) # <2>
+    end
+
+    @supervisor.add_work_items(search_work_items)
+    @supervisor.run # <3>
+
+    end_time = Time.now
+    total_time = end_time - start_time
+
+    # Gather: Merge results
+    @merged_results = merge_results(@supervisor.results, total_time) # <4>
+  end
+
+  private
+
+  def merge_results(results_aggregator, total_time)
+    results_by_source = {}
+    total_hits = 0
+
+    # Group by source
+    results_aggregator.results.each do |result|
+      source = result.result[:source]
+      results_by_source[source] = result.result # <5>
+      total_hits += result.result[:hits].size
+    end
+
+    # Apply source weights
+    all_hits = []
+    results_by_source.each do |source, result|
+      source_weight = case source # <6>
+                      when :database then 1.0
+                      when :api then 0.8
+                      when :cache then 1.2 # Prioritize cache
+                      when :filesystem then 0.9
+                      end
+
+      result[:hits].each do |hit|
+        all_hits << {
+          id: hit[:id],
+          title: hit[:title],
+          source: source,
+          original_relevance: hit[:relevance],
+          weighted_relevance: hit[:relevance] * source_weight # <7>
+        }
+      end
+    end
 
-
+    # Rank by weighted relevance
+    ranked_hits = all_hits.sort_by { |hit| -hit[:weighted_relevance] } # <8>
 
-
+    {
+      query: query,
+      total_hits: total_hits,
+      execution_time: total_time,
+      sources: results_by_source.keys,
+      ranked_results: ranked_hits, # <9>
+      source_details: results_by_source
+    }
+  end
+end
 ----
-
+<1> Define all data sources to query
+<2> Create parallel work items (scatter)
+<3> Execute all queries concurrently
+<4> Aggregate and rank results (gather)
+<5> Group results by originating source
+<6> Define source-specific trust weights
+<7> Calculate weighted relevance score
+<8> Sort by weighted relevance (descending)
+<9> Return unified, ranked result set
+
+Orchestration features:
+
+* **Parallel dispatch**: All sources queried at once
+* **Wait-for-all**: Collects all results before merging
+* **Source weighting**: Prioritizes trusted sources
+* **Unified ranking**: Single sorted result list
+
+== Usage
+
+.Basic usage
+[example]
+====
+[source,bash]
+----
+# Run with default query
+ruby scatter_gather.rb
+
+# Search with custom query
+ruby scatter_gather.rb "ruby concurrency patterns"
+
+# Use more workers
+ruby scatter_gather.rb "database optimization" 8
+----
+====
+
+.Programmatic usage
+[example]
+====
+[source,ruby]
+----
+require_relative "scatter_gather"
+
+# Create multi-source search
+search = ScatterGather::MultiSourceSearch.new(4)
+
+# Execute parallel search
+results = search.search("machine learning")
+
+# Access results
+puts "Total hits: #{results[:total_hits]}"
+puts "Execution time: #{results[:execution_time]}s"
+
+# Display top results
+results[:ranked_results].take(10).each do |hit|
+  puts "#{hit[:title]} (#{hit[:source]}, score: #{hit[:weighted_relevance]})"
+end
 ----
+====
 
 == Expected Output
 
-
-
-
-
-
+[source,text]
+----
+Starting Scatter-Gather Search Example
+======================================
+This example demonstrates searching multiple data sources concurrently:
+1. Database - Simulates SQL database searches
+2. API - Simulates external REST API calls
+3. Cache - Simulates in-memory cache lookups
+4. Filesystem - Simulates searching through files
+
+Search Results Summary:
+----------------------
+Query: ruby concurrency patterns
+Total hits: 24
+Total execution time: 0.253 seconds
+Sources searched: database, api, cache, filesystem
+
+Top 5 Results (by relevance):
+1. Cached Result 1 for 'ruby concurrency patterns' (Source: cache, Relevance: 1.08)
+   This is cached content for ruby concurrency patterns...
+
+2. Cached Result 2 for 'ruby concurrency patterns' (Source: cache, Relevance: 0.96)
+   This is cached content for ruby concurrency patterns...
+
+3. Database Result 1 for 'ruby concurrency patterns' (Source: database, Relevance: 0.85)
+   This is database content for ruby concurrency patterns...
+
+4. File Result 1 for 'ruby concurrency patterns' (Source: filesystem, Relevance: 0.81)
+   This is file content matching ruby concurrency patterns...
+
+5. API Result 1 for 'ruby concurrency patterns' (Source: api, Relevance: 0.72)
+   This is API content for ruby concurrency patterns...
+
+Source Details:
+- Database (7 results, 0.152 sec)
+  Metadata: {:source_type=>"PostgreSQL Database", :total_available=>53}
+- Api (5 results, 0.245 sec)
+  Metadata: {:source_type=>"External REST API", :provider=>"Google"}
+- Cache (3 results, 0.018 sec)
+  Metadata: {:source_type=>"In-memory Cache", :cache_hit=>true}
+- Filesystem (9 results, 0.128 sec)
+  Metadata: {:source_type=>"File System", :files_scanned=>342}
+----
+
+== Learning Points
+
+=== 1. Scatter-Gather Pattern
+
+The pattern has two distinct phases:
+
+**Scatter phase**:
+[source,ruby]
+----
+# Distribute work to all sources
+sources.each do |source|
+  supervisor.add_work_item(SearchWork.new(query, source))
+end
+supervisor.run # All execute in parallel
+----
+
+**Gather phase**:
+[source,ruby]
+----
+# Collect and merge results
+all_results = supervisor.results.results
+merged = aggregate_results(all_results)
+----
+
+**Key characteristics**:
+
+* **Fork-join parallelism**: All work starts together, results combined at end
+* **Independent execution**: Sources don't communicate with each other
+* **Synchronization point**: Gather waits for all sources to complete
+* **Result aggregation**: Combine heterogeneous formats into unified view
+
+=== 2. Source-Weighted Ranking
+
+Different sources have different trust levels:
+
+[source,ruby]
+----
+source_weights = {
+  cache: 1.2,      # Most trusted (already validated)
+  database: 1.0,   # Baseline trust
+  filesystem: 0.9, # Slightly lower trust
+  api: 0.8         # External, less trusted
+}
+
+weighted_score = intrinsic_relevance × source_weight
+----
+
+**Rationale**:
+
+* **Cache**: Previously validated results, highest trust
+* **Database**: Internal, controlled data, baseline
+* **API**: External data, may be stale or inaccurate
+* **Filesystem**: Unstructured, harder to validate
+
+=== 3. Performance Analysis
+
+**Total time = max(source_times)**:
+
+[source]
+----
+Source times: [150ms, 250ms, 20ms, 120ms]
+Total time: 250ms (limited by slowest)
+
+Speedup = Σ(source_times) / max(source_times)
+        = (150 + 250 + 20 + 120) / 250
+        = 540 / 250
+        = 2.16x
+
+Parallel efficiency = Speedup / num_sources
+                    = 2.16 / 4
+                    = 54%
+----
+
+**Efficiency factors**:
+
+* **Load imbalance**: Slow sources dominate total time
+* **Overhead**: Ractor creation, synchronization
+* **I/O bound**: Network/disk latency, not CPU
+
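The arithmetic above can be checked with a few lines of Ruby (an illustrative aside, not part of the packaged example):

[source,ruby]
----
# Timings (ms) from the diagram above; speedup and efficiency as computed.
times = [150, 250, 20, 120]

speedup    = times.sum.to_f / times.max # => 2.16
efficiency = speedup / times.size       # => 0.54, i.e. 54%

puts format("speedup: %.2fx, efficiency: %.0f%%", speedup, efficiency * 100)
----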
+=== 4. Cache Miss Handling
+
+The cache may not have results:
+
+[source,ruby]
+----
+def search_cache(work)
+  if cache_hit?
+    return cached_results
+  else
+    return { hits: [], metadata: { cache_hit: false } }
+  end
+end
+
+# In merge_results
+if source_result[:hits].empty?
+  # Don't penalize total score for cache miss
+  # Other sources provide results
+end
+----
+
+**Strategy**:
+
+* Cache misses return empty results, not errors
+* Merge phase handles varying result counts
+* Total result count not affected by misses
+
+=== 5. Heterogeneous Result Formats
+
+Different sources return different structures:
+
+[source,ruby]
+----
+# Database results
+{
+  id: "db-123",
+  title: "...",
+  content: "...",
+  relevance: 0.85
+}
+
+# API results
+{
+  id: "api-456",
+  title: "...",
+  content: "...",
+  relevance: 0.72,
+  provider: "Google" # Extra field
+}
+
+# Filesystem results
+{
+  id: "file-789",
+  title: "...",
+  path: "/path/to/file", # Different structure
+  content: "...",
+  relevance: 0.91
+}
+----
+
+**Normalization**:
+
+* Extract common fields (id, title, content, relevance)
+* Preserve source-specific metadata separately
+* Unified ranking uses normalized fields
+
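A normalizer along those lines could look like the following sketch (a hypothetical helper, assuming hash-shaped hits as shown above; it is not part of the packaged example):

[source,ruby]
----
# Map heterogeneous hits onto the common ranking fields; keep everything
# else as per-source metadata. Hypothetical helper for illustration.
COMMON_FIELDS = %i[id title content relevance].freeze

def normalize_hit(hit, source)
  {
    **hit.slice(*COMMON_FIELDS),
    source: source,
    extra: hit.reject { |key, _| COMMON_FIELDS.include?(key) }
  }
end

normalize_hit(
  { id: "file-789", title: "Notes", path: "/path/to/file",
    content: "...", relevance: 0.91 },
  :filesystem
)
# => includes source: :filesystem and extra: { path: "/path/to/file" }
----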
+=== 6. Error Handling
+
+Individual source failures don't stop other sources:
+
+[source,ruby]
+----
+def process(work)
+  begin
+    result = search_source(work.source, work.query)
+    Fractor::WorkResult.new(result: result, work: work)
+  rescue StandardError => e
+    # Return error for this source, others continue
+    Fractor::WorkResult.new(
+      error: "#{work.source} failed: #{e.message}",
+      work: work
+    )
+  end
+end
+
+# In merge_results
+results.select { |r| r.success? }.each do |result|
+  # Only process successful results
+  merge_into_final_set(result)
+end
+----
+
+== Use Cases and Patterns
+
+=== Federated Search
+
+Search across multiple databases:
+
+[source,ruby]
+----
+sources = [
+  { source: :postgres, params: { schema: "public" } },
+  { source: :elasticsearch, params: { index: "documents" } },
+  { source: :redis, params: { pattern: "*" } },
+  { source: :mongodb, params: { collection: "items" } }
+]
+
+search.search("user query", sources)
+----
+
+=== Multi-Cloud Query
+
+Query services across cloud providers:
+
+[source,ruby]
+----
+sources = [
+  { source: :aws_s3, params: { bucket: "data" } },
+  { source: :gcp_storage, params: { bucket: "archive" } },
+  { source: :azure_blob, params: { container: "files" } }
+]
+
+search.search("document.pdf", sources)
+----
+
+=== Aggregated Pricing
+
+Compare prices from multiple vendors:
+
+[source,ruby]
+----
+def search_vendor(work)
+  prices = fetch_prices(work.query, work.source)
+
+  {
+    hits: prices.map { |p| { price: p, vendor: work.source } },
+    metadata: { currency: "USD", last_updated: Time.now }
+  }
+end
+
+# Merge sorts by price instead of relevance
+def merge_results(results)
+  all_prices = results.flat_map { |r| r[:hits] }
+  all_prices.sort_by { |p| p[:price] } # Lowest first
+end
+----
+
+=== Monitoring Dashboard
+
+Query multiple monitoring sources:
+
+[source,ruby]
+----
+sources = [
+  { source: :prometheus, params: { metric: "cpu_usage" } },
+  { source: :cloudwatch, params: { namespace: "AWS/EC2" } },
+  { source: :datadog, params: { query: "avg:system.cpu.usage" } },
+  { source: :newrelic, params: { metric: "CPU/User Time" } }
+]
+
+# Aggregate metrics
+metrics = search.search("cpu_usage", sources)
+average_cpu = metrics[:ranked_results].map { |m| m[:value] }.sum / metrics[:total_hits]
+----
+
+== Advanced Patterns
+
+=== Timeout Handling
+
+Set per-source timeouts:
+
+[source,ruby]
+----
+def process(work)
+  Timeout.timeout(work.query_params[:timeout] || 5) do
+    search_source(work.source, work.query)
+  end
+rescue Timeout::Error
+  Fractor::WorkResult.new(
+    result: { hits: [], metadata: { timeout: true } },
+    work: work
+  )
+end
+----
+
+=== Fallback Sources
+
+Use backup sources if primary fails:
+
+[source,ruby]
+----
+def search(query)
+  primary_sources = [:cache, :database]
+  fallback_sources = [:api, :filesystem]
+
+  # Try primary sources first
+  results = scatter_gather(query, primary_sources)
+
+  # If insufficient results, try fallbacks
+  if results[:total_hits] < MIN_RESULTS
+    fallback_results = scatter_gather(query, fallback_sources)
+    results = merge(results, fallback_results)
+  end
+
+  results
+end
+----
+
+=== Progressive Results
+
+Return fast results immediately, slower later:
+
+[source,ruby]
+----
+def search_progressive(query)
+  fast_sources = [:cache]
+  slow_sources = [:database, :api, :filesystem]
+
+  # Return cache results immediately
+  fast_results = scatter_gather(query, fast_sources)
+  yield fast_results if block_given?
+
+  # Add slow results as they arrive
+  slow_results = scatter_gather(query, slow_sources)
+  yield merge(fast_results, slow_results) if block_given?
+end
+----
+
+=== Result Deduplication
+
+Remove duplicate results across sources:
+
+[source,ruby]
+----
+def merge_results(results)
+  all_hits = results.flat_map { |r| r[:hits] }
+
+  # Deduplicate by content similarity
+  unique_hits = []
+  all_hits.each do |hit|
+    unless unique_hits.any? { |h| similar?(h, hit) }
+      unique_hits << hit
+    end
+  end
+
+  unique_hits.sort_by { |h| -h[:weighted_relevance] }
+end
+
+def similar?(hit1, hit2)
+  # Simple deduplication by title similarity
+  hit1[:title].downcase == hit2[:title].downcase
+end
+----
+
+== Performance Tuning
+
+=== Worker Pool Sizing
+
+Match workers to data sources:
+
+[source,ruby]
+----
+# Option 1: One worker per source
+worker_count = sources.size
+
+# Option 2: More workers than sources (for queueing)
+worker_count = sources.size * 2
+
+# Option 3: Match to available cores
+worker_count = [sources.size, Etc.nprocessors].min
+----
+
+=== Source Prioritization
+
+Query fast sources first:
+
+[source,ruby]
+----
+sources_by_speed = [
+  { source: :cache, expected_time: 0.02 },
+  { source: :database, expected_time: 0.15 },
+  { source: :filesystem, expected_time: 0.12 },
+  { source: :api, expected_time: 0.25 }
+].sort_by { |s| s[:expected_time] }
+
+# Start fast sources first for early results
+sources_by_speed.each do |source|
+  supervisor.add_work_item(SearchWork.new(query, source[:source]))
+end
+----
+
+=== Connection Pooling
+
+Reuse connections across searches:
+
+[source,ruby]
+----
+class SearchWorker < Fractor::Worker
+  def initialize
+    super
+    @connections = {
+      database: connect_to_database,
+      api: initialize_api_client
+    }
+  end
+
+  def process(work)
+    conn = @connections[work.source]
+    search_with_connection(conn, work.query)
+  end
+end
+----
+
+== Next Steps
+
+After understanding scatter-gather, explore:
+
+* **link:../producer_subscriber/README.adoc[Producer-Subscriber]**: Hierarchical work decomposition
+* **link:../pipeline_processing/README.adoc[Pipeline Processing]**: Sequential transformations
+* **link:../hierarchical_hasher/README.adoc[Hierarchical Hasher]**: Map-reduce patterns
+* **link:../workflow/README.adoc[Workflow System]**: Complex orchestration with dependencies