simple_flow 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.envrc +1 -0
- data/.github/workflows/deploy-github-pages.yml +52 -0
- data/.rubocop.yml +57 -0
- data/CHANGELOG.md +4 -0
- data/COMMITS.md +196 -0
- data/LICENSE +21 -0
- data/README.md +481 -0
- data/Rakefile +15 -0
- data/benchmarks/parallel_vs_sequential.rb +98 -0
- data/benchmarks/pipeline_overhead.rb +130 -0
- data/docs/api/middleware.md +468 -0
- data/docs/api/parallel-step.md +363 -0
- data/docs/api/pipeline.md +382 -0
- data/docs/api/result.md +375 -0
- data/docs/concurrent/best-practices.md +687 -0
- data/docs/concurrent/introduction.md +246 -0
- data/docs/concurrent/parallel-steps.md +418 -0
- data/docs/concurrent/performance.md +481 -0
- data/docs/core-concepts/flow-control.md +452 -0
- data/docs/core-concepts/middleware.md +389 -0
- data/docs/core-concepts/overview.md +219 -0
- data/docs/core-concepts/pipeline.md +315 -0
- data/docs/core-concepts/result.md +168 -0
- data/docs/core-concepts/steps.md +391 -0
- data/docs/development/benchmarking.md +443 -0
- data/docs/development/contributing.md +380 -0
- data/docs/development/dagwood-concepts.md +435 -0
- data/docs/development/testing.md +514 -0
- data/docs/getting-started/examples.md +197 -0
- data/docs/getting-started/installation.md +62 -0
- data/docs/getting-started/quick-start.md +218 -0
- data/docs/guides/choosing-concurrency-model.md +441 -0
- data/docs/guides/complex-workflows.md +440 -0
- data/docs/guides/data-fetching.md +478 -0
- data/docs/guides/error-handling.md +635 -0
- data/docs/guides/file-processing.md +505 -0
- data/docs/guides/validation-patterns.md +496 -0
- data/docs/index.md +169 -0
- data/examples/.gitignore +3 -0
- data/examples/01_basic_pipeline.rb +112 -0
- data/examples/02_error_handling.rb +178 -0
- data/examples/03_middleware.rb +186 -0
- data/examples/04_parallel_automatic.rb +221 -0
- data/examples/05_parallel_explicit.rb +279 -0
- data/examples/06_real_world_ecommerce.rb +288 -0
- data/examples/07_real_world_etl.rb +277 -0
- data/examples/08_graph_visualization.rb +246 -0
- data/examples/09_pipeline_visualization.rb +266 -0
- data/examples/10_concurrency_control.rb +235 -0
- data/examples/11_sequential_dependencies.rb +243 -0
- data/examples/12_none_constant.rb +161 -0
- data/examples/README.md +374 -0
- data/examples/regression_test/01_basic_pipeline.txt +38 -0
- data/examples/regression_test/02_error_handling.txt +92 -0
- data/examples/regression_test/03_middleware.txt +61 -0
- data/examples/regression_test/04_parallel_automatic.txt +86 -0
- data/examples/regression_test/05_parallel_explicit.txt +80 -0
- data/examples/regression_test/06_real_world_ecommerce.txt +53 -0
- data/examples/regression_test/07_real_world_etl.txt +58 -0
- data/examples/regression_test/08_graph_visualization.txt +429 -0
- data/examples/regression_test/09_pipeline_visualization.txt +305 -0
- data/examples/regression_test/10_concurrency_control.txt +96 -0
- data/examples/regression_test/11_sequential_dependencies.txt +86 -0
- data/examples/regression_test/12_none_constant.txt +64 -0
- data/examples/regression_test.rb +105 -0
- data/lib/simple_flow/dependency_graph.rb +120 -0
- data/lib/simple_flow/dependency_graph_visualizer.rb +326 -0
- data/lib/simple_flow/middleware.rb +36 -0
- data/lib/simple_flow/parallel_executor.rb +80 -0
- data/lib/simple_flow/pipeline.rb +405 -0
- data/lib/simple_flow/result.rb +88 -0
- data/lib/simple_flow/step_tracker.rb +58 -0
- data/lib/simple_flow/version.rb +5 -0
- data/lib/simple_flow.rb +41 -0
- data/mkdocs.yml +146 -0
- data/pipeline_graph.dot +51 -0
- data/pipeline_graph.html +60 -0
- data/pipeline_graph.mmd +19 -0
- metadata +127 -0
@@ -0,0 +1,478 @@

# Data Fetching Guide

This guide demonstrates how to fetch data from various sources using SimpleFlow, including APIs, databases, file systems, and external services.

## API Data Fetching

### Basic API Call

```ruby
step :fetch_from_api, ->(result) {
  begin
    response = HTTP.get("https://api.example.com/users/#{result.value}")
    data = JSON.parse(response.body)
    result.with_context(:user_data, data).continue(result.value)
  rescue HTTP::Error => e
    result.halt.with_error(:api, "API request failed: #{e.message}")
  rescue JSON::ParserError => e
    result.halt.with_error(:parse, "Invalid JSON: #{e.message}")
  end
}
```
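
For orientation, here is a minimal end-to-end sketch of wiring this step into a pipeline and running it. It assumes the `http` gem; the URL and the user id `42` are placeholders, as in the snippet above:

```ruby
require 'http'
require 'json'

pipeline = SimpleFlow::Pipeline.new do
  step :fetch_from_api, ->(result) {
    begin
      response = HTTP.get("https://api.example.com/users/#{result.value}")
      data = JSON.parse(response.body)
      result.with_context(:user_data, data).continue(result.value)
    rescue HTTP::Error, JSON::ParserError => e
      result.halt.with_error(:api, e.message)
    end
  }
end

# On success, the parsed payload is available in the step context.
result = pipeline.call(SimpleFlow::Result.new(42))
puts result.context[:user_data]
```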

### Parallel API Calls

```ruby
pipeline = SimpleFlow::Pipeline.new do
  step :fetch_weather, ->(result) {
    location = result.value[:location]
    weather = HTTP.get("https://api.weather.com/current?location=#{location}").parse
    result.with_context(:weather, weather).continue(result.value)
  }, depends_on: []

  step :fetch_news, ->(result) {
    topic = result.value[:topic]
    news = HTTP.get("https://api.news.com/articles?topic=#{topic}").parse
    result.with_context(:news, news).continue(result.value)
  }, depends_on: []

  step :fetch_stocks, ->(result) {
    symbols = result.value[:symbols]
    stocks = HTTP.get("https://api.stocks.com/quotes?symbols=#{symbols}").parse
    result.with_context(:stocks, stocks).continue(result.value)
  }, depends_on: []

  step :combine_results, ->(result) {
    combined = {
      weather: result.context[:weather],
      news: result.context[:news],
      stocks: result.context[:stocks]
    }
    result.continue(combined)
  }, depends_on: [:fetch_weather, :fetch_news, :fetch_stocks]
end

# All API calls execute in parallel
result = pipeline.call_parallel(
  SimpleFlow::Result.new({ location: "NYC", topic: "tech", symbols: "AAPL,GOOGL" })
)
```
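
These snippets interpolate values straight into the query string for brevity; for real inputs, encode the parameters first. A sketch of the `fetch_weather` request body using the Ruby standard library:

```ruby
require 'uri'

# URI.encode_www_form escapes spaces, commas, and other reserved characters.
query = URI.encode_www_form(location: result.value[:location])
weather = HTTP.get("https://api.weather.com/current?#{query}").parse
```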

### API with Authentication

```ruby
class AuthenticatedAPI
  def initialize(api_key)
    @api_key = api_key
  end

  def call(result)
    endpoint = result.value[:endpoint]

    response = HTTP
      .auth("Bearer #{@api_key}")
      .get("https://api.example.com/#{endpoint}")

    if response.status.success?
      data = JSON.parse(response.body)
      result.with_context(:api_response, data).continue(result.value)
    else
      result.halt.with_error(:api, "Request failed with status #{response.status}")
    end
  rescue StandardError => e
    result.halt.with_error(:api, "API error: #{e.message}")
  end
end

pipeline = SimpleFlow::Pipeline.new do
  step :fetch_data, AuthenticatedAPI.new(ENV['API_KEY']), depends_on: []
end
```

### Rate-Limited API Calls

```ruby
class RateLimitedFetcher
  def initialize(max_requests_per_second: 10)
    @max_requests = max_requests_per_second
    @request_times = []
  end

  def call(result)
    wait_if_rate_limited

    begin
      @request_times << Time.now
      response = HTTP.get(result.value[:url])
      data = response.parse

      result.with_context(:data, data).continue(result.value)
    rescue HTTP::Error => e
      result.halt.with_error(:http, e.message)
    end
  end

  private

  def wait_if_rate_limited
    # Remove old requests outside the time window
    one_second_ago = Time.now - 1
    @request_times.reject! { |time| time < one_second_ago }

    # Wait if we've hit the limit
    if @request_times.size >= @max_requests
      sleep(0.1)
      wait_if_rate_limited
    end
  end
end
```
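
Like `AuthenticatedAPI`, the fetcher is simply an object that responds to `call`, so an instance can be registered directly as a step. A sketch (note that `@request_times` is unsynchronized instance state, so a single instance should not be shared across threads):

```ruby
pipeline = SimpleFlow::Pipeline.new do
  # One fetcher instance per pipeline; its request log persists across calls.
  step :fetch_page, RateLimitedFetcher.new(max_requests_per_second: 5), depends_on: []
end
```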

## Database Queries

### Basic Database Query

```ruby
step :fetch_users, ->(result) {
  users = DB[:users]
    .where(active: true)
    .where { created_at > Date.today - 30 }
    .all

  result.with_context(:users, users).continue(result.value)
}
```
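
The `DB` constant used throughout these examples is a Sequel database handle; a typical setup sketch (the connection URL here is an assumption):

```ruby
require 'sequel'

# Reads the URL from the environment, falling back to a local Postgres database.
DB = Sequel.connect(ENV.fetch('DATABASE_URL', 'postgres://localhost/mydb'))
```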

### Parallel Database Queries

```ruby
pipeline = SimpleFlow::Pipeline.new do
  step :fetch_users, ->(result) {
    users = DB[:users].where(active: true).all
    result.with_context(:users, users).continue(result.value)
  }, depends_on: []

  step :fetch_orders, ->(result) {
    orders = DB[:orders].where(status: 'completed').all
    result.with_context(:orders, orders).continue(result.value)
  }, depends_on: []

  step :fetch_products, ->(result) {
    products = DB[:products].where(in_stock: true).all
    result.with_context(:products, products).continue(result.value)
  }, depends_on: []

  step :aggregate, ->(result) {
    stats = {
      total_users: result.context[:users].size,
      total_orders: result.context[:orders].size,
      total_products: result.context[:products].size
    }
    result.continue(stats)
  }, depends_on: [:fetch_users, :fetch_orders, :fetch_products]
end

# Ensure your database connection pool supports concurrent queries
DB = Sequel.connect(
  'postgres://localhost/mydb',
  max_connections: 10 # Allow concurrent connections
)

result = pipeline.call_parallel(SimpleFlow::Result.new(nil))
```

### Complex Joins and Aggregations

```ruby
step :fetch_user_analytics, ->(result) {
  user_id = result.value

  analytics = DB[:users]
    .select(:users__id, :users__name)
    .select_append { count(:orders__id).as(:order_count) }
    .select_append { sum(:orders__total).as(:total_spent) }
    .left_join(:orders, user_id: :id)
    .where(users__id: user_id)
    .group(:users__id, :users__name)
    .first

  result.with_context(:analytics, analytics).continue(result.value)
}
```
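
The `:users__id` double-underscore style relies on Sequel 4's symbol splitting, which is disabled by default in Sequel 5. On recent Sequel versions the same query is written with explicit qualification; a sketch of the equivalent dataset:

```ruby
# Same query with Sequel 5 explicit qualification instead of :table__column symbols.
analytics = DB[:users]
  .select(Sequel[:users][:id], Sequel[:users][:name])
  .select_append { count(Sequel[:orders][:id]).as(:order_count) }
  .select_append { sum(Sequel[:orders][:total]).as(:total_spent) }
  .left_join(:orders, user_id: :id)
  .where(Sequel[:users][:id] => user_id)
  .group(Sequel[:users][:id], Sequel[:users][:name])
  .first
```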

## File System Operations

### Reading Files

```ruby
step :read_config, ->(result) {
  begin
    config_path = result.value[:config_path]
    content = File.read(config_path)
    config = JSON.parse(content)

    result.with_context(:config, config).continue(result.value)
  rescue Errno::ENOENT
    result.halt.with_error(:file, "Config file not found: #{config_path}")
  rescue JSON::ParserError => e
    result.halt.with_error(:parse, "Invalid JSON in config: #{e.message}")
  end
}
```

### Reading Multiple Files in Parallel

```ruby
require 'csv'
require 'json'
require 'yaml'

pipeline = SimpleFlow::Pipeline.new do
  step :read_users_csv, ->(result) {
    users = CSV.read('data/users.csv', headers: true).map(&:to_h)
    result.with_context(:users, users).continue(result.value)
  }, depends_on: []

  step :read_products_json, ->(result) {
    products = JSON.parse(File.read('data/products.json'))
    result.with_context(:products, products).continue(result.value)
  }, depends_on: []

  step :read_config_yaml, ->(result) {
    config = YAML.load_file('config/settings.yml')
    result.with_context(:config, config).continue(result.value)
  }, depends_on: []

  step :combine_data, ->(result) {
    combined = {
      users: result.context[:users],
      products: result.context[:products],
      config: result.context[:config]
    }
    result.continue(combined)
  }, depends_on: [:read_users_csv, :read_products_json, :read_config_yaml]
end
```

### Processing Large Files

```ruby
step :process_large_file, ->(result) {
  file_path = result.value
  processed_count = 0

  # File.foreach streams the file line by line, so even very large files
  # are handled in 1,000-line batches without loading everything into memory.
  File.foreach(file_path).each_slice(1000) do |batch|
    batch.each do |line|
      process_line(line) # application-specific handler (not defined here)
      processed_count += 1
    end
  end

  result.with_context(:lines_processed, processed_count).continue(result.value)
}
```

## Caching Strategies

### Simple Cache with Fallback

```ruby
step :fetch_with_cache, ->(result) {
  cache_key = "user_#{result.value}"

  # Try cache first
  cached = REDIS.get(cache_key)
  if cached
    data = JSON.parse(cached)
    return result.with_context(:source, :cache).continue(data)
  end

  # Cache miss - fetch from API
  begin
    response = HTTP.get("https://api.example.com/users/#{result.value}")
    data = response.parse

    # Store in cache for 1 hour
    REDIS.setex(cache_key, 3600, data.to_json)

    result.with_context(:source, :api).continue(data)
  rescue HTTP::Error => e
    result.halt.with_error(:fetch, "Failed to fetch data: #{e.message}")
  end
}
```

### Multi-Level Caching

```ruby
class MultiLevelCache
  def self.call(result)
    key = result.value[:cache_key]

    # Level 1: Memory cache
    if data = MEMORY_CACHE[key]
      return result.with_context(:cache_level, :memory).continue(data)
    end

    # Level 2: Redis cache
    if cached = REDIS.get(key)
      data = JSON.parse(cached)
      MEMORY_CACHE[key] = data
      return result.with_context(:cache_level, :redis).continue(data)
    end

    # Level 3: Database
    if record = DB[:cache].where(key: key).first
      data = JSON.parse(record[:value])
      REDIS.setex(key, 3600, data.to_json)
      MEMORY_CACHE[key] = data
      return result.with_context(:cache_level, :database).continue(data)
    end

    # No cache hit - need to fetch
    result.with_context(:cache_level, :none).continue(nil)
  end
end
```
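
A minimal wiring sketch for the class above, assuming the `redis` gem and a `cache` table. Since `MultiLevelCache` itself responds to `call`, it can be passed as a step directly; note that the plain Hash memory level is not synchronized, which matters if the step runs under `call_parallel`:

```ruby
require 'redis'
require 'json'

MEMORY_CACHE = {}   # process-local level; not thread-safe
REDIS = Redis.new   # assumes Redis on localhost:6379

pipeline = SimpleFlow::Pipeline.new do
  step :check_cache, MultiLevelCache, depends_on: []
end

result = pipeline.call(SimpleFlow::Result.new({ cache_key: "user_42" }))
puts result.context[:cache_level]
```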

## Batch Processing

### Fetching Data in Batches

```ruby
pipeline = SimpleFlow::Pipeline.new do
  step :fetch_batch, ->(result) {
    batch_ids = result.value
    records = DB[:records].where(id: batch_ids).all

    result.with_context(:records, records).continue(result.value)
  }

  step :process_records, ->(result) {
    records = result.context[:records]
    processed = records.map { |r| transform_record(r) } # application-specific transform

    result.continue(processed)
  }
end

# Run the pipeline once per batch of 100 ids
all_ids = (1..10000).to_a
all_ids.each_slice(100) do |batch|
  result = pipeline.call(SimpleFlow::Result.new(batch))
  save_processed_batch(result.value) # application-specific persistence
end
```

## Real-World ETL Example

```ruby
require 'csv'
require 'json'
require 'date'

class ETLPipeline
  def self.build
    SimpleFlow::Pipeline.new do
      # Extract phase - parallel data loading
      step :extract_users, ->(result) {
        users = CSV.read('data/users.csv', headers: true).map(&:to_h)
        result.with_context(:raw_users, users).continue(result.value)
      }, depends_on: []

      step :extract_orders, ->(result) {
        orders = JSON.parse(File.read('data/orders.json'))
        result.with_context(:raw_orders, orders).continue(result.value)
      }, depends_on: []

      step :extract_products, ->(result) {
        products = DB[:products].all
        result.with_context(:raw_products, products).continue(result.value)
      }, depends_on: []

      # Transform phase - parallel transformations
      step :transform_users, ->(result) {
        users = result.context[:raw_users].map do |user|
          {
            id: user['id'].to_i,
            name: user['name'].strip.downcase,
            email: user['email'].downcase,
            created_at: Date.parse(user['signup_date'])
          }
        end
        result.with_context(:users, users).continue(result.value)
      }, depends_on: [:extract_users]

      step :transform_orders, ->(result) {
        orders = result.context[:raw_orders]
          .reject { |o| o['status'] == 'cancelled' }
          .map do |order|
            {
              id: order['order_id'],
              user_id: order['user_id'],
              total: order['amount'].to_f,
              items: order['items'].size
            }
          end
        result.with_context(:orders, orders).continue(result.value)
      }, depends_on: [:extract_orders]

      # Load phase - aggregate and save
      step :aggregate_stats, ->(result) {
        users = result.context[:users]
        orders = result.context[:orders]

        stats = users.map do |user|
          user_orders = orders.select { |o| o[:user_id] == user[:id] }
          {
            user_id: user[:id],
            total_orders: user_orders.size,
            total_spent: user_orders.sum { |o| o[:total] },
            avg_order: user_orders.empty? ? 0 : user_orders.sum { |o| o[:total] } / user_orders.size
          }
        end

        result.continue(stats)
      }, depends_on: [:transform_users, :transform_orders]

      step :save_results, ->(result) {
        DB[:user_stats].multi_insert(result.value)
        result.continue("Saved #{result.value.size} records")
      }, depends_on: [:aggregate_stats]
    end
  end
end

# Execute ETL pipeline
result = ETLPipeline.build.call_parallel(SimpleFlow::Result.new(nil))
puts result.value # e.g. "Saved 150 records"
```

## Error Handling for Data Fetching

```ruby
step :fetch_with_retries, ->(result) {
  max_retries = 3
  attempt = 0

  begin
    attempt += 1
    response = HTTP.timeout(10).get(result.value[:url])
    data = response.parse

    result
      .with_context(:attempts, attempt)
      .with_context(:data, data)
      .continue(result.value)
  rescue HTTP::TimeoutError
    if attempt < max_retries
      sleep(attempt ** 2) # Quadratic backoff: waits 1s, then 4s
      retry
    else
      result.halt.with_error(:timeout, "Request timed out after #{max_retries} attempts")
    end
  rescue HTTP::Error => e
    result.halt.with_error(:http, "HTTP error: #{e.message}")
  end
}
```
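
One refinement worth considering: when several parallel steps hit the same flaky endpoint, their retries can synchronize and arrive in bursts. Adding jitter to the backoff breaks that up; a one-line variant of the sleep above:

```ruby
sleep((attempt ** 2) + rand) # quadratic backoff plus up to 1s of random jitter
```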

## Related Documentation

- [Error Handling](error-handling.md) - Handling errors during data fetching
- [File Processing](file-processing.md) - Advanced file processing techniques
- [Complex Workflows](complex-workflows.md) - Building complete data pipelines
- [Performance Guide](../concurrent/performance.md) - Optimizing data fetching