simple_flow 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. checksums.yaml +7 -0
  2. data/.envrc +1 -0
  3. data/.github/workflows/deploy-github-pages.yml +52 -0
  4. data/.rubocop.yml +57 -0
  5. data/CHANGELOG.md +4 -0
  6. data/COMMITS.md +196 -0
  7. data/LICENSE +21 -0
  8. data/README.md +481 -0
  9. data/Rakefile +15 -0
  10. data/benchmarks/parallel_vs_sequential.rb +98 -0
  11. data/benchmarks/pipeline_overhead.rb +130 -0
  12. data/docs/api/middleware.md +468 -0
  13. data/docs/api/parallel-step.md +363 -0
  14. data/docs/api/pipeline.md +382 -0
  15. data/docs/api/result.md +375 -0
  16. data/docs/concurrent/best-practices.md +687 -0
  17. data/docs/concurrent/introduction.md +246 -0
  18. data/docs/concurrent/parallel-steps.md +418 -0
  19. data/docs/concurrent/performance.md +481 -0
  20. data/docs/core-concepts/flow-control.md +452 -0
  21. data/docs/core-concepts/middleware.md +389 -0
  22. data/docs/core-concepts/overview.md +219 -0
  23. data/docs/core-concepts/pipeline.md +315 -0
  24. data/docs/core-concepts/result.md +168 -0
  25. data/docs/core-concepts/steps.md +391 -0
  26. data/docs/development/benchmarking.md +443 -0
  27. data/docs/development/contributing.md +380 -0
  28. data/docs/development/dagwood-concepts.md +435 -0
  29. data/docs/development/testing.md +514 -0
  30. data/docs/getting-started/examples.md +197 -0
  31. data/docs/getting-started/installation.md +62 -0
  32. data/docs/getting-started/quick-start.md +218 -0
  33. data/docs/guides/choosing-concurrency-model.md +441 -0
  34. data/docs/guides/complex-workflows.md +440 -0
  35. data/docs/guides/data-fetching.md +478 -0
  36. data/docs/guides/error-handling.md +635 -0
  37. data/docs/guides/file-processing.md +505 -0
  38. data/docs/guides/validation-patterns.md +496 -0
  39. data/docs/index.md +169 -0
  40. data/examples/.gitignore +3 -0
  41. data/examples/01_basic_pipeline.rb +112 -0
  42. data/examples/02_error_handling.rb +178 -0
  43. data/examples/03_middleware.rb +186 -0
  44. data/examples/04_parallel_automatic.rb +221 -0
  45. data/examples/05_parallel_explicit.rb +279 -0
  46. data/examples/06_real_world_ecommerce.rb +288 -0
  47. data/examples/07_real_world_etl.rb +277 -0
  48. data/examples/08_graph_visualization.rb +246 -0
  49. data/examples/09_pipeline_visualization.rb +266 -0
  50. data/examples/10_concurrency_control.rb +235 -0
  51. data/examples/11_sequential_dependencies.rb +243 -0
  52. data/examples/12_none_constant.rb +161 -0
  53. data/examples/README.md +374 -0
  54. data/examples/regression_test/01_basic_pipeline.txt +38 -0
  55. data/examples/regression_test/02_error_handling.txt +92 -0
  56. data/examples/regression_test/03_middleware.txt +61 -0
  57. data/examples/regression_test/04_parallel_automatic.txt +86 -0
  58. data/examples/regression_test/05_parallel_explicit.txt +80 -0
  59. data/examples/regression_test/06_real_world_ecommerce.txt +53 -0
  60. data/examples/regression_test/07_real_world_etl.txt +58 -0
  61. data/examples/regression_test/08_graph_visualization.txt +429 -0
  62. data/examples/regression_test/09_pipeline_visualization.txt +305 -0
  63. data/examples/regression_test/10_concurrency_control.txt +96 -0
  64. data/examples/regression_test/11_sequential_dependencies.txt +86 -0
  65. data/examples/regression_test/12_none_constant.txt +64 -0
  66. data/examples/regression_test.rb +105 -0
  67. data/lib/simple_flow/dependency_graph.rb +120 -0
  68. data/lib/simple_flow/dependency_graph_visualizer.rb +326 -0
  69. data/lib/simple_flow/middleware.rb +36 -0
  70. data/lib/simple_flow/parallel_executor.rb +80 -0
  71. data/lib/simple_flow/pipeline.rb +405 -0
  72. data/lib/simple_flow/result.rb +88 -0
  73. data/lib/simple_flow/step_tracker.rb +58 -0
  74. data/lib/simple_flow/version.rb +5 -0
  75. data/lib/simple_flow.rb +41 -0
  76. data/mkdocs.yml +146 -0
  77. data/pipeline_graph.dot +51 -0
  78. data/pipeline_graph.html +60 -0
  79. data/pipeline_graph.mmd +19 -0
  80. metadata +127 -0
data/docs/guides/data-fetching.md (new file, 478 lines)
# Data Fetching Guide

This guide demonstrates how to fetch data from various sources using SimpleFlow, including APIs, databases, file systems, and external services.

## API Data Fetching

### Basic API Call

```ruby
step :fetch_from_api, ->(result) {
  begin
    response = HTTP.get("https://api.example.com/users/#{result.value}")
    data = JSON.parse(response.body)
    result.with_context(:user_data, data).continue(result.value)
  rescue HTTP::Error => e
    result.halt.with_error(:api, "API request failed: #{e.message}")
  rescue JSON::ParserError => e
    result.halt.with_error(:parse, "Invalid JSON: #{e.message}")
  end
}
```
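
Registered in a pipeline, the step runs like any other callable. A minimal wiring sketch (the skip-on-halt behavior follows the semantics described in [Flow Control](../core-concepts/flow-control.md); the second step here is hypothetical):

```ruby
pipeline = SimpleFlow::Pipeline.new do
  step :fetch_from_api, ->(result) {
    begin
      response = HTTP.get("https://api.example.com/users/#{result.value}")
      result.with_context(:user_data, JSON.parse(response.body)).continue(result.value)
    rescue HTTP::Error, JSON::ParserError => e
      result.halt.with_error(:api, e.message)
    end
  }

  # Runs only if :fetch_from_api continued rather than halted
  step :greet_user, ->(result) {
    result.continue("Hello, #{result.context[:user_data]['name']}")
  }
end

result = pipeline.call(SimpleFlow::Result.new(42))
```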

### Parallel API Calls

```ruby
pipeline = SimpleFlow::Pipeline.new do
  step :fetch_weather, ->(result) {
    location = result.value[:location]
    weather = HTTP.get("https://api.weather.com/current?location=#{location}").parse
    result.with_context(:weather, weather).continue(result.value)
  }, depends_on: []

  step :fetch_news, ->(result) {
    topic = result.value[:topic]
    news = HTTP.get("https://api.news.com/articles?topic=#{topic}").parse
    result.with_context(:news, news).continue(result.value)
  }, depends_on: []

  step :fetch_stocks, ->(result) {
    symbols = result.value[:symbols]
    stocks = HTTP.get("https://api.stocks.com/quotes?symbols=#{symbols}").parse
    result.with_context(:stocks, stocks).continue(result.value)
  }, depends_on: []

  step :combine_results, ->(result) {
    combined = {
      weather: result.context[:weather],
      news: result.context[:news],
      stocks: result.context[:stocks]
    }
    result.continue(combined)
  }, depends_on: [:fetch_weather, :fetch_news, :fetch_stocks]
end

# All API calls execute in parallel
result = pipeline.call_parallel(
  SimpleFlow::Result.new({ location: "NYC", topic: "tech", symbols: "AAPL,GOOGL" })
)
```
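
One caveat with the interpolated URLs above: values are not URL-encoded, so a location like `New York` would break the query string. The http gem can build and encode the query for you via `params:`:

```ruby
require 'http'

location = "New York"

# `params:` URL-encodes values instead of trusting raw interpolation
response = HTTP.get("https://api.weather.com/current", params: { location: location })
weather  = response.parse
```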

### API with Authentication

```ruby
class AuthenticatedAPI
  def initialize(api_key)
    @api_key = api_key
  end

  def call(result)
    endpoint = result.value[:endpoint]

    response = HTTP
      .auth("Bearer #{@api_key}")
      .get("https://api.example.com/#{endpoint}")

    if response.status.success?
      data = JSON.parse(response.body)
      result.with_context(:api_response, data).continue(result.value)
    else
      result.halt.with_error(:api, "Request failed with status #{response.status}")
    end
  rescue StandardError => e
    result.halt.with_error(:api, "API error: #{e.message}")
  end
end

pipeline = SimpleFlow::Pipeline.new do
  step :fetch_data, AuthenticatedAPI.new(ENV['API_KEY']), depends_on: []
end
```

### Rate-Limited API Calls

```ruby
class RateLimitedFetcher
  def initialize(max_requests_per_second: 10)
    @max_requests = max_requests_per_second
    @request_times = []
  end

  def call(result)
    wait_if_rate_limited

    begin
      @request_times << Time.now
      response = HTTP.get(result.value[:url])
      data = response.parse

      result.with_context(:data, data).continue(result.value)
    rescue HTTP::Error => e
      result.halt.with_error(:http, e.message)
    end
  end

  private

  def wait_if_rate_limited
    # Remove old requests outside the time window
    one_second_ago = Time.now - 1
    @request_times.reject! { |time| time < one_second_ago }

    # Wait if we've hit the limit
    if @request_times.size >= @max_requests
      sleep(0.1)
      wait_if_rate_limited
    end
  end
end
```
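
A hypothetical wiring for the fetcher. Note that `@request_times` is shared mutable state: reuse one instance so the limit applies across steps, and wrap the bookkeeping in a `Mutex` before using it under `call_parallel`:

```ruby
fetcher = RateLimitedFetcher.new(max_requests_per_second: 5)

pipeline = SimpleFlow::Pipeline.new do
  step :fetch_page, fetcher, depends_on: []
end

result = pipeline.call(SimpleFlow::Result.new({ url: "https://api.example.com/items" }))
```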

## Database Queries

### Basic Database Query

```ruby
step :fetch_users, ->(result) {
  users = DB[:users]
    .where(active: true)
    .where { created_at > Date.today - 30 }
    .all

  result.with_context(:users, users).continue(result.value)
}
```
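
The `DB` constant here is a Sequel database handle (see the setup in the next section). The block passed to `where` is a Sequel virtual row: the comparison compiles to SQL rather than running in Ruby, which you can confirm by inspecting the generated query:

```ruby
require 'sequel'
require 'date'

DB = Sequel.connect('postgres://localhost/mydb') # illustrative connection string

# The block compiles to SQL instead of executing in Ruby
puts DB[:users].where { created_at > Date.today - 30 }.sql
# => roughly: SELECT * FROM "users" WHERE ("created_at" > '…')
```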

### Parallel Database Queries

```ruby
# Ensure your database connection pool supports concurrent queries
DB = Sequel.connect(
  'postgres://localhost/mydb',
  max_connections: 10 # Allow concurrent connections
)

pipeline = SimpleFlow::Pipeline.new do
  step :fetch_users, ->(result) {
    users = DB[:users].where(active: true).all
    result.with_context(:users, users).continue(result.value)
  }, depends_on: []

  step :fetch_orders, ->(result) {
    orders = DB[:orders].where(status: 'completed').all
    result.with_context(:orders, orders).continue(result.value)
  }, depends_on: []

  step :fetch_products, ->(result) {
    products = DB[:products].where(in_stock: true).all
    result.with_context(:products, products).continue(result.value)
  }, depends_on: []

  step :aggregate, ->(result) {
    stats = {
      total_users: result.context[:users].size,
      total_orders: result.context[:orders].size,
      total_products: result.context[:products].size
    }
    result.continue(stats)
  }, depends_on: [:fetch_users, :fetch_orders, :fetch_products]
end

result = pipeline.call_parallel(SimpleFlow::Result.new(nil))
```

### Complex Joins and Aggregations

```ruby
step :fetch_user_analytics, ->(result) {
  user_id = result.value

  analytics = DB[:users]
    .select(:users__id, :users__name)
    .select_append { count(:orders__id).as(:order_count) }
    .select_append { sum(:orders__total).as(:total_spent) }
    .left_join(:orders, user_id: :id)
    .where(users__id: user_id)
    .group(:users__id, :users__name)
    .first

  result.with_context(:analytics, analytics).continue(result.value)
}
```
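
A compatibility note: the `:users__id` style relies on Sequel's symbol splitting, which is disabled by default since Sequel 5. On current versions, qualify columns explicitly with `Sequel[:table][:column]`:

```ruby
user_id = 42 # example input

analytics = DB[:users]
  .select(Sequel[:users][:id], Sequel[:users][:name])
  .select_append { count(Sequel[:orders][:id]).as(:order_count) }
  .select_append { sum(Sequel[:orders][:total]).as(:total_spent) }
  .left_join(:orders, user_id: :id)
  .where(Sequel[:users][:id] => user_id)
  .group(Sequel[:users][:id], Sequel[:users][:name])
  .first
```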

## File System Operations

### Reading Files

```ruby
step :read_config, ->(result) {
  begin
    config_path = result.value[:config_path]
    content = File.read(config_path)
    config = JSON.parse(content)

    result.with_context(:config, config).continue(result.value)
  rescue Errno::ENOENT
    result.halt.with_error(:file, "Config file not found: #{config_path}")
  rescue JSON::ParserError => e
    result.halt.with_error(:parse, "Invalid JSON in config: #{e.message}")
  end
}
```

### Reading Multiple Files in Parallel

```ruby
pipeline = SimpleFlow::Pipeline.new do
  step :read_users_csv, ->(result) {
    users = CSV.read('data/users.csv', headers: true).map(&:to_h)
    result.with_context(:users, users).continue(result.value)
  }, depends_on: []

  step :read_products_json, ->(result) {
    products = JSON.parse(File.read('data/products.json'))
    result.with_context(:products, products).continue(result.value)
  }, depends_on: []

  step :read_config_yaml, ->(result) {
    config = YAML.load_file('config/settings.yml')
    result.with_context(:config, config).continue(result.value)
  }, depends_on: []

  step :combine_data, ->(result) {
    combined = {
      users: result.context[:users],
      products: result.context[:products],
      config: result.context[:config]
    }
    result.continue(combined)
  }, depends_on: [:read_users_csv, :read_products_json, :read_config_yaml]
end
```
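
These snippets assume `require 'csv'`, `require 'json'`, and `require 'yaml'`. One more portability note: on Ruby 3.1+ (Psych 4), `YAML.load_file` uses safe loading by default, so a settings file containing dates, times, or symbols needs those classes permitted explicitly:

```ruby
require 'yaml'
require 'date'

# Psych 4 rejects non-basic types unless they are explicitly allowed
config = YAML.load_file('config/settings.yml',
                        permitted_classes: [Date, Time, Symbol])
```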

### Processing Large Files

```ruby
step :process_large_file, ->(result) {
  file_path = result.value
  processed_count = 0

  # Read line by line and process in batches of 1,000 so the whole
  # file never has to fit in memory (process_line is a placeholder
  # for your own per-line logic)
  File.foreach(file_path).each_slice(1000) do |batch|
    batch.each do |line|
      process_line(line)
      processed_count += 1
    end
  end

  result.with_context(:lines_processed, processed_count).continue(result.value)
}
```
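
When you only need part of a file, a lazy enumerator stops reading as soon as enough lines have been consumed. A sketch (the filtering is illustrative):

```ruby
step :sample_lines, ->(result) {
  # Lazily read, strip, and keep the first 100 non-empty lines;
  # the rest of the file is never touched.
  sample = File.foreach(result.value)
               .lazy
               .map(&:strip)
               .reject(&:empty?)
               .first(100)

  result.with_context(:sample, sample).continue(result.value)
}
```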

## Caching Strategies

### Simple Cache with Fallback

```ruby
step :fetch_with_cache, ->(result) {
  cache_key = "user_#{result.value}"

  # Try cache first
  cached = REDIS.get(cache_key)
  if cached
    data = JSON.parse(cached)
    return result.with_context(:source, :cache).continue(data)
  end

  # Cache miss - fetch from API
  begin
    response = HTTP.get("https://api.example.com/users/#{result.value}")
    data = response.parse

    # Store in cache for 1 hour
    REDIS.setex(cache_key, 3600, data.to_json)

    result.with_context(:source, :api).continue(data)
  rescue HTTP::Error => e
    result.halt.with_error(:fetch, "Failed to fetch data: #{e.message}")
  end
}
```
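
The `REDIS` constant is assumed to be a client from the redis gem:

```ruby
require 'redis'

REDIS = Redis.new(url: ENV.fetch('REDIS_URL', 'redis://localhost:6379'))
```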

### Multi-Level Caching

```ruby
class MultiLevelCache
  def self.call(result)
    key = result.value[:cache_key]

    # Level 1: Memory cache
    if (data = MEMORY_CACHE[key])
      return result.with_context(:cache_level, :memory).continue(data)
    end

    # Level 2: Redis cache
    if (cached = REDIS.get(key))
      data = JSON.parse(cached)
      MEMORY_CACHE[key] = data
      return result.with_context(:cache_level, :redis).continue(data)
    end

    # Level 3: Database
    if (record = DB[:cache].where(key: key).first)
      data = JSON.parse(record[:value])
      REDIS.setex(key, 3600, data.to_json)
      MEMORY_CACHE[key] = data
      return result.with_context(:cache_level, :database).continue(data)
    end

    # No cache hit - need to fetch
    result.with_context(:cache_level, :none).continue(nil)
  end
end
```
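
Since the class responds to `call`, it can be registered as a step directly. A hypothetical follow-up step fetches from the origin only when every cache level missed (the cache step continues with `nil` in that case):

```ruby
pipeline = SimpleFlow::Pipeline.new do
  step :check_cache, MultiLevelCache

  step :fetch_origin, ->(result) {
    if result.value.nil?
      # Cache miss everywhere - go to the source (URL is illustrative)
      data = HTTP.get("https://api.example.com/data").parse
      result.continue(data)
    else
      result.continue(result.value) # served from cache
    end
  }
end
```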

## Batch Processing

### Fetching Data in Batches

```ruby
pipeline = SimpleFlow::Pipeline.new do
  step :fetch_batch, ->(result) {
    batch_ids = result.value
    records = DB[:records].where(id: batch_ids).all

    result.with_context(:records, records).continue(result.value)
  }

  step :process_records, ->(result) {
    records = result.context[:records]
    # transform_record is a placeholder for your own per-record logic
    processed = records.map { |r| transform_record(r) }

    result.continue(processed)
  }
end

# Process in batches of 100 ids at a time
all_ids = (1..10000).to_a
all_ids.each_slice(100) do |batch|
  result = pipeline.call(SimpleFlow::Result.new(batch))
  save_processed_batch(result.value) # placeholder for your own persistence
end
```

## Real-World ETL Example

```ruby
class ETLPipeline
  def self.build
    SimpleFlow::Pipeline.new do
      # Extract phase - parallel data loading
      step :extract_users, ->(result) {
        users = CSV.read('data/users.csv', headers: true).map(&:to_h)
        result.with_context(:raw_users, users).continue(result.value)
      }, depends_on: []

      step :extract_orders, ->(result) {
        orders = JSON.parse(File.read('data/orders.json'))
        result.with_context(:raw_orders, orders).continue(result.value)
      }, depends_on: []

      step :extract_products, ->(result) {
        products = DB[:products].all
        result.with_context(:raw_products, products).continue(result.value)
      }, depends_on: []

      # Transform phase - parallel transformations
      step :transform_users, ->(result) {
        users = result.context[:raw_users].map do |user|
          {
            id: user['id'].to_i,
            name: user['name'].strip.downcase,
            email: user['email'].downcase,
            created_at: Date.parse(user['signup_date'])
          }
        end
        result.with_context(:users, users).continue(result.value)
      }, depends_on: [:extract_users]

      step :transform_orders, ->(result) {
        orders = result.context[:raw_orders]
          .reject { |o| o['status'] == 'cancelled' }
          .map do |order|
            {
              id: order['order_id'],
              user_id: order['user_id'],
              total: order['amount'].to_f,
              items: order['items'].size
            }
          end
        result.with_context(:orders, orders).continue(result.value)
      }, depends_on: [:extract_orders]

      # Load phase - aggregate and save
      step :aggregate_stats, ->(result) {
        users = result.context[:users]
        orders = result.context[:orders]

        stats = users.map do |user|
          user_orders = orders.select { |o| o[:user_id] == user[:id] }
          {
            user_id: user[:id],
            total_orders: user_orders.size,
            total_spent: user_orders.sum { |o| o[:total] },
            avg_order: user_orders.empty? ? 0 : user_orders.sum { |o| o[:total] } / user_orders.size
          }
        end

        result.continue(stats)
      }, depends_on: [:transform_users, :transform_orders]

      step :save_results, ->(result) {
        DB[:user_stats].multi_insert(result.value)
        result.continue("Saved #{result.value.size} records")
      }, depends_on: [:aggregate_stats]
    end
  end
end

# Execute ETL pipeline
result = ETLPipeline.build.call_parallel(SimpleFlow::Result.new(nil))
puts result.value # "Saved 150 records"
```

## Error Handling for Data Fetching

```ruby
step :fetch_with_retries, ->(result) {
  max_retries = 3
  attempt = 0

  begin
    attempt += 1
    response = HTTP.timeout(10).get(result.value[:url])
    data = response.parse

    result
      .with_context(:attempts, attempt)
      .with_context(:data, data)
      .continue(result.value)
  rescue HTTP::TimeoutError
    if attempt < max_retries
      sleep(2 ** attempt) # Exponential backoff: 2s, then 4s
      retry
    else
      result.halt.with_error(:timeout, "Request timed out after #{max_retries} attempts")
    end
  rescue HTTP::Error => e
    result.halt.with_error(:http, "HTTP error: #{e.message}")
  end
}
```
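
When several parallel steps retry against the same service, adding jitter keeps their backoff schedules from synchronizing. A small helper (not part of SimpleFlow):

```ruby
# Exponential backoff plus up to half a second of random noise,
# so concurrent retries spread out instead of stampeding together.
def backoff_delay(attempt, base: 2, jitter: 0.5)
  (base ** attempt) + rand(0.0..jitter)
end

sleep(backoff_delay(1)) # sleeps roughly 2.0-2.5 seconds
```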

## Related Documentation

- [Error Handling](error-handling.md) - Handling errors during data fetching
- [File Processing](file-processing.md) - Advanced file processing techniques
- [Complex Workflows](complex-workflows.md) - Building complete data pipelines
- [Performance Guide](../concurrent/performance.md) - Optimizing data fetching