fractor 0.1.4 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop-https---raw-githubusercontent-com-riboseinc-oss-guides-main-ci-rubocop-yml +552 -0
- data/.rubocop.yml +14 -8
- data/.rubocop_todo.yml +284 -43
- data/README.adoc +111 -950
- data/docs/.lycheeignore +16 -0
- data/docs/Gemfile +24 -0
- data/docs/README.md +157 -0
- data/docs/_config.yml +151 -0
- data/docs/_features/error-handling.adoc +1192 -0
- data/docs/_features/index.adoc +80 -0
- data/docs/_features/monitoring.adoc +589 -0
- data/docs/_features/signal-handling.adoc +202 -0
- data/docs/_features/workflows.adoc +1235 -0
- data/docs/_guides/continuous-mode.adoc +736 -0
- data/docs/_guides/cookbook.adoc +1133 -0
- data/docs/_guides/index.adoc +55 -0
- data/docs/_guides/pipeline-mode.adoc +730 -0
- data/docs/_guides/troubleshooting.adoc +358 -0
- data/docs/_pages/architecture.adoc +1390 -0
- data/docs/_pages/core-concepts.adoc +1392 -0
- data/docs/_pages/design-principles.adoc +862 -0
- data/docs/_pages/getting-started.adoc +290 -0
- data/docs/_pages/installation.adoc +143 -0
- data/docs/_reference/api.adoc +1080 -0
- data/docs/_reference/error-reporting.adoc +670 -0
- data/docs/_reference/examples.adoc +181 -0
- data/docs/_reference/index.adoc +96 -0
- data/docs/_reference/troubleshooting.adoc +862 -0
- data/docs/_tutorials/complex-workflows.adoc +1022 -0
- data/docs/_tutorials/data-processing-pipeline.adoc +740 -0
- data/docs/_tutorials/first-application.adoc +384 -0
- data/docs/_tutorials/index.adoc +48 -0
- data/docs/_tutorials/long-running-services.adoc +931 -0
- data/docs/assets/images/favicon-16.png +0 -0
- data/docs/assets/images/favicon-32.png +0 -0
- data/docs/assets/images/favicon-48.png +0 -0
- data/docs/assets/images/favicon.ico +0 -0
- data/docs/assets/images/favicon.png +0 -0
- data/docs/assets/images/favicon.svg +45 -0
- data/docs/assets/images/fractor-icon.svg +49 -0
- data/docs/assets/images/fractor-logo.svg +61 -0
- data/docs/index.adoc +131 -0
- data/docs/lychee.toml +39 -0
- data/examples/api_aggregator/README.adoc +627 -0
- data/examples/api_aggregator/api_aggregator.rb +376 -0
- data/examples/auto_detection/README.adoc +407 -29
- data/examples/auto_detection/auto_detection.rb +9 -9
- data/examples/continuous_chat_common/message_protocol.rb +53 -0
- data/examples/continuous_chat_fractor/README.adoc +217 -0
- data/examples/continuous_chat_fractor/chat_client.rb +303 -0
- data/examples/continuous_chat_fractor/chat_common.rb +83 -0
- data/examples/continuous_chat_fractor/chat_server.rb +167 -0
- data/examples/continuous_chat_fractor/simulate.rb +345 -0
- data/examples/continuous_chat_server/README.adoc +135 -0
- data/examples/continuous_chat_server/chat_client.rb +303 -0
- data/examples/continuous_chat_server/chat_server.rb +359 -0
- data/examples/continuous_chat_server/simulate.rb +343 -0
- data/examples/error_reporting.rb +207 -0
- data/examples/file_processor/README.adoc +170 -0
- data/examples/file_processor/file_processor.rb +615 -0
- data/examples/file_processor/sample_files/invalid.csv +1 -0
- data/examples/file_processor/sample_files/orders.xml +24 -0
- data/examples/file_processor/sample_files/products.json +23 -0
- data/examples/file_processor/sample_files/users.csv +6 -0
- data/examples/hierarchical_hasher/README.adoc +629 -41
- data/examples/hierarchical_hasher/hierarchical_hasher.rb +12 -8
- data/examples/image_processor/README.adoc +610 -0
- data/examples/image_processor/image_processor.rb +349 -0
- data/examples/image_processor/processed_images/sample_10_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_1_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_2_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_3_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_4_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_5_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_6_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_7_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_8_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_9_processed.jpg.json +12 -0
- data/examples/image_processor/test_images/sample_1.png +1 -0
- data/examples/image_processor/test_images/sample_10.png +1 -0
- data/examples/image_processor/test_images/sample_2.png +1 -0
- data/examples/image_processor/test_images/sample_3.png +1 -0
- data/examples/image_processor/test_images/sample_4.png +1 -0
- data/examples/image_processor/test_images/sample_5.png +1 -0
- data/examples/image_processor/test_images/sample_6.png +1 -0
- data/examples/image_processor/test_images/sample_7.png +1 -0
- data/examples/image_processor/test_images/sample_8.png +1 -0
- data/examples/image_processor/test_images/sample_9.png +1 -0
- data/examples/log_analyzer/README.adoc +662 -0
- data/examples/log_analyzer/log_analyzer.rb +579 -0
- data/examples/log_analyzer/sample_logs/apache.log +20 -0
- data/examples/log_analyzer/sample_logs/json.log +15 -0
- data/examples/log_analyzer/sample_logs/nginx.log +15 -0
- data/examples/log_analyzer/sample_logs/rails.log +29 -0
- data/examples/multi_work_type/README.adoc +576 -26
- data/examples/multi_work_type/multi_work_type.rb +30 -29
- data/examples/performance_monitoring.rb +120 -0
- data/examples/pipeline_processing/README.adoc +740 -26
- data/examples/pipeline_processing/pipeline_processing.rb +16 -16
- data/examples/priority_work_example.rb +155 -0
- data/examples/producer_subscriber/README.adoc +889 -46
- data/examples/producer_subscriber/producer_subscriber.rb +20 -16
- data/examples/scatter_gather/README.adoc +829 -27
- data/examples/scatter_gather/scatter_gather.rb +29 -28
- data/examples/simple/README.adoc +347 -0
- data/examples/simple/sample.rb +5 -5
- data/examples/specialized_workers/README.adoc +622 -26
- data/examples/specialized_workers/specialized_workers.rb +88 -45
- data/examples/stream_processor/README.adoc +206 -0
- data/examples/stream_processor/stream_processor.rb +284 -0
- data/examples/web_scraper/README.adoc +625 -0
- data/examples/web_scraper/web_scraper.rb +285 -0
- data/examples/workflow/README.adoc +406 -0
- data/examples/workflow/circuit_breaker/README.adoc +360 -0
- data/examples/workflow/circuit_breaker/circuit_breaker_workflow.rb +225 -0
- data/examples/workflow/conditional/README.adoc +483 -0
- data/examples/workflow/conditional/conditional_workflow.rb +215 -0
- data/examples/workflow/dead_letter_queue/README.adoc +374 -0
- data/examples/workflow/dead_letter_queue/dead_letter_queue_workflow.rb +217 -0
- data/examples/workflow/fan_out/README.adoc +381 -0
- data/examples/workflow/fan_out/fan_out_workflow.rb +202 -0
- data/examples/workflow/retry/README.adoc +248 -0
- data/examples/workflow/retry/retry_workflow.rb +195 -0
- data/examples/workflow/simple_linear/README.adoc +267 -0
- data/examples/workflow/simple_linear/simple_linear_workflow.rb +175 -0
- data/examples/workflow/simplified/README.adoc +329 -0
- data/examples/workflow/simplified/simplified_workflow.rb +222 -0
- data/exe/fractor +10 -0
- data/lib/fractor/cli.rb +288 -0
- data/lib/fractor/configuration.rb +307 -0
- data/lib/fractor/continuous_server.rb +183 -0
- data/lib/fractor/error_formatter.rb +72 -0
- data/lib/fractor/error_report_generator.rb +152 -0
- data/lib/fractor/error_reporter.rb +244 -0
- data/lib/fractor/error_statistics.rb +147 -0
- data/lib/fractor/execution_tracer.rb +162 -0
- data/lib/fractor/logger.rb +230 -0
- data/lib/fractor/main_loop_handler.rb +406 -0
- data/lib/fractor/main_loop_handler3.rb +135 -0
- data/lib/fractor/main_loop_handler4.rb +299 -0
- data/lib/fractor/performance_metrics_collector.rb +181 -0
- data/lib/fractor/performance_monitor.rb +215 -0
- data/lib/fractor/performance_report_generator.rb +202 -0
- data/lib/fractor/priority_work.rb +93 -0
- data/lib/fractor/priority_work_queue.rb +189 -0
- data/lib/fractor/result_aggregator.rb +33 -1
- data/lib/fractor/shutdown_handler.rb +168 -0
- data/lib/fractor/signal_handler.rb +80 -0
- data/lib/fractor/supervisor.rb +430 -144
- data/lib/fractor/supervisor_logger.rb +88 -0
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/work.rb +12 -0
- data/lib/fractor/work_distribution_manager.rb +151 -0
- data/lib/fractor/work_queue.rb +88 -0
- data/lib/fractor/work_result.rb +181 -9
- data/lib/fractor/worker.rb +75 -1
- data/lib/fractor/workflow/builder.rb +210 -0
- data/lib/fractor/workflow/chain_builder.rb +169 -0
- data/lib/fractor/workflow/circuit_breaker.rb +183 -0
- data/lib/fractor/workflow/circuit_breaker_orchestrator.rb +208 -0
- data/lib/fractor/workflow/circuit_breaker_registry.rb +112 -0
- data/lib/fractor/workflow/dead_letter_queue.rb +334 -0
- data/lib/fractor/workflow/execution_hooks.rb +39 -0
- data/lib/fractor/workflow/execution_strategy.rb +225 -0
- data/lib/fractor/workflow/execution_trace.rb +134 -0
- data/lib/fractor/workflow/helpers.rb +191 -0
- data/lib/fractor/workflow/job.rb +290 -0
- data/lib/fractor/workflow/job_dependency_validator.rb +120 -0
- data/lib/fractor/workflow/logger.rb +110 -0
- data/lib/fractor/workflow/pre_execution_context.rb +193 -0
- data/lib/fractor/workflow/retry_config.rb +156 -0
- data/lib/fractor/workflow/retry_orchestrator.rb +184 -0
- data/lib/fractor/workflow/retry_strategy.rb +93 -0
- data/lib/fractor/workflow/structured_logger.rb +30 -0
- data/lib/fractor/workflow/type_compatibility_validator.rb +222 -0
- data/lib/fractor/workflow/visualizer.rb +211 -0
- data/lib/fractor/workflow/workflow_context.rb +132 -0
- data/lib/fractor/workflow/workflow_executor.rb +669 -0
- data/lib/fractor/workflow/workflow_result.rb +55 -0
- data/lib/fractor/workflow/workflow_validator.rb +295 -0
- data/lib/fractor/workflow.rb +333 -0
- data/lib/fractor/wrapped_ractor.rb +66 -91
- data/lib/fractor/wrapped_ractor3.rb +161 -0
- data/lib/fractor/wrapped_ractor4.rb +242 -0
- data/lib/fractor.rb +93 -3
- metadata +192 -6
- data/tests/sample.rb.bak +0 -309
- data/tests/sample_working.rb.bak +0 -209
|
@@ -0,0 +1,740 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Building a Data Processing Pipeline
|
|
4
|
+
nav_order: 5
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
== Building a Data Processing Pipeline
|
|
8
|
+
|
|
9
|
+
=== Overview
|
|
10
|
+
|
|
11
|
+
In this 30-minute intermediate tutorial, you'll build a complete data processing pipeline that extracts data from CSV files, transforms it, validates it, and loads it into a database (the load step is simulated in this tutorial). This real-world example demonstrates how to structure complex pipelines using Fractor.
|
|
12
|
+
|
|
13
|
+
**What you'll learn:**
|
|
14
|
+
|
|
15
|
+
* Breaking down complex tasks into pipeline stages
|
|
16
|
+
* Creating specialized workers for each stage
|
|
17
|
+
* Handling errors and validation
|
|
18
|
+
* Monitoring pipeline progress
|
|
19
|
+
* Best practices for production pipelines
|
|
20
|
+
|
|
21
|
+
**Prerequisites:**
|
|
22
|
+
|
|
23
|
+
* Completed link:getting-started[Getting Started] tutorial
|
|
24
|
+
* Basic understanding of link:../guides/core-concepts[Core Concepts]
|
|
25
|
+
* Familiarity with Ruby classes and CSV processing
|
|
26
|
+
|
|
27
|
+
=== The Problem
|
|
28
|
+
|
|
29
|
+
You need to process customer data from multiple CSV files:
|
|
30
|
+
|
|
31
|
+
1. **Extract**: Read CSV files and parse records
|
|
32
|
+
2. **Transform**: Clean and normalize data (trim whitespace, format dates, etc.)
|
|
33
|
+
3. **Validate**: Check data quality (required fields, valid emails, etc.)
|
|
34
|
+
4. **Load**: Insert valid records into a database
|
|
35
|
+
|
|
36
|
+
The pipeline should:
|
|
37
|
+
|
|
38
|
+
* Process files in parallel for speed
|
|
39
|
+
* Handle validation errors gracefully
|
|
40
|
+
* Track success/failure statistics
|
|
41
|
+
* Be production-ready with proper error handling
|
|
42
|
+
|
|
43
|
+
=== Step 1: Set Up the Project
|
|
44
|
+
|
|
45
|
+
Create a new project directory:
|
|
46
|
+
|
|
47
|
+
[source,sh]
|
|
48
|
+
----
|
|
49
|
+
mkdir customer_pipeline
|
|
50
|
+
cd customer_pipeline
|
|
51
|
+
----
|
|
52
|
+
|
|
53
|
+
Create the directory structure:
|
|
54
|
+
|
|
55
|
+
[source,sh]
|
|
56
|
+
----
|
|
57
|
+
mkdir -p lib data
|
|
58
|
+
touch lib/pipeline.rb
|
|
59
|
+
touch lib/models.rb
|
|
60
|
+
touch lib/workers.rb
|
|
61
|
+
----
|
|
62
|
+
|
|
63
|
+
Install Fractor:
|
|
64
|
+
|
|
65
|
+
[source,sh]
|
|
66
|
+
----
|
|
67
|
+
gem install fractor
|
|
68
|
+
----
|
|
69
|
+
|
|
70
|
+
=== Step 2: Define Data Models
|
|
71
|
+
|
|
72
|
+
Create `lib/models.rb` to define our data structures:
|
|
73
|
+
|
|
74
|
+
[source,ruby]
|
|
75
|
+
----
|
|
76
|
+
require 'date'
|
|
77
|
+
|
|
78
|
+
# Raw data from CSV
|
|
79
|
+
class RawCustomer
|
|
80
|
+
attr_reader :data
|
|
81
|
+
|
|
82
|
+
def initialize(row)
|
|
83
|
+
@data = row
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def [](key)
|
|
87
|
+
@data[key]
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Transformed and validated customer
|
|
92
|
+
class Customer
|
|
93
|
+
attr_accessor :id, :name, :email, :phone, :signup_date, :country
|
|
94
|
+
|
|
95
|
+
def initialize(attrs = {})
|
|
96
|
+
@id = attrs[:id]
|
|
97
|
+
@name = attrs[:name]
|
|
98
|
+
@email = attrs[:email]
|
|
99
|
+
@phone = attrs[:phone]
|
|
100
|
+
@signup_date = attrs[:signup_date]
|
|
101
|
+
@country = attrs[:country]
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def to_h
|
|
105
|
+
{
|
|
106
|
+
id: @id,
|
|
107
|
+
name: @name,
|
|
108
|
+
email: @email,
|
|
109
|
+
phone: @phone,
|
|
110
|
+
signup_date: @signup_date,
|
|
111
|
+
country: @country
|
|
112
|
+
}
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# Validation result
|
|
117
|
+
class ValidationResult
|
|
118
|
+
attr_reader :customer, :errors
|
|
119
|
+
|
|
120
|
+
def initialize(customer:, errors: [])
|
|
121
|
+
@customer = customer
|
|
122
|
+
@errors = errors
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
def valid?
|
|
126
|
+
@errors.empty?
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
----
|
|
130
|
+
|
|
131
|
+
=== Step 3: Create Pipeline Workers
|
|
132
|
+
|
|
133
|
+
Create `lib/workers.rb` with specialized workers for each stage:
|
|
134
|
+
|
|
135
|
+
[source,ruby]
|
|
136
|
+
----
|
|
137
|
+
require 'fractor'
|
|
138
|
+
require 'csv'
|
|
139
|
+
require_relative 'models'
|
|
140
|
+
|
|
141
|
+
# Stage 1: Extract - Read and parse CSV files
|
|
142
|
+
class ExtractWorker < Fractor::Worker
|
|
143
|
+
def process(work)
|
|
144
|
+
filepath = work.input[:filepath]
|
|
145
|
+
|
|
146
|
+
customers = []
|
|
147
|
+
CSV.foreach(filepath, headers: true) do |row|
|
|
148
|
+
customers << RawCustomer.new(row.to_h)
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
Fractor::WorkResult.new(
|
|
152
|
+
result: {
|
|
153
|
+
filepath: filepath,
|
|
154
|
+
customers: customers,
|
|
155
|
+
count: customers.size
|
|
156
|
+
},
|
|
157
|
+
work: work
|
|
158
|
+
)
|
|
159
|
+
rescue => e
|
|
160
|
+
Fractor::WorkResult.new(
|
|
161
|
+
error: e,
|
|
162
|
+
error_code: :extraction_failed,
|
|
163
|
+
error_context: { filepath: filepath },
|
|
164
|
+
work: work
|
|
165
|
+
)
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
# Stage 2: Transform - Clean and normalize data
|
|
170
|
+
class TransformWorker < Fractor::Worker
|
|
171
|
+
def process(work)
|
|
172
|
+
raw_customer = work.input[:customer]
|
|
173
|
+
|
|
174
|
+
customer = Customer.new(
|
|
175
|
+
id: raw_customer['id']&.strip,
|
|
176
|
+
name: normalize_name(raw_customer['name']),
|
|
177
|
+
email: raw_customer['email']&.strip&.downcase,
|
|
178
|
+
phone: normalize_phone(raw_customer['phone']),
|
|
179
|
+
signup_date: parse_date(raw_customer['signup_date']),
|
|
180
|
+
country: raw_customer['country']&.strip&.upcase
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
Fractor::WorkResult.new(
|
|
184
|
+
result: { customer: customer },
|
|
185
|
+
work: work
|
|
186
|
+
)
|
|
187
|
+
rescue => e
|
|
188
|
+
Fractor::WorkResult.new(
|
|
189
|
+
error: e,
|
|
190
|
+
error_code: :transformation_failed,
|
|
191
|
+
error_context: { raw_data: raw_customer.data },
|
|
192
|
+
work: work
|
|
193
|
+
)
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
private
|
|
197
|
+
|
|
198
|
+
def normalize_name(name)
|
|
199
|
+
return nil unless name
|
|
200
|
+
name.strip.split.map(&:capitalize).join(' ')
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
def normalize_phone(phone)
|
|
204
|
+
return nil unless phone
|
|
205
|
+
phone.gsub(/[^0-9]/, '')
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
def parse_date(date_str)
|
|
209
|
+
return nil unless date_str
|
|
210
|
+
Date.parse(date_str)
|
|
211
|
+
rescue ArgumentError
|
|
212
|
+
nil
|
|
213
|
+
end
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
# Stage 3: Validate - Check data quality
|
|
217
|
+
class ValidateWorker < Fractor::Worker
|
|
218
|
+
EMAIL_REGEX = /\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i
|
|
219
|
+
|
|
220
|
+
def process(work)
|
|
221
|
+
customer = work.input[:customer]
|
|
222
|
+
errors = []
|
|
223
|
+
|
|
224
|
+
# Required field validation
|
|
225
|
+
errors << "ID is required" if customer.id.nil? || customer.id.empty?
|
|
226
|
+
errors << "Name is required" if customer.name.nil? || customer.name.empty?
|
|
227
|
+
errors << "Email is required" if customer.email.nil? || customer.email.empty?
|
|
228
|
+
|
|
229
|
+
# Format validation
|
|
230
|
+
if customer.email && !customer.email.match?(EMAIL_REGEX)
|
|
231
|
+
errors << "Email format is invalid"
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
if customer.phone && customer.phone.length < 7
|
|
235
|
+
errors << "Phone number is too short"
|
|
236
|
+
end
|
|
237
|
+
|
|
238
|
+
# Business logic validation
|
|
239
|
+
if customer.signup_date && customer.signup_date > Date.today
|
|
240
|
+
errors << "Signup date cannot be in the future"
|
|
241
|
+
end
|
|
242
|
+
|
|
243
|
+
result = ValidationResult.new(
|
|
244
|
+
customer: customer,
|
|
245
|
+
errors: errors
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
Fractor::WorkResult.new(
|
|
249
|
+
result: { validation_result: result },
|
|
250
|
+
work: work
|
|
251
|
+
)
|
|
252
|
+
rescue => e
|
|
253
|
+
Fractor::WorkResult.new(
|
|
254
|
+
error: e,
|
|
255
|
+
error_code: :validation_failed,
|
|
256
|
+
work: work
|
|
257
|
+
)
|
|
258
|
+
end
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
# Stage 4: Load - Insert into database (simulated)
|
|
262
|
+
class LoadWorker < Fractor::Worker
|
|
263
|
+
def process(work)
|
|
264
|
+
validation_result = work.input[:validation_result]
|
|
265
|
+
|
|
266
|
+
unless validation_result.valid?
|
|
267
|
+
return Fractor::WorkResult.new(
|
|
268
|
+
error: "Validation failed: #{validation_result.errors.join(', ')}",
|
|
269
|
+
error_code: :invalid_data,
|
|
270
|
+
error_context: {
|
|
271
|
+
customer_id: validation_result.customer.id,
|
|
272
|
+
errors: validation_result.errors
|
|
273
|
+
},
|
|
274
|
+
work: work
|
|
275
|
+
)
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
customer = validation_result.customer
|
|
279
|
+
|
|
280
|
+
# Simulate database insert
|
|
281
|
+
# In production: db.insert(:customers, customer.to_h)
|
|
282
|
+
insert_to_database(customer)
|
|
283
|
+
|
|
284
|
+
Fractor::WorkResult.new(
|
|
285
|
+
result: {
|
|
286
|
+
customer_id: customer.id,
|
|
287
|
+
inserted: true
|
|
288
|
+
},
|
|
289
|
+
work: work
|
|
290
|
+
)
|
|
291
|
+
rescue => e
|
|
292
|
+
Fractor::WorkResult.new(
|
|
293
|
+
error: e,
|
|
294
|
+
error_code: :load_failed,
|
|
295
|
+
error_context: { customer_id: customer&.id },
|
|
296
|
+
work: work
|
|
297
|
+
)
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
private
|
|
301
|
+
|
|
302
|
+
def insert_to_database(customer)
|
|
303
|
+
# Simulate database operation
|
|
304
|
+
sleep(0.01) # Simulate network latency
|
|
305
|
+
puts " ✓ Loaded customer: #{customer.id} - #{customer.name}"
|
|
306
|
+
end
|
|
307
|
+
end
|
|
308
|
+
----
|
|
309
|
+
|
|
310
|
+
=== Step 4: Build the Pipeline
|
|
311
|
+
|
|
312
|
+
Create `lib/pipeline.rb` to orchestrate the pipeline:
|
|
313
|
+
|
|
314
|
+
[source,ruby]
|
|
315
|
+
----
|
|
316
|
+
require 'fractor'
|
|
317
|
+
require_relative 'workers'
|
|
318
|
+
require_relative 'models'
|
|
319
|
+
|
|
320
|
+
class CustomerPipeline
|
|
321
|
+
attr_reader :stats
|
|
322
|
+
|
|
323
|
+
def initialize(csv_files)
|
|
324
|
+
@csv_files = csv_files
|
|
325
|
+
@stats = {
|
|
326
|
+
files_processed: 0,
|
|
327
|
+
customers_extracted: 0,
|
|
328
|
+
customers_loaded: 0,
|
|
329
|
+
validation_errors: 0,
|
|
330
|
+
processing_errors: 0
|
|
331
|
+
}
|
|
332
|
+
end
|
|
333
|
+
|
|
334
|
+
def run
|
|
335
|
+
puts "Starting customer data pipeline..."
|
|
336
|
+
puts "Processing #{@csv_files.size} files\n\n"
|
|
337
|
+
|
|
338
|
+
# Stage 1: Extract customers from CSV files
|
|
339
|
+
extracted_results = extract_stage
|
|
340
|
+
|
|
341
|
+
# Stage 2: Transform extracted customers
|
|
342
|
+
transformed_results = transform_stage(extracted_results)
|
|
343
|
+
|
|
344
|
+
# Stage 3: Validate transformed customers
|
|
345
|
+
validated_results = validate_stage(transformed_results)
|
|
346
|
+
|
|
347
|
+
# Stage 4: Load valid customers
|
|
348
|
+
load_stage(validated_results)
|
|
349
|
+
|
|
350
|
+
print_statistics
|
|
351
|
+
end
|
|
352
|
+
|
|
353
|
+
private
|
|
354
|
+
|
|
355
|
+
def extract_stage
|
|
356
|
+
puts "Stage 1: Extracting data from CSV files..."
|
|
357
|
+
|
|
358
|
+
supervisor = Fractor::Supervisor.new(
|
|
359
|
+
worker_pools: [
|
|
360
|
+
{ worker_class: ExtractWorker, num_workers: 4 }
|
|
361
|
+
]
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
# Create work items for each file
|
|
365
|
+
work_items = @csv_files.map do |filepath|
|
|
366
|
+
Fractor::Work.new(filepath: filepath)
|
|
367
|
+
end
|
|
368
|
+
|
|
369
|
+
supervisor.add_work_items(work_items)
|
|
370
|
+
supervisor.run
|
|
371
|
+
|
|
372
|
+
# Process results
|
|
373
|
+
results = supervisor.results.results
|
|
374
|
+
errors = supervisor.results.errors
|
|
375
|
+
|
|
376
|
+
@stats[:files_processed] = results.size
|
|
377
|
+
@stats[:processing_errors] += errors.size
|
|
378
|
+
|
|
379
|
+
results.each do |result|
|
|
380
|
+
@stats[:customers_extracted] += result.result[:count]
|
|
381
|
+
end
|
|
382
|
+
|
|
383
|
+
errors.each do |error|
|
|
384
|
+
puts " ✗ Failed to extract #{error.error_context[:filepath]}: #{error.error}"
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
puts " → Extracted #{@stats[:customers_extracted]} customers from #{results.size} files\n\n"
|
|
388
|
+
|
|
389
|
+
results
|
|
390
|
+
end
|
|
391
|
+
|
|
392
|
+
def transform_stage(extract_results)
|
|
393
|
+
puts "Stage 2: Transforming customer data..."
|
|
394
|
+
|
|
395
|
+
supervisor = Fractor::Supervisor.new(
|
|
396
|
+
worker_pools: [
|
|
397
|
+
{ worker_class: TransformWorker, num_workers: 8 }
|
|
398
|
+
]
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
# Create work items for each customer
|
|
402
|
+
work_items = extract_results.flat_map do |result|
|
|
403
|
+
result.result[:customers].map do |customer|
|
|
404
|
+
Fractor::Work.new(customer: customer)
|
|
405
|
+
end
|
|
406
|
+
end
|
|
407
|
+
|
|
408
|
+
supervisor.add_work_items(work_items)
|
|
409
|
+
supervisor.run
|
|
410
|
+
|
|
411
|
+
results = supervisor.results.results
|
|
412
|
+
errors = supervisor.results.errors
|
|
413
|
+
|
|
414
|
+
@stats[:processing_errors] += errors.size
|
|
415
|
+
|
|
416
|
+
puts " → Transformed #{results.size} customers\n\n"
|
|
417
|
+
|
|
418
|
+
results
|
|
419
|
+
end
|
|
420
|
+
|
|
421
|
+
def validate_stage(transform_results)
|
|
422
|
+
puts "Stage 3: Validating customer data..."
|
|
423
|
+
|
|
424
|
+
supervisor = Fractor::Supervisor.new(
|
|
425
|
+
worker_pools: [
|
|
426
|
+
{ worker_class: ValidateWorker, num_workers: 8 }
|
|
427
|
+
]
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
work_items = transform_results.map do |result|
|
|
431
|
+
Fractor::Work.new(customer: result.result[:customer])
|
|
432
|
+
end
|
|
433
|
+
|
|
434
|
+
supervisor.add_work_items(work_items)
|
|
435
|
+
supervisor.run
|
|
436
|
+
|
|
437
|
+
results = supervisor.results.results
|
|
438
|
+
errors = supervisor.results.errors
|
|
439
|
+
|
|
440
|
+
@stats[:processing_errors] += errors.size
|
|
441
|
+
|
|
442
|
+
# Count validation failures
|
|
443
|
+
results.each do |result|
|
|
444
|
+
unless result.result[:validation_result].valid?
|
|
445
|
+
@stats[:validation_errors] += 1
|
|
446
|
+
end
|
|
447
|
+
end
|
|
448
|
+
|
|
449
|
+
puts " → Validated #{results.size} customers (#{@stats[:validation_errors]} invalid)\n\n"
|
|
450
|
+
|
|
451
|
+
results
|
|
452
|
+
end
|
|
453
|
+
|
|
454
|
+
def load_stage(validate_results)
|
|
455
|
+
puts "Stage 4: Loading valid customers..."
|
|
456
|
+
|
|
457
|
+
supervisor = Fractor::Supervisor.new(
|
|
458
|
+
worker_pools: [
|
|
459
|
+
{ worker_class: LoadWorker, num_workers: 4 }
|
|
460
|
+
]
|
|
461
|
+
)
|
|
462
|
+
|
|
463
|
+
work_items = validate_results.map do |result|
|
|
464
|
+
Fractor::Work.new(validation_result: result.result[:validation_result])
|
|
465
|
+
end
|
|
466
|
+
|
|
467
|
+
supervisor.add_work_items(work_items)
|
|
468
|
+
supervisor.run
|
|
469
|
+
|
|
470
|
+
results = supervisor.results.results
|
|
471
|
+
errors = supervisor.results.errors
|
|
472
|
+
|
|
473
|
+
@stats[:customers_loaded] = results.size
|
|
474
|
+
# Note: LoadWorker returns errors for invalid data, not processing errors
|
|
475
|
+
|
|
476
|
+
puts "\n → Loaded #{results.size} customers\n\n"
|
|
477
|
+
end
|
|
478
|
+
|
|
479
|
+
def print_statistics
|
|
480
|
+
puts "=" * 60
|
|
481
|
+
puts "Pipeline Statistics"
|
|
482
|
+
puts "=" * 60
|
|
483
|
+
puts "Files processed: #{@stats[:files_processed]}"
|
|
484
|
+
puts "Customers extracted: #{@stats[:customers_extracted]}"
|
|
485
|
+
puts "Customers loaded: #{@stats[:customers_loaded]}"
|
|
486
|
+
puts "Validation errors: #{@stats[:validation_errors]}"
|
|
487
|
+
puts "Processing errors: #{@stats[:processing_errors]}"
|
|
488
|
+
puts "Success rate: #{success_rate}%"
|
|
489
|
+
puts "=" * 60
|
|
490
|
+
end
|
|
491
|
+
|
|
492
|
+
def success_rate
|
|
493
|
+
return 0 if @stats[:customers_extracted] == 0
|
|
494
|
+
((@stats[:customers_loaded].to_f / @stats[:customers_extracted]) * 100).round(2)
|
|
495
|
+
end
|
|
496
|
+
end
|
|
497
|
+
----
|
|
498
|
+
|
|
499
|
+
=== Step 5: Create Test Data
|
|
500
|
+
|
|
501
|
+
Create sample CSV files in the `data/` directory by running the following Ruby script from the project root:
|
|
502
|
+
|
|
503
|
+
[source,ruby]
|
|
504
|
+
----
|
|
505
|
+
# Create data/sample_customers_1.csv
|
|
506
|
+
require 'csv'
|
|
507
|
+
|
|
508
|
+
CSV.open('data/sample_customers_1.csv', 'w') do |csv|
|
|
509
|
+
csv << ['id', 'name', 'email', 'phone', 'signup_date', 'country']
|
|
510
|
+
csv << ['1', ' john doe ', 'john@example.com', '555-1234', '2024-01-15', 'us']
|
|
511
|
+
csv << ['2', 'JANE SMITH', 'jane@example.com', '555-5678', '2024-02-20', 'ca']
|
|
512
|
+
csv << ['3', 'bob wilson', 'invalid-email', '555-9999', '2024-03-10', 'uk']
|
|
513
|
+
csv << ['4', '', 'empty@example.com', '555-4321', '2024-04-05', 'au']
|
|
514
|
+
end
|
|
515
|
+
|
|
516
|
+
CSV.open('data/sample_customers_2.csv', 'w') do |csv|
|
|
517
|
+
csv << ['id', 'name', 'email', 'phone', 'signup_date', 'country']
|
|
518
|
+
csv << ['5', 'alice brown', 'alice@example.com', '555-1111', '2024-05-12', 'nz']
|
|
519
|
+
csv << ['6', 'charlie davis', 'charlie@example.com', '123', '2024-06-18', 'ie']
|
|
520
|
+
csv << ['7', 'eve martinez', 'eve@example.com', '555-2222', '2099-12-31', 'es']
|
|
521
|
+
end
|
|
522
|
+
----
|
|
523
|
+
|
|
524
|
+
=== Step 6: Run the Pipeline
|
|
525
|
+
|
|
526
|
+
Create a runner script `run_pipeline.rb` in the project root (next to the `lib/` and `data/` directories):
|
|
527
|
+
|
|
528
|
+
[source,ruby]
|
|
529
|
+
----
|
|
530
|
+
require_relative 'lib/pipeline'
|
|
531
|
+
|
|
532
|
+
# Find all CSV files in data directory
|
|
533
|
+
csv_files = Dir.glob('data/*.csv')
|
|
534
|
+
|
|
535
|
+
if csv_files.empty?
|
|
536
|
+
puts "No CSV files found in data/ directory"
|
|
537
|
+
exit 1
|
|
538
|
+
end
|
|
539
|
+
|
|
540
|
+
# Run the pipeline
|
|
541
|
+
pipeline = CustomerPipeline.new(csv_files)
|
|
542
|
+
pipeline.run
|
|
543
|
+
----
|
|
544
|
+
|
|
545
|
+
Run it:
|
|
546
|
+
|
|
547
|
+
[source,sh]
|
|
548
|
+
----
|
|
549
|
+
ruby run_pipeline.rb
|
|
550
|
+
----
|
|
551
|
+
|
|
552
|
+
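Expected output (the `✓ Loaded customer` lines may appear in a different order because the workers run in parallel):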
|
|
553
|
+
|
|
554
|
+
[source]
|
|
555
|
+
----
|
|
556
|
+
Starting customer data pipeline...
|
|
557
|
+
Processing 2 files
|
|
558
|
+
|
|
559
|
+
Stage 1: Extracting data from CSV files...
|
|
560
|
+
→ Extracted 7 customers from 2 files
|
|
561
|
+
|
|
562
|
+
Stage 2: Transforming customer data...
|
|
563
|
+
→ Transformed 7 customers
|
|
564
|
+
|
|
565
|
+
Stage 3: Validating customer data...
|
|
566
|
+
→ Validated 7 customers (4 invalid)
|
|
567
|
+
|
|
568
|
+
Stage 4: Loading valid customers...
|
|
569
|
+
✓ Loaded customer: 1 - John Doe
|
|
570
|
+
✓ Loaded customer: 2 - Jane Smith
|
|
571
|
+
✓ Loaded customer: 5 - Alice Brown
|
|
572
|
+
|
|
573
|
+
→ Loaded 3 customers
|
|
574
|
+
|
|
575
|
+
============================================================
|
|
576
|
+
Pipeline Statistics
|
|
577
|
+
============================================================
|
|
578
|
+
Files processed: 2
|
|
579
|
+
Customers extracted: 7
|
|
580
|
+
Customers loaded: 3
|
|
581
|
+
Validation errors: 4
|
|
582
|
+
Processing errors: 0
|
|
583
|
+
Success rate: 42.86%
|
|
584
|
+
============================================================
|
|
585
|
+
----
|
|
586
|
+
|
|
587
|
+
=== Step 7: Add Error Monitoring
|
|
588
|
+
|
|
589
|
+
Enhance the pipeline with error reporting:
|
|
590
|
+
|
|
591
|
+
[source,ruby]
|
|
592
|
+
----
|
|
593
|
+
require 'fractor'
|
|
594
|
+
require_relative 'lib/pipeline'
|
|
595
|
+
|
|
596
|
+
# Set up error reporter
|
|
597
|
+
reporter = Fractor::ErrorReporter.new
|
|
598
|
+
|
|
599
|
+
# Register error handlers
|
|
600
|
+
reporter.on_error do |work_result, job_name|
|
|
601
|
+
if work_result.critical?
|
|
602
|
+
puts "CRITICAL: #{job_name} - #{work_result.error.message}"
|
|
603
|
+
end
|
|
604
|
+
end
|
|
605
|
+
|
|
606
|
+
# Modify pipeline to use reporter (add to each stage)
|
|
607
|
+
# ... supervisor.results.errors.each { |e| reporter.record(e, job_name: "extract") }
|
|
608
|
+
|
|
609
|
+
# Run pipeline
|
|
610
|
+
csv_files = Dir.glob('data/*.csv')
|
|
611
|
+
pipeline = CustomerPipeline.new(csv_files)
|
|
612
|
+
pipeline.run
|
|
613
|
+
|
|
614
|
+
# Print error report
|
|
615
|
+
puts "\n"
|
|
616
|
+
puts reporter.formatted_report
|
|
617
|
+
----
|
|
618
|
+
|
|
619
|
+
=== Best Practices Demonstrated
|
|
620
|
+
|
|
621
|
+
==== 1. Separation of Concerns
|
|
622
|
+
|
|
623
|
+
Each worker has a single responsibility:
|
|
624
|
+
|
|
625
|
+
* **ExtractWorker**: Only reads CSV files
|
|
626
|
+
* **TransformWorker**: Only normalizes data
|
|
627
|
+
* **ValidateWorker**: Only validates data
|
|
628
|
+
* **LoadWorker**: Only inserts to database
|
|
629
|
+
|
|
630
|
+
==== 2. Error Handling
|
|
631
|
+
|
|
632
|
+
Comprehensive error handling at each stage:
|
|
633
|
+
|
|
634
|
+
[source,ruby]
|
|
635
|
+
----
|
|
636
|
+
rescue => e
|
|
637
|
+
Fractor::WorkResult.new(
|
|
638
|
+
error: e,
|
|
639
|
+
error_code: :stage_specific_code,
|
|
640
|
+
error_context: { relevant: 'context' },
|
|
641
|
+
work: work
|
|
642
|
+
)
|
|
643
|
+
end
|
|
644
|
+
----
|
|
645
|
+
|
|
646
|
+
==== 3. Progress Tracking
|
|
647
|
+
|
|
648
|
+
Statistics collection throughout the pipeline:
|
|
649
|
+
|
|
650
|
+
[source,ruby]
|
|
651
|
+
----
|
|
652
|
+
@stats[:customers_extracted] += result.result[:count]
|
|
653
|
+
----
|
|
654
|
+
|
|
655
|
+
==== 4. Parallel Processing
|
|
656
|
+
|
|
657
|
+
Different worker counts optimized for each stage:
|
|
658
|
+
|
|
659
|
+
* Extract: 4 workers (I/O bound)
|
|
660
|
+
* Transform: 8 workers (CPU bound)
|
|
661
|
+
* Validate: 8 workers (CPU bound)
|
|
662
|
+
* Load: 4 workers (I/O bound)
|
|
663
|
+
|
|
664
|
+
=== Enhancements
|
|
665
|
+
|
|
666
|
+
==== 1. Add Retry Logic
|
|
667
|
+
|
|
668
|
+
Use workflows for automatic retry:
|
|
669
|
+
|
|
670
|
+
[source,ruby]
|
|
671
|
+
----
|
|
672
|
+
class CustomerWorkflow < Fractor::Workflow
|
|
673
|
+
workflow "customer-pipeline" do
|
|
674
|
+
job "extract" do
|
|
675
|
+
runs_with ExtractWorker
|
|
676
|
+
retry_on_error max_attempts: 3, backoff: :exponential
|
|
677
|
+
end
|
|
678
|
+
|
|
679
|
+
# ... more jobs
|
|
680
|
+
end
|
|
681
|
+
end
|
|
682
|
+
----
|
|
683
|
+
|
|
684
|
+
==== 2. Add Performance Monitoring
|
|
685
|
+
|
|
686
|
+
[source,ruby]
|
|
687
|
+
----
|
|
688
|
+
monitor = Fractor::PerformanceMonitor.new(supervisor)
|
|
689
|
+
monitor.start
|
|
690
|
+
|
|
691
|
+
# ... run pipeline ...
|
|
692
|
+
|
|
693
|
+
puts monitor.report
|
|
694
|
+
monitor.stop
|
|
695
|
+
----
|
|
696
|
+
|
|
697
|
+
==== 3. Add Dead Letter Queue
|
|
698
|
+
|
|
699
|
+
Capture permanently failed records:
|
|
700
|
+
|
|
701
|
+
[source,ruby]
|
|
702
|
+
----
|
|
703
|
+
workflow "customer-pipeline" do
|
|
704
|
+
configure_dead_letter_queue max_size: 1000
|
|
705
|
+
|
|
706
|
+
# ... jobs ...
|
|
707
|
+
end
|
|
708
|
+
|
|
709
|
+
# After execution
|
|
710
|
+
dlq = workflow.dead_letter_queue
|
|
711
|
+
dlq.all.each do |entry|
|
|
712
|
+
puts "Failed: #{entry.error.message}"
|
|
713
|
+
end
|
|
714
|
+
----
|
|
715
|
+
|
|
716
|
+
=== Summary
|
|
717
|
+
|
|
718
|
+
You've built a production-ready data processing pipeline that:
|
|
719
|
+
|
|
720
|
+
✓ Processes data in parallel stages
|
|
721
|
+
✓ Handles errors gracefully with proper error codes
|
|
722
|
+
✓ Tracks detailed statistics
|
|
723
|
+
✓ Validates data quality
|
|
724
|
+
✓ Can be monitored and extended
|
|
725
|
+
|
|
726
|
+
**Key takeaways:**
|
|
727
|
+
|
|
728
|
+
1. Break complex processes into stages
|
|
729
|
+
2. Create focused, single-responsibility workers
|
|
730
|
+
3. Use proper error handling with context
|
|
731
|
+
4. Track statistics for monitoring
|
|
732
|
+
5. Optimize worker counts per stage
|
|
733
|
+
6. Consider workflows for more complex pipelines
|
|
734
|
+
|
|
735
|
+
=== Next Steps
|
|
736
|
+
|
|
737
|
+
* Try the link:long-running-services[Creating Long-Running Services] tutorial
|
|
738
|
+
* Learn about link:../features/workflows[Workflows] for more complex patterns
|
|
739
|
+
* Explore link:../reference/error-reporting[Error Reporting] for production monitoring
|
|
740
|
+
* Check out link:../reference/examples[Real-World Examples]
|