fractor 0.1.4 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop-https---raw-githubusercontent-com-riboseinc-oss-guides-main-ci-rubocop-yml +552 -0
- data/.rubocop.yml +14 -8
- data/.rubocop_todo.yml +284 -43
- data/README.adoc +111 -950
- data/docs/.lycheeignore +16 -0
- data/docs/Gemfile +24 -0
- data/docs/README.md +157 -0
- data/docs/_config.yml +151 -0
- data/docs/_features/error-handling.adoc +1192 -0
- data/docs/_features/index.adoc +80 -0
- data/docs/_features/monitoring.adoc +589 -0
- data/docs/_features/signal-handling.adoc +202 -0
- data/docs/_features/workflows.adoc +1235 -0
- data/docs/_guides/continuous-mode.adoc +736 -0
- data/docs/_guides/cookbook.adoc +1133 -0
- data/docs/_guides/index.adoc +55 -0
- data/docs/_guides/pipeline-mode.adoc +730 -0
- data/docs/_guides/troubleshooting.adoc +358 -0
- data/docs/_pages/architecture.adoc +1390 -0
- data/docs/_pages/core-concepts.adoc +1392 -0
- data/docs/_pages/design-principles.adoc +862 -0
- data/docs/_pages/getting-started.adoc +290 -0
- data/docs/_pages/installation.adoc +143 -0
- data/docs/_reference/api.adoc +1080 -0
- data/docs/_reference/error-reporting.adoc +670 -0
- data/docs/_reference/examples.adoc +181 -0
- data/docs/_reference/index.adoc +96 -0
- data/docs/_reference/troubleshooting.adoc +862 -0
- data/docs/_tutorials/complex-workflows.adoc +1022 -0
- data/docs/_tutorials/data-processing-pipeline.adoc +740 -0
- data/docs/_tutorials/first-application.adoc +384 -0
- data/docs/_tutorials/index.adoc +48 -0
- data/docs/_tutorials/long-running-services.adoc +931 -0
- data/docs/assets/images/favicon-16.png +0 -0
- data/docs/assets/images/favicon-32.png +0 -0
- data/docs/assets/images/favicon-48.png +0 -0
- data/docs/assets/images/favicon.ico +0 -0
- data/docs/assets/images/favicon.png +0 -0
- data/docs/assets/images/favicon.svg +45 -0
- data/docs/assets/images/fractor-icon.svg +49 -0
- data/docs/assets/images/fractor-logo.svg +61 -0
- data/docs/index.adoc +131 -0
- data/docs/lychee.toml +39 -0
- data/examples/api_aggregator/README.adoc +627 -0
- data/examples/api_aggregator/api_aggregator.rb +376 -0
- data/examples/auto_detection/README.adoc +407 -29
- data/examples/auto_detection/auto_detection.rb +9 -9
- data/examples/continuous_chat_common/message_protocol.rb +53 -0
- data/examples/continuous_chat_fractor/README.adoc +217 -0
- data/examples/continuous_chat_fractor/chat_client.rb +303 -0
- data/examples/continuous_chat_fractor/chat_common.rb +83 -0
- data/examples/continuous_chat_fractor/chat_server.rb +167 -0
- data/examples/continuous_chat_fractor/simulate.rb +345 -0
- data/examples/continuous_chat_server/README.adoc +135 -0
- data/examples/continuous_chat_server/chat_client.rb +303 -0
- data/examples/continuous_chat_server/chat_server.rb +359 -0
- data/examples/continuous_chat_server/simulate.rb +343 -0
- data/examples/error_reporting.rb +207 -0
- data/examples/file_processor/README.adoc +170 -0
- data/examples/file_processor/file_processor.rb +615 -0
- data/examples/file_processor/sample_files/invalid.csv +1 -0
- data/examples/file_processor/sample_files/orders.xml +24 -0
- data/examples/file_processor/sample_files/products.json +23 -0
- data/examples/file_processor/sample_files/users.csv +6 -0
- data/examples/hierarchical_hasher/README.adoc +629 -41
- data/examples/hierarchical_hasher/hierarchical_hasher.rb +12 -8
- data/examples/image_processor/README.adoc +610 -0
- data/examples/image_processor/image_processor.rb +349 -0
- data/examples/image_processor/processed_images/sample_10_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_1_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_2_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_3_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_4_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_5_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_6_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_7_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_8_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_9_processed.jpg.json +12 -0
- data/examples/image_processor/test_images/sample_1.png +1 -0
- data/examples/image_processor/test_images/sample_10.png +1 -0
- data/examples/image_processor/test_images/sample_2.png +1 -0
- data/examples/image_processor/test_images/sample_3.png +1 -0
- data/examples/image_processor/test_images/sample_4.png +1 -0
- data/examples/image_processor/test_images/sample_5.png +1 -0
- data/examples/image_processor/test_images/sample_6.png +1 -0
- data/examples/image_processor/test_images/sample_7.png +1 -0
- data/examples/image_processor/test_images/sample_8.png +1 -0
- data/examples/image_processor/test_images/sample_9.png +1 -0
- data/examples/log_analyzer/README.adoc +662 -0
- data/examples/log_analyzer/log_analyzer.rb +579 -0
- data/examples/log_analyzer/sample_logs/apache.log +20 -0
- data/examples/log_analyzer/sample_logs/json.log +15 -0
- data/examples/log_analyzer/sample_logs/nginx.log +15 -0
- data/examples/log_analyzer/sample_logs/rails.log +29 -0
- data/examples/multi_work_type/README.adoc +576 -26
- data/examples/multi_work_type/multi_work_type.rb +30 -29
- data/examples/performance_monitoring.rb +120 -0
- data/examples/pipeline_processing/README.adoc +740 -26
- data/examples/pipeline_processing/pipeline_processing.rb +16 -16
- data/examples/priority_work_example.rb +155 -0
- data/examples/producer_subscriber/README.adoc +889 -46
- data/examples/producer_subscriber/producer_subscriber.rb +20 -16
- data/examples/scatter_gather/README.adoc +829 -27
- data/examples/scatter_gather/scatter_gather.rb +29 -28
- data/examples/simple/README.adoc +347 -0
- data/examples/simple/sample.rb +5 -5
- data/examples/specialized_workers/README.adoc +622 -26
- data/examples/specialized_workers/specialized_workers.rb +88 -45
- data/examples/stream_processor/README.adoc +206 -0
- data/examples/stream_processor/stream_processor.rb +284 -0
- data/examples/web_scraper/README.adoc +625 -0
- data/examples/web_scraper/web_scraper.rb +285 -0
- data/examples/workflow/README.adoc +406 -0
- data/examples/workflow/circuit_breaker/README.adoc +360 -0
- data/examples/workflow/circuit_breaker/circuit_breaker_workflow.rb +225 -0
- data/examples/workflow/conditional/README.adoc +483 -0
- data/examples/workflow/conditional/conditional_workflow.rb +215 -0
- data/examples/workflow/dead_letter_queue/README.adoc +374 -0
- data/examples/workflow/dead_letter_queue/dead_letter_queue_workflow.rb +217 -0
- data/examples/workflow/fan_out/README.adoc +381 -0
- data/examples/workflow/fan_out/fan_out_workflow.rb +202 -0
- data/examples/workflow/retry/README.adoc +248 -0
- data/examples/workflow/retry/retry_workflow.rb +195 -0
- data/examples/workflow/simple_linear/README.adoc +267 -0
- data/examples/workflow/simple_linear/simple_linear_workflow.rb +175 -0
- data/examples/workflow/simplified/README.adoc +329 -0
- data/examples/workflow/simplified/simplified_workflow.rb +222 -0
- data/exe/fractor +10 -0
- data/lib/fractor/cli.rb +288 -0
- data/lib/fractor/configuration.rb +307 -0
- data/lib/fractor/continuous_server.rb +183 -0
- data/lib/fractor/error_formatter.rb +72 -0
- data/lib/fractor/error_report_generator.rb +152 -0
- data/lib/fractor/error_reporter.rb +244 -0
- data/lib/fractor/error_statistics.rb +147 -0
- data/lib/fractor/execution_tracer.rb +162 -0
- data/lib/fractor/logger.rb +230 -0
- data/lib/fractor/main_loop_handler.rb +406 -0
- data/lib/fractor/main_loop_handler3.rb +135 -0
- data/lib/fractor/main_loop_handler4.rb +299 -0
- data/lib/fractor/performance_metrics_collector.rb +181 -0
- data/lib/fractor/performance_monitor.rb +215 -0
- data/lib/fractor/performance_report_generator.rb +202 -0
- data/lib/fractor/priority_work.rb +93 -0
- data/lib/fractor/priority_work_queue.rb +189 -0
- data/lib/fractor/result_aggregator.rb +33 -1
- data/lib/fractor/shutdown_handler.rb +168 -0
- data/lib/fractor/signal_handler.rb +80 -0
- data/lib/fractor/supervisor.rb +430 -144
- data/lib/fractor/supervisor_logger.rb +88 -0
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/work.rb +12 -0
- data/lib/fractor/work_distribution_manager.rb +151 -0
- data/lib/fractor/work_queue.rb +88 -0
- data/lib/fractor/work_result.rb +181 -9
- data/lib/fractor/worker.rb +75 -1
- data/lib/fractor/workflow/builder.rb +210 -0
- data/lib/fractor/workflow/chain_builder.rb +169 -0
- data/lib/fractor/workflow/circuit_breaker.rb +183 -0
- data/lib/fractor/workflow/circuit_breaker_orchestrator.rb +208 -0
- data/lib/fractor/workflow/circuit_breaker_registry.rb +112 -0
- data/lib/fractor/workflow/dead_letter_queue.rb +334 -0
- data/lib/fractor/workflow/execution_hooks.rb +39 -0
- data/lib/fractor/workflow/execution_strategy.rb +225 -0
- data/lib/fractor/workflow/execution_trace.rb +134 -0
- data/lib/fractor/workflow/helpers.rb +191 -0
- data/lib/fractor/workflow/job.rb +290 -0
- data/lib/fractor/workflow/job_dependency_validator.rb +120 -0
- data/lib/fractor/workflow/logger.rb +110 -0
- data/lib/fractor/workflow/pre_execution_context.rb +193 -0
- data/lib/fractor/workflow/retry_config.rb +156 -0
- data/lib/fractor/workflow/retry_orchestrator.rb +184 -0
- data/lib/fractor/workflow/retry_strategy.rb +93 -0
- data/lib/fractor/workflow/structured_logger.rb +30 -0
- data/lib/fractor/workflow/type_compatibility_validator.rb +222 -0
- data/lib/fractor/workflow/visualizer.rb +211 -0
- data/lib/fractor/workflow/workflow_context.rb +132 -0
- data/lib/fractor/workflow/workflow_executor.rb +669 -0
- data/lib/fractor/workflow/workflow_result.rb +55 -0
- data/lib/fractor/workflow/workflow_validator.rb +295 -0
- data/lib/fractor/workflow.rb +333 -0
- data/lib/fractor/wrapped_ractor.rb +66 -91
- data/lib/fractor/wrapped_ractor3.rb +161 -0
- data/lib/fractor/wrapped_ractor4.rb +242 -0
- data/lib/fractor.rb +93 -3
- metadata +192 -6
- data/tests/sample.rb.bak +0 -309
- data/tests/sample_working.rb.bak +0 -209
data/examples/web_scraper/README.adoc
@@ -0,0 +1,625 @@

= Web Scraper with Rate Limiting
:toc:
:toc-placement!:

A production-ready web scraper that demonstrates parallel URL fetching with rate limiting, retry logic, and robust error handling using Fractor.

toc::[]

== Problem Description

Web scraping often involves fetching data from multiple URLs, which can be slow when done sequentially. However, parallel scraping must be carefully managed to:

* Respect rate limits and avoid overwhelming target servers
* Handle network errors and timeouts gracefully
* Retry failed requests with exponential backoff
* Track progress across multiple workers
* Store scraped data in an organized manner

This example demonstrates how to build a production-ready web scraper that handles all these concerns using Fractor's parallel processing capabilities.

== When to Use This Pattern

Use this web scraping pattern when you need to:

* **Scrape multiple URLs efficiently**: Process hundreds or thousands of URLs in parallel while respecting rate limits
* **Handle unreliable networks**: Implement retry logic with exponential backoff for network failures
* **Respect server policies**: Enforce rate limiting to avoid overwhelming target servers or violating terms of service
* **Track progress**: Monitor scraping progress across multiple parallel workers
* **Handle errors gracefully**: Continue processing even when some URLs fail
* **Store results reliably**: Save scraped data with proper error handling

This pattern is ideal for:

* Data aggregation from multiple sources
* Content monitoring and change detection
* Price comparison and monitoring
* SEO and content analysis
* Research data collection
* Web archiving projects

== Architecture

[source]
----
┌──────────────────────────────────────────────────┐
│                 Main Application                 │
│  • Creates ScrapeWork for each URL               │
│  • Initializes ProgressTracker                   │
│  • Submits work to Supervisor                    │
└───────────────────┬──────────────────────────────┘
                    │
                    │ Submits ScrapeWork
                    ▼
┌──────────────────────────────────────────────────┐
│                    Supervisor                    │
│  • Manages 3 WebScraperWorker instances          │
│  • Distributes work across workers               │
│  • Collects results                              │
└───────────────────┬──────────────────────────────┘
                    │
        ┌───────────┼───────────┐
        │           │           │
        ▼           ▼           ▼
   ┌─────────┐ ┌─────────┐ ┌─────────┐
   │ Worker 1│ │ Worker 2│ │ Worker 3│
   │         │ │         │ │         │
   │  Rate   │ │  Rate   │ │  Rate   │
   │ Limiter │ │ Limiter │ │ Limiter │
   │         │ │         │ │         │
   │  Retry  │ │  Retry  │ │  Retry  │
   │  Logic  │ │  Logic  │ │  Logic  │
   └────┬────┘ └────┬────┘ └────┬────┘
        │           │           │
        │ Fetch     │ Fetch     │ Fetch
        ▼           ▼           ▼
   ┌─────────────────────────────────┐
   │        Target Websites          │
   │  • httpbin.org/html             │
   │  • httpbin.org/json             │
   │  • httpbin.org/xml              │
   │  • ... more URLs ...            │
   └─────────────────────────────────┘
        │           │           │
        │ Save      │ Save      │ Save
        ▼           ▼           ▼
   ┌─────────────────────────────────┐
   │        Output Directory         │
   │  • url_timestamp.json           │
   │  • url_timestamp.html           │
   │  • Metadata and content         │
   └─────────────────────────────────┘
----

=== Components

**ScrapeWork**::
Work class that encapsulates a URL to scrape and the current retry attempt number.

**WebScraperWorker**::
Worker class that processes scrape work with:
+
* **Rate limiting**: Enforces minimum delay between requests per domain
* **HTTP fetching**: Makes HTTP requests with proper headers and timeouts
* **Error handling**: Catches network errors and HTTP errors
* **Retry logic**: Implements exponential backoff for failed requests
* **Data parsing**: Extracts and structures response data
* **File saving**: Stores both metadata (JSON) and content (HTML)

**ProgressTracker**::
Monitors and reports scraping progress:
+
* Tracks completed, successful, and failed URLs
* Calculates completion percentage
* Measures scraping rate (URLs/second)
* Provides real-time progress updates
* Generates final summary statistics
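
The full class definitions live in `web_scraper.rb`. For orientation only, the data carried by a `ScrapeWork` amounts to a URL plus an attempt counter; a bare-bones stand-in (ignoring the `Fractor::Work` base class that the real example subclasses) could be as small as:

[source,ruby]
----
# Illustrative stand-in only -- the real ScrapeWork in web_scraper.rb
# subclasses Fractor::Work; this just shows the data it carries.
ScrapeWork = Struct.new(:url, :attempt) do
  def to_s
    "ScrapeWork(#{url}, attempt #{attempt})"
  end
end

work = ScrapeWork.new("https://httpbin.org/html", 1)
puts work # => ScrapeWork(https://httpbin.org/html, attempt 1)
----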

== Key Features

=== Rate Limiting

Each worker maintains per-domain rate limits to avoid overwhelming servers:

[source,ruby]
----
RATE_LIMIT_DELAY = 0.5 # 500ms between requests

def enforce_rate_limit(url)
  domain = extract_domain(url)
  last_time = @last_request_time[domain]

  if last_time
    elapsed = Time.now - last_time
    if elapsed < RATE_LIMIT_DELAY
      sleep_time = RATE_LIMIT_DELAY - elapsed
      sleep(sleep_time)
    end
  end

  @last_request_time[domain] = Time.now
end
----
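
To see the limiter's effect in isolation, the stand-alone sketch below wires the same logic to a simple `URI`-based `extract_domain` (a stand-in; the example defines its own helper) and times two back-to-back calls to the same host:

[source,ruby]
----
require "uri"

RATE_LIMIT_DELAY = 0.5
@last_request_time = {}

def extract_domain(url)
  URI.parse(url).host
end

def enforce_rate_limit(url)
  domain = extract_domain(url)
  last_time = @last_request_time[domain]

  if last_time
    elapsed = Time.now - last_time
    sleep(RATE_LIMIT_DELAY - elapsed) if elapsed < RATE_LIMIT_DELAY
  end

  @last_request_time[domain] = Time.now
end

start = Time.now
2.times { enforce_rate_limit("https://httpbin.org/html") }
puts format("Two calls took %.2fs", Time.now - start) # ~0.50s: the second call waited
----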

=== Exponential Backoff

Failed requests are retried with increasing delays:

[source,ruby]
----
MAX_RETRIES = 3
RETRY_DELAYS = [1, 2, 4].freeze # Seconds

# First retry: 1s delay
# Second retry: 2s delay
# Third retry: 4s delay
----
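
How those delays are applied is up to the worker. The real example re-submits failed work through Fractor rather than looping inline, but as a hedged sketch of the mechanics (`fetch_url` is assumed to raise on failure):

[source,ruby]
----
def fetch_with_retries(url)
  attempt = 1
  begin
    fetch_url(url)
  rescue StandardError => e
    raise "Max retries exceeded: #{e.message}" if attempt > MAX_RETRIES

    sleep(RETRY_DELAYS[attempt - 1]) # 1s, then 2s, then 4s
    attempt += 1
    retry
  end
end
----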

=== Comprehensive Error Handling

Handles multiple error scenarios:

* Network timeouts and connection errors
* HTTP errors (403, 404, 500, etc.)
* Invalid URLs
* Response parsing errors
* File system errors
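
A condensed sketch of how such failures can be caught and turned into result data instead of crashing the run is shown below. The exception classes are standard Ruby/`Net::HTTP` ones; `fetch_url` and `parse_response` stand in for the example's own helpers:

[source,ruby]
----
require "net/http"
require "json"

def scrape(url)
  response = fetch_url(url)
  { url: url, data: parse_response(response, url) }
rescue Net::OpenTimeout, Net::ReadTimeout => e
  { url: url, error: "Timeout: #{e.message}" }
rescue SocketError, Errno::ECONNREFUSED => e
  { url: url, error: "Connection failed: #{e.message}" }
rescue URI::InvalidURIError => e
  { url: url, error: "Invalid URL: #{e.message}" }
rescue JSON::ParserError => e
  { url: url, error: "Parse error: #{e.message}" }
rescue SystemCallError => e
  { url: url, error: "File system error: #{e.message}" }
end
----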

=== Progress Tracking

Real-time monitoring of scraping progress:

[source]
----
============================================================
Progress: 7/10 (70.0%)
Successful: 5 | Failed: 1
Elapsed: 12.3s | Rate: 0.57 URLs/s
============================================================
----
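
A tracker with this behaviour needs little more than a handful of counters and a start time. The following stand-alone sketch mirrors the idea; it is not the exact class from `web_scraper.rb`:

[source,ruby]
----
class ProgressTracker
  def initialize(total)
    @total = total
    @successful = 0
    @failed = 0
    @started_at = Time.now
  end

  def record(success)
    success ? @successful += 1 : @failed += 1
    report
  end

  def report
    completed = @successful + @failed
    elapsed = Time.now - @started_at
    rate = elapsed.positive? ? completed / elapsed : 0.0
    puts "Progress: #{completed}/#{@total} (#{(100.0 * completed / @total).round(1)}%)"
    puts "Successful: #{@successful} | Failed: #{@failed}"
    puts format("Elapsed: %.1fs | Rate: %.2f URLs/s", elapsed, rate)
  end
end
----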

== Setup Instructions

=== Prerequisites

* Ruby 3.0 or higher (required for Ractor support)
* Fractor gem installed
* Internet connection for scraping
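
If you are unsure whether your Ruby build is new enough, a quick check from the shell (plain Ruby, nothing Fractor-specific):

[source,shell]
----
ruby -v
ruby -e 'puts defined?(Ractor) ? "Ractor is available" : "Ractor is NOT available, upgrade to Ruby 3.0+"'
----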

=== Installation

. Install dependencies:
+
[source,shell]
----
bundle install
----

. Create the example directory:
+
[source,shell]
----
mkdir -p examples/web_scraper
----

== Usage

=== Basic Usage

Run the example with default URLs:

[source,shell]
----
cd examples/web_scraper
ruby web_scraper.rb
----

=== Custom URLs

Modify the `urls` array in the script to scrape your own URLs:

[source,ruby]
----
urls = [
  "https://example.com/page1",
  "https://example.com/page2",
  "https://another-site.com/data"
]
----

=== Configuration Options

Adjust scraping parameters by modifying the worker class:

[source,ruby]
----
# Change rate limit (seconds between requests)
RATE_LIMIT_DELAY = 1.0

# Adjust retry attempts
MAX_RETRIES = 5

# Modify retry delays (exponential backoff)
RETRY_DELAYS = [1, 3, 9, 27].freeze

# Change number of workers
supervisor = Fractor::Supervisor.new(
  worker_class: WebScraperWorker,
  worker_count: 5, # Increase parallelism
  worker_args: [{ output_dir: output_dir }]
)
----

== Expected Output

=== Console Output

[source]
----
Starting Web Scraper Example
URLs to scrape: 10
Workers: 3
Rate limit: 500ms between requests per domain
Max retries: 3 with exponential backoff

[Worker 1] Scraping https://httpbin.org/html (attempt 1/3)
[Worker 2] Scraping https://httpbin.org/json (attempt 1/3)
[Worker 3] Scraping https://httpbin.org/xml (attempt 1/3)
[Worker 1] ✓ Successfully scraped https://httpbin.org/html
[Worker 1] Saved to scraped_data/httpbin_org_html_20231025_130300

============================================================
Progress: 3/10 (30.0%)
Successful: 3 | Failed: 0
Elapsed: 2.1s | Rate: 1.43 URLs/s
============================================================

[Worker 2] ✗ Error scraping https://httpbin.org/deny: HTTP Error: 403 Forbidden
[Worker 2] Will retry in 1s...
[Worker 2] Scraping https://httpbin.org/deny (attempt 2/3)
[Worker 2] ✗ Error scraping https://httpbin.org/deny: HTTP Error: 403 Forbidden
[Worker 2] Will retry in 2s...

============================================================
SCRAPING COMPLETE
============================================================
Total URLs: 10
Successful: 8
Failed: 2
Total time: 15.23s
Average rate: 0.66 URLs/s
============================================================

Failed URLs:
- https://httpbin.org/deny: Max retries exceeded: HTTP Error: 403 Forbidden
- https://httpbin.org/status/500: Max retries exceeded: HTTP Error: 500 Internal Server Error

Data saved to: scraped_data/
----

=== File Output

Each successfully scraped URL generates two files:

[source]
----
scraped_data/
├── httpbin_org_html_20231025_130300.json
├── httpbin_org_html_20231025_130300.html
├── httpbin_org_json_20231025_130301.json
├── httpbin_org_json_20231025_130301.html
└── ...
----

**JSON metadata file** contains:

[source,json]
----
{
  "url": "https://httpbin.org/html",
  "content": "<!DOCTYPE html>...",
  "content_type": "text/html; charset=utf-8",
  "size": 3741,
  "timestamp": "2023-10-25T13:03:00+08:00",
  "headers": {
    "content-type": ["text/html; charset=utf-8"],
    "content-length": ["3741"],
    "server": ["gunicorn/19.9.0"]
  }
}
----

**HTML content file** contains the raw response body.

== Performance Benchmarks

=== Serial vs Parallel Comparison

Scraping 100 URLs with 2-second response time each:

|===
|Approach |Workers |Time |Rate

|Serial
|1
|~200s
|0.5 URLs/s

|Parallel (3 workers)
|3
|~67s
|1.5 URLs/s

|Parallel (5 workers)
|5
|~40s
|2.5 URLs/s

|Parallel (10 workers)
|10
|~20s
|5.0 URLs/s
|===

NOTE: Actual performance depends on network speed, server response times, and rate limiting constraints.

=== Impact of Rate Limiting

With 500ms rate limit per domain:

|===
|Scenario |Time Impact

|Single domain (all URLs same host)
|~50s for 100 URLs (limited by rate limit)

|Multiple domains (different hosts)
|~20s for 100 URLs (parallel across domains)
|===

=== Memory Usage

* Base memory: ~50 MB
* Per worker: ~10 MB
* Per cached response: ~100 KB (varies by content size)
* Recommended: 100-500 MB for typical scraping tasks

== Best Practices

=== Respect Robots.txt

Always check and respect the target site's `robots.txt`:

[source,ruby]
----
def check_robots_txt(url)
  uri = URI.parse(url)
  robots_url = "#{uri.scheme}://#{uri.host}/robots.txt"
  # Parse robots.txt and check if scraping is allowed
end
----
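
The parsing step is left as a comment above. A deliberately naive way to fill it in with only the standard library is sketched below; it only looks for a blanket `Disallow: /` and ignores per-agent groups, wildcards, and `Allow:` rules, so treat it as a placeholder for a proper robots.txt parser:

[source,ruby]
----
require "net/http"
require "uri"

# Naive robots.txt check: fetches the file and looks for "Disallow: /".
def roughly_allowed?(url)
  uri = URI.parse(url)
  robots = Net::HTTP.get(URI.parse("#{uri.scheme}://#{uri.host}/robots.txt"))
  !robots.lines.map(&:strip).include?("Disallow: /")
rescue StandardError
  true # robots.txt not reachable; proceed cautiously
end
----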

=== Use Appropriate User-Agent

Identify your scraper with a descriptive User-Agent:

[source,ruby]
----
request["User-Agent"] = "YourCompany Bot/1.0 (contact@example.com)"
----

=== Implement Politeness Delays

Adjust rate limits based on server capacity:

[source,ruby]
----
# Conservative: 1-2 seconds between requests
RATE_LIMIT_DELAY = 1.5

# Aggressive (only for your own servers): 100-200ms
RATE_LIMIT_DELAY = 0.15
----

=== Handle Dynamic Content

For JavaScript-heavy sites, consider using Selenium or Puppeteer instead of basic HTTP requests.

=== Monitor and Log

Implement comprehensive logging:

[source,ruby]
----
require "time" # for Time#iso8601

def log_request(url, status, duration)
  File.open("scraper.log", "a") do |f|
    f.puts "#{Time.now.iso8601} | #{url} | #{status} | #{duration}s"
  end
end
----

=== Cache DNS Lookups

For large-scale scraping, cache DNS lookups to improve performance:

[source,ruby]
----
require "resolv"

@dns_cache ||= {}
@dns_cache[host] ||= Resolv.getaddress(host)
----

=== Use Connection Pooling

For repeated requests to the same domain:

[source,ruby]
----
@http_connections ||= {}
@http_connections[domain] ||= create_http_connection(domain)
----
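
`create_http_connection` is not defined in the snippet above; a minimal version built on `Net::HTTP` might look like the following sketch (the default port, TLS setting, and timeouts are assumptions for illustration):

[source,ruby]
----
require "net/http"

# Opens one persistent connection per domain so it can be reused.
def create_http_connection(domain, port = 443, use_ssl: true)
  http = Net::HTTP.new(domain, port)
  http.use_ssl = use_ssl
  http.open_timeout = 10
  http.read_timeout = 30
  http.start
  http
end
----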

=== Handle Redirects

Follow redirects but limit the number to prevent infinite loops:

[source,ruby]
----
MAX_REDIRECTS = 5

def fetch_url(url, redirect_count = 0)
  # ... fetch logic ...
  if response.is_a?(Net::HTTPRedirection) && redirect_count < MAX_REDIRECTS
    fetch_url(response['location'], redirect_count + 1)
  end
end
----

== Troubleshooting

=== Connection Timeouts

**Problem**: Worker hangs on slow or unresponsive URLs.

**Solution**: Adjust timeout values:

[source,ruby]
----
http.open_timeout = 10 # Connection timeout
http.read_timeout = 30 # Response read timeout
----

=== Rate Limit Too Aggressive

**Problem**: Still getting blocked despite rate limiting.

**Solution**: Increase the delay:

[source,ruby]
----
RATE_LIMIT_DELAY = 2.0 # 2 seconds between requests
----

=== Memory Issues with Large Responses

**Problem**: Large HTML pages consume too much memory.

**Solution**: Stream large responses to disk:

[source,ruby]
----
def fetch_large_file(url, filepath)
  uri = URI.parse(url)
  File.open(filepath, 'wb') do |file|
    Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
      http.request_get(uri.request_uri) do |response|
        response.read_body { |chunk| file.write(chunk) }
      end
    end
  end
end
----

=== SSL Certificate Errors

**Problem**: SSL verification failures.

**Solution**: Configure SSL verification (but use with caution):

[source,ruby]
----
http.verify_mode = OpenSSL::SSL::VERIFY_PEER
http.ca_file = '/path/to/ca-bundle.crt'
----

=== Worker Starvation

**Problem**: Some workers idle while others are busy.

**Solution**: Ensure work is evenly distributed. Fractor handles this automatically, but you can verify with logging:

[source,ruby]
----
puts "[Worker #{worker_id}] Processed #{@request_count} requests"
----

=== Failed Retries

**Problem**: URLs fail even after maximum retries.

**Solution**:

* Increase `MAX_RETRIES`
* Adjust `RETRY_DELAYS` for longer backoff
* Implement exponential backoff with jitter:

[source,ruby]
----
delay = RETRY_DELAYS[attempt - 1] * (0.5 + rand * 0.5)
----

== Advanced Usage

=== Custom Data Extraction

Add custom parsing logic for specific content types:

[source,ruby]
----
require "json"
require "nokogiri"

def parse_response(response, url)
  content = response.body

  case response["content-type"]
  when /json/
    JSON.parse(content)
  when /xml/
    Nokogiri::XML(content)
  when /html/
    Nokogiri::HTML(content)
  else
    content
  end
end
----

=== Proxy Support

Route requests through a proxy:

[source,ruby]
----
def fetch_url(url)
  uri = URI.parse(url)
  proxy = URI.parse(ENV['HTTP_PROXY'])

  http = Net::HTTP.new(
    uri.host, uri.port,
    proxy.host, proxy.port
  )
  # ... rest of fetch logic
end
----

=== Concurrent Domain Scraping

Allow multiple concurrent requests per domain (use carefully):

[source,ruby]
----
MAX_CONCURRENT_PER_DOMAIN = 2

def enforce_rate_limit(url)
  domain = extract_domain(url)
  @domain_semaphores ||= {}
  @domain_semaphores[domain] ||= Mutex.new

  @domain_semaphores[domain].synchronize do
    # Rate limiting logic
  end
end
----
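
Note that a per-domain `Mutex`, as in the snippet above, still admits only one request at a time, so `MAX_CONCURRENT_PER_DOMAIN` is not enforced by it; a counting limiter is needed for that. The sketch below uses plain Ruby threads and is an assumption about how requests might be issued inside a single worker process; `fetch_url` again stands in for the example's fetch helper:

[source,ruby]
----
# Counting limiter: at most `limit` callers run the block at once.
class DomainLimiter
  def initialize(limit)
    @limit = limit
    @count = 0
    @mutex = Mutex.new
    @cond = ConditionVariable.new
  end

  def acquire
    @mutex.synchronize do
      @cond.wait(@mutex) while @count >= @limit
      @count += 1
    end
    yield
  ensure
    @mutex.synchronize do
      @count -= 1
      @cond.signal
    end
  end
end

limiters = Hash.new { |h, domain| h[domain] = DomainLimiter.new(MAX_CONCURRENT_PER_DOMAIN) }
limiters["httpbin.org"].acquire { fetch_url("https://httpbin.org/html") }
----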

== Related Examples

* link:../api_aggregator/README.adoc[API Data Aggregator] - Similar pattern for API endpoints
* link:../file_processor/README.adoc[Batch File Processor] - Retry logic and error handling
* link:../log_analyzer/README.adoc[Log File Analyzer] - Parallel data processing

== License

This example is part of the Fractor gem and is available under the same license.