fractor 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +227 -102
- data/README.adoc +113 -1940
- data/docs/.lycheeignore +16 -0
- data/docs/Gemfile +24 -0
- data/docs/README.md +157 -0
- data/docs/_config.yml +151 -0
- data/docs/_features/error-handling.adoc +1192 -0
- data/docs/_features/index.adoc +80 -0
- data/docs/_features/monitoring.adoc +589 -0
- data/docs/_features/signal-handling.adoc +202 -0
- data/docs/_features/workflows.adoc +1235 -0
- data/docs/_guides/continuous-mode.adoc +736 -0
- data/docs/_guides/cookbook.adoc +1133 -0
- data/docs/_guides/index.adoc +55 -0
- data/docs/_guides/pipeline-mode.adoc +730 -0
- data/docs/_guides/troubleshooting.adoc +358 -0
- data/docs/_pages/architecture.adoc +1390 -0
- data/docs/_pages/core-concepts.adoc +1392 -0
- data/docs/_pages/design-principles.adoc +862 -0
- data/docs/_pages/getting-started.adoc +290 -0
- data/docs/_pages/installation.adoc +143 -0
- data/docs/_reference/api.adoc +1080 -0
- data/docs/_reference/error-reporting.adoc +670 -0
- data/docs/_reference/examples.adoc +181 -0
- data/docs/_reference/index.adoc +96 -0
- data/docs/_reference/troubleshooting.adoc +862 -0
- data/docs/_tutorials/complex-workflows.adoc +1022 -0
- data/docs/_tutorials/data-processing-pipeline.adoc +740 -0
- data/docs/_tutorials/first-application.adoc +384 -0
- data/docs/_tutorials/index.adoc +48 -0
- data/docs/_tutorials/long-running-services.adoc +931 -0
- data/docs/assets/images/favicon-16.png +0 -0
- data/docs/assets/images/favicon-32.png +0 -0
- data/docs/assets/images/favicon-48.png +0 -0
- data/docs/assets/images/favicon.ico +0 -0
- data/docs/assets/images/favicon.png +0 -0
- data/docs/assets/images/favicon.svg +45 -0
- data/docs/assets/images/fractor-icon.svg +49 -0
- data/docs/assets/images/fractor-logo.svg +61 -0
- data/docs/index.adoc +131 -0
- data/docs/lychee.toml +39 -0
- data/examples/api_aggregator/README.adoc +627 -0
- data/examples/api_aggregator/api_aggregator.rb +376 -0
- data/examples/auto_detection/README.adoc +407 -29
- data/examples/continuous_chat_common/message_protocol.rb +1 -1
- data/examples/error_reporting.rb +207 -0
- data/examples/file_processor/README.adoc +170 -0
- data/examples/file_processor/file_processor.rb +615 -0
- data/examples/file_processor/sample_files/invalid.csv +1 -0
- data/examples/file_processor/sample_files/orders.xml +24 -0
- data/examples/file_processor/sample_files/products.json +23 -0
- data/examples/file_processor/sample_files/users.csv +6 -0
- data/examples/hierarchical_hasher/README.adoc +629 -41
- data/examples/image_processor/README.adoc +610 -0
- data/examples/image_processor/image_processor.rb +349 -0
- data/examples/image_processor/processed_images/sample_10_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_1_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_2_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_3_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_4_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_5_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_6_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_7_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_8_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_9_processed.jpg.json +12 -0
- data/examples/image_processor/test_images/sample_1.png +1 -0
- data/examples/image_processor/test_images/sample_10.png +1 -0
- data/examples/image_processor/test_images/sample_2.png +1 -0
- data/examples/image_processor/test_images/sample_3.png +1 -0
- data/examples/image_processor/test_images/sample_4.png +1 -0
- data/examples/image_processor/test_images/sample_5.png +1 -0
- data/examples/image_processor/test_images/sample_6.png +1 -0
- data/examples/image_processor/test_images/sample_7.png +1 -0
- data/examples/image_processor/test_images/sample_8.png +1 -0
- data/examples/image_processor/test_images/sample_9.png +1 -0
- data/examples/log_analyzer/README.adoc +662 -0
- data/examples/log_analyzer/log_analyzer.rb +579 -0
- data/examples/log_analyzer/sample_logs/apache.log +20 -0
- data/examples/log_analyzer/sample_logs/json.log +15 -0
- data/examples/log_analyzer/sample_logs/nginx.log +15 -0
- data/examples/log_analyzer/sample_logs/rails.log +29 -0
- data/examples/multi_work_type/README.adoc +576 -26
- data/examples/performance_monitoring.rb +120 -0
- data/examples/pipeline_processing/README.adoc +740 -26
- data/examples/pipeline_processing/pipeline_processing.rb +2 -2
- data/examples/priority_work_example.rb +155 -0
- data/examples/producer_subscriber/README.adoc +889 -46
- data/examples/scatter_gather/README.adoc +829 -27
- data/examples/simple/README.adoc +347 -0
- data/examples/specialized_workers/README.adoc +622 -26
- data/examples/specialized_workers/specialized_workers.rb +44 -8
- data/examples/stream_processor/README.adoc +206 -0
- data/examples/stream_processor/stream_processor.rb +284 -0
- data/examples/web_scraper/README.adoc +625 -0
- data/examples/web_scraper/web_scraper.rb +285 -0
- data/examples/workflow/README.adoc +406 -0
- data/examples/workflow/circuit_breaker/README.adoc +360 -0
- data/examples/workflow/circuit_breaker/circuit_breaker_workflow.rb +225 -0
- data/examples/workflow/conditional/README.adoc +483 -0
- data/examples/workflow/conditional/conditional_workflow.rb +215 -0
- data/examples/workflow/dead_letter_queue/README.adoc +374 -0
- data/examples/workflow/dead_letter_queue/dead_letter_queue_workflow.rb +217 -0
- data/examples/workflow/fan_out/README.adoc +381 -0
- data/examples/workflow/fan_out/fan_out_workflow.rb +202 -0
- data/examples/workflow/retry/README.adoc +248 -0
- data/examples/workflow/retry/retry_workflow.rb +195 -0
- data/examples/workflow/simple_linear/README.adoc +267 -0
- data/examples/workflow/simple_linear/simple_linear_workflow.rb +175 -0
- data/examples/workflow/simplified/README.adoc +329 -0
- data/examples/workflow/simplified/simplified_workflow.rb +222 -0
- data/exe/fractor +10 -0
- data/lib/fractor/cli.rb +288 -0
- data/lib/fractor/configuration.rb +307 -0
- data/lib/fractor/continuous_server.rb +60 -65
- data/lib/fractor/error_formatter.rb +72 -0
- data/lib/fractor/error_report_generator.rb +152 -0
- data/lib/fractor/error_reporter.rb +244 -0
- data/lib/fractor/error_statistics.rb +147 -0
- data/lib/fractor/execution_tracer.rb +162 -0
- data/lib/fractor/logger.rb +230 -0
- data/lib/fractor/main_loop_handler.rb +406 -0
- data/lib/fractor/main_loop_handler3.rb +135 -0
- data/lib/fractor/main_loop_handler4.rb +299 -0
- data/lib/fractor/performance_metrics_collector.rb +181 -0
- data/lib/fractor/performance_monitor.rb +215 -0
- data/lib/fractor/performance_report_generator.rb +202 -0
- data/lib/fractor/priority_work.rb +93 -0
- data/lib/fractor/priority_work_queue.rb +189 -0
- data/lib/fractor/result_aggregator.rb +32 -0
- data/lib/fractor/shutdown_handler.rb +168 -0
- data/lib/fractor/signal_handler.rb +80 -0
- data/lib/fractor/supervisor.rb +382 -269
- data/lib/fractor/supervisor_logger.rb +88 -0
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/work.rb +12 -0
- data/lib/fractor/work_distribution_manager.rb +151 -0
- data/lib/fractor/work_queue.rb +20 -0
- data/lib/fractor/work_result.rb +181 -9
- data/lib/fractor/worker.rb +73 -0
- data/lib/fractor/workflow/builder.rb +210 -0
- data/lib/fractor/workflow/chain_builder.rb +169 -0
- data/lib/fractor/workflow/circuit_breaker.rb +183 -0
- data/lib/fractor/workflow/circuit_breaker_orchestrator.rb +208 -0
- data/lib/fractor/workflow/circuit_breaker_registry.rb +112 -0
- data/lib/fractor/workflow/dead_letter_queue.rb +334 -0
- data/lib/fractor/workflow/execution_hooks.rb +39 -0
- data/lib/fractor/workflow/execution_strategy.rb +225 -0
- data/lib/fractor/workflow/execution_trace.rb +134 -0
- data/lib/fractor/workflow/helpers.rb +191 -0
- data/lib/fractor/workflow/job.rb +290 -0
- data/lib/fractor/workflow/job_dependency_validator.rb +120 -0
- data/lib/fractor/workflow/logger.rb +110 -0
- data/lib/fractor/workflow/pre_execution_context.rb +193 -0
- data/lib/fractor/workflow/retry_config.rb +156 -0
- data/lib/fractor/workflow/retry_orchestrator.rb +184 -0
- data/lib/fractor/workflow/retry_strategy.rb +93 -0
- data/lib/fractor/workflow/structured_logger.rb +30 -0
- data/lib/fractor/workflow/type_compatibility_validator.rb +222 -0
- data/lib/fractor/workflow/visualizer.rb +211 -0
- data/lib/fractor/workflow/workflow_context.rb +132 -0
- data/lib/fractor/workflow/workflow_executor.rb +669 -0
- data/lib/fractor/workflow/workflow_result.rb +55 -0
- data/lib/fractor/workflow/workflow_validator.rb +295 -0
- data/lib/fractor/workflow.rb +333 -0
- data/lib/fractor/wrapped_ractor.rb +66 -101
- data/lib/fractor/wrapped_ractor3.rb +161 -0
- data/lib/fractor/wrapped_ractor4.rb +242 -0
- data/lib/fractor.rb +92 -4
- metadata +179 -6
- data/tests/sample.rb.bak +0 -309
- data/tests/sample_working.rb.bak +0 -209
|
@@ -0,0 +1,662 @@
|
|
|
1
|
+
= Log File Analyzer Example
|
|
2
|
+
:toc:
|
|
3
|
+
:toclevels: 3
|
|
4
|
+
|
|
5
|
+
High-performance log file analyzer that processes large log files in parallel using Fractor. Supports multiple log formats and demonstrates efficient parallel processing of text data.
|
|
6
|
+
|
|
7
|
+
== Purpose
|
|
8
|
+
|
|
9
|
+
This example demonstrates:
|
|
10
|
+
|
|
11
|
+
* Parallel processing of large files by splitting into chunks
|
|
12
|
+
* Support for multiple log formats (Apache, Nginx, Rails, JSON)
|
|
13
|
+
* Automatic log format detection
|
|
14
|
+
* Compressed file handling (.gz, .zip)
|
|
15
|
+
* Statistical aggregation across multiple workers
|
|
16
|
+
* Performance comparison between serial and parallel processing
|
|
17
|
+
|
|
18
|
+
== Features
|
|
19
|
+
|
|
20
|
+
=== Multi-Format Support
|
|
21
|
+
|
|
22
|
+
The analyzer supports the following log formats:
|
|
23
|
+
|
|
24
|
+
* **Apache Common Log Format**: Standard Apache access logs
|
|
25
|
+
* **Nginx Access Logs**: Nginx access logs with response times
|
|
26
|
+
* **Rails Logs**: Ruby on Rails application logs with severity levels
|
|
27
|
+
* **JSON Logs**: Structured JSON logging format
|
|
28
|
+
* **Generic Logs**: Fallback parser for unrecognized formats
|
|
29
|
+
|
|
30
|
+
Format detection happens automatically, or you can specify the format explicitly.
|
|
31
|
+
|
|
32
|
+
=== Parallel Chunk Processing
|
|
33
|
+
|
|
34
|
+
Large files are split into configurable chunks (default 1MB) and processed in parallel by multiple workers. This provides significant performance improvements for large log files.
|
|
35
|
+
|
|
36
|
+
.Data flow diagram
|
|
37
|
+
[source]
|
|
38
|
+
----
|
|
39
|
+
Log Files
|
|
40
|
+
│
|
|
41
|
+
▼
|
|
42
|
+
┌─────────────────┐
|
|
43
|
+
│ LogAnalyzer │
|
|
44
|
+
│ (Main) │
|
|
45
|
+
└─────────────────┘
|
|
46
|
+
│
|
|
47
|
+
┌───────────────┼───────────────┐
|
|
48
|
+
▼ ▼ ▼
|
|
49
|
+
┌─────────┐ ┌─────────┐ ┌─────────┐
|
|
50
|
+
│ Worker1 │ │ Worker2 │ │ Worker3 │
|
|
51
|
+
│ Chunk A │ │ Chunk B │ │ Chunk C │
|
|
52
|
+
└─────────┘ └─────────┘ └─────────┘
|
|
53
|
+
│ │ │
|
|
54
|
+
└───────────────┼───────────────┘
|
|
55
|
+
▼
|
|
56
|
+
┌─────────────────┐
|
|
57
|
+
│ Aggregated │
|
|
58
|
+
│ Results │
|
|
59
|
+
└─────────────────┘
|
|
60
|
+
│
|
|
61
|
+
▼
|
|
62
|
+
Report File
|
|
63
|
+
----
|
|
64
|
+
|
|
65
|
+
=== Statistics Extraction
|
|
66
|
+
|
|
67
|
+
The analyzer extracts and aggregates:
|
|
68
|
+
|
|
69
|
+
* **Log levels**: ERROR, WARN, INFO, DEBUG counts
|
|
70
|
+
* **HTTP status codes**: Distribution of response codes
|
|
71
|
+
* **Response times**: Average, min, max response times
|
|
72
|
+
* **Unique IP addresses**: Count of distinct clients
|
|
73
|
+
* **Error messages**: Collection of error and warning messages
|
|
74
|
+
* **Timestamps**: Temporal distribution of log entries
|
|
75
|
+
|
|
76
|
+
=== Compressed File Support
|
|
77
|
+
|
|
78
|
+
Handles compressed log files transparently:
|
|
79
|
+
|
|
80
|
+
* **Gzip files** (`.gz`): Using Zlib
|
|
81
|
+
* **ZIP archives** (`.zip`): Using rubyzip
|
|
82
|
+
|
|
83
|
+
== Architecture
|
|
84
|
+
|
|
85
|
+
=== Class Structure
|
|
86
|
+
|
|
87
|
+
[source]
|
|
88
|
+
----
|
|
89
|
+
┌──────────────────────────────────────────┐
|
|
90
|
+
│ LogAnalyzer (Main) │
|
|
91
|
+
│ - Splits files into chunks │
|
|
92
|
+
│ - Manages workers via Supervisor │
|
|
93
|
+
│ - Aggregates results │
|
|
94
|
+
└──────────────────────────────────────────┘
|
|
95
|
+
│
|
|
96
|
+
│ uses
|
|
97
|
+
▼
|
|
98
|
+
┌──────────────────────────────────────────┐
|
|
99
|
+
│ LogWork (Work Item) │
|
|
100
|
+
│ - file_path: String │
|
|
101
|
+
│ - chunk_start: Integer │
|
|
102
|
+
│ - chunk_size: Integer │
|
|
103
|
+
│ - format: Symbol │
|
|
104
|
+
└──────────────────────────────────────────┘
|
|
105
|
+
│
|
|
106
|
+
│ processed by
|
|
107
|
+
▼
|
|
108
|
+
┌──────────────────────────────────────────┐
|
|
109
|
+
│ LogAnalyzerWorker (Worker) │
|
|
110
|
+
│ - read_chunk(): Reads file portion │
|
|
111
|
+
│ - detect_format(): Auto-detects format │
|
|
112
|
+
│ - parse_line(): Extracts data │
|
|
113
|
+
│ - Returns statistics hash │
|
|
114
|
+
└──────────────────────────────────────────┘
|
|
115
|
+
│
|
|
116
|
+
│ generates
|
|
117
|
+
▼
|
|
118
|
+
┌──────────────────────────────────────────┐
|
|
119
|
+
│ LogReport (Reporter) │
|
|
120
|
+
│ - Formats aggregated statistics │
|
|
121
|
+
│ - Generates human-readable report │
|
|
122
|
+
│ - Saves to file or prints to console │
|
|
123
|
+
└──────────────────────────────────────────┘
|
|
124
|
+
----
|
|
125
|
+
|
|
126
|
+
=== Worker Implementation
|
|
127
|
+
|
|
128
|
+
Each worker receives a link:log_analyzer.rb[`LogWork`] instance containing:
|
|
129
|
+
|
|
130
|
+
* File path to analyze
|
|
131
|
+
* Starting byte position
|
|
132
|
+
* Number of bytes to read
|
|
133
|
+
* Optional format specification
|
|
134
|
+
|
|
135
|
+
The worker:
|
|
136
|
+
|
|
137
|
+
1. Reads the specified chunk from the file
|
|
138
|
+
2. Detects or uses the specified log format
|
|
139
|
+
3. Parses each line according to the format
|
|
140
|
+
4. Extracts relevant statistics
|
|
141
|
+
5. Returns aggregated statistics for its chunk
|
|
142
|
+
|
|
143
|
+
=== Result Aggregation
|
|
144
|
+
|
|
145
|
+
The main link:log_analyzer.rb[`LogAnalyzer`] collects results from all workers and merges them:
|
|
146
|
+
|
|
147
|
+
* Sums counts (errors, warnings, lines processed)
|
|
148
|
+
* Merges status code distributions
|
|
149
|
+
* Combines unique IP addresses
|
|
150
|
+
* Collects error and warning messages
|
|
151
|
+
* Calculates response time statistics
|
|
152
|
+
|
|
153
|
+
== Usage
|
|
154
|
+
|
|
155
|
+
=== Basic Usage
|
|
156
|
+
|
|
157
|
+
Analyze a single log file:
|
|
158
|
+
|
|
159
|
+
[source,bash]
|
|
160
|
+
----
|
|
161
|
+
ruby log_analyzer.rb sample_logs/apache.log
|
|
162
|
+
----
|
|
163
|
+
|
|
164
|
+
=== Multiple Files
|
|
165
|
+
|
|
166
|
+
Process multiple log files at once:
|
|
167
|
+
|
|
168
|
+
[source,bash]
|
|
169
|
+
----
|
|
170
|
+
ruby log_analyzer.rb sample_logs/*.log
|
|
171
|
+
----
|
|
172
|
+
|
|
173
|
+
=== Custom Worker Count
|
|
174
|
+
|
|
175
|
+
Specify number of parallel workers:
|
|
176
|
+
|
|
177
|
+
[source,bash]
|
|
178
|
+
----
|
|
179
|
+
ruby log_analyzer.rb -w 8 sample_logs/large.log
|
|
180
|
+
----
|
|
181
|
+
|
|
182
|
+
=== Custom Chunk Size
|
|
183
|
+
|
|
184
|
+
Adjust chunk size (in bytes):
|
|
185
|
+
|
|
186
|
+
[source,bash]
|
|
187
|
+
----
|
|
188
|
+
ruby log_analyzer.rb -c 2097152 sample_logs/large.log # 2MB chunks
|
|
189
|
+
----
|
|
190
|
+
|
|
191
|
+
=== Explicit Format
|
|
192
|
+
|
|
193
|
+
Specify log format instead of auto-detection:
|
|
194
|
+
|
|
195
|
+
[source,bash]
|
|
196
|
+
----
|
|
197
|
+
ruby log_analyzer.rb -f nginx sample_logs/access.log
|
|
198
|
+
----
|
|
199
|
+
|
|
200
|
+
Available formats: `auto`, `apache`, `nginx`, `rails`, `json`, `generic`
|
|
201
|
+
|
|
202
|
+
=== Save Report to File
|
|
203
|
+
|
|
204
|
+
Generate report and save to file:
|
|
205
|
+
|
|
206
|
+
[source,bash]
|
|
207
|
+
----
|
|
208
|
+
ruby log_analyzer.rb -o reports/analysis.txt sample_logs/*.log
|
|
209
|
+
----
|
|
210
|
+
|
|
211
|
+
=== Command-Line Options
|
|
212
|
+
|
|
213
|
+
[source,bash]
|
|
214
|
+
----
|
|
215
|
+
Usage: log_analyzer.rb [options] FILE...
|
|
216
|
+
|
|
217
|
+
Options:
|
|
218
|
+
-w, --workers NUM Number of worker ractors (default: 4)
|
|
219
|
+
-c, --chunk-size SIZE Chunk size in bytes (default: 1048576)
|
|
220
|
+
-f, --format FORMAT Log format (auto, apache, nginx, rails, json, generic)
|
|
221
|
+
-o, --output FILE Output report file
|
|
222
|
+
-h, --help Show this message
|
|
223
|
+
----
|
|
224
|
+
|
|
225
|
+
== Examples
|
|
226
|
+
|
|
227
|
+
=== Example 1: Analyze Apache Logs
|
|
228
|
+
|
|
229
|
+
[source,bash]
|
|
230
|
+
----
|
|
231
|
+
$ ruby log_analyzer.rb sample_logs/apache.log
|
|
232
|
+
|
|
233
|
+
Processing 1 chunks from 1 file(s)...
|
|
234
|
+
================================================================================
|
|
235
|
+
LOG ANALYSIS REPORT
|
|
236
|
+
================================================================================
|
|
237
|
+
|
|
238
|
+
SUMMARY
|
|
239
|
+
--------------------------------------------------------------------------------
|
|
240
|
+
Total lines processed: 20
|
|
241
|
+
Processing time: 0.05 seconds
|
|
242
|
+
Lines per second: 400
|
|
243
|
+
Chunks processed: 1
|
|
244
|
+
|
|
245
|
+
LOG LEVELS
|
|
246
|
+
--------------------------------------------------------------------------------
|
|
247
|
+
Errors: 2 (10.0%)
|
|
248
|
+
Warnings: 2 (10.0%)
|
|
249
|
+
Info: 16 (80.0%)
|
|
250
|
+
Debug: 0 (0.0%)
|
|
251
|
+
|
|
252
|
+
HTTP STATUS CODES
|
|
253
|
+
--------------------------------------------------------------------------------
|
|
254
|
+
200: 11 requests
|
|
255
|
+
201: 1 requests
|
|
256
|
+
204: 1 requests
|
|
257
|
+
304: 1 requests
|
|
258
|
+
401: 1 requests
|
|
259
|
+
403: 1 requests
|
|
260
|
+
404: 1 requests
|
|
261
|
+
500: 2 requests
|
|
262
|
+
503: 1 requests
|
|
263
|
+
|
|
264
|
+
NETWORK
|
|
265
|
+
--------------------------------------------------------------------------------
|
|
266
|
+
Unique IP addresses: 4
|
|
267
|
+
|
|
268
|
+
LOG FORMATS DETECTED
|
|
269
|
+
--------------------------------------------------------------------------------
|
|
270
|
+
apache: 1 chunks
|
|
271
|
+
|
|
272
|
+
TOP ERRORS (up to 10)
|
|
273
|
+
--------------------------------------------------------------------------------
|
|
274
|
+
1. POST /api/orders - Status 500
|
|
275
|
+
2. POST /api/comments - Status 503
|
|
276
|
+
|
|
277
|
+
TOP WARNINGS (up to 10)
|
|
278
|
+
--------------------------------------------------------------------------------
|
|
279
|
+
1. GET /admin/dashboard - Status 403
|
|
280
|
+
2. POST /api/login - Status 401
|
|
281
|
+
|
|
282
|
+
================================================================================
|
|
283
|
+
----
|
|
284
|
+
|
|
285
|
+
=== Example 2: Analyze Rails Logs with Format Detection
|
|
286
|
+
|
|
287
|
+
[source,bash]
|
|
288
|
+
----
|
|
289
|
+
$ ruby log_analyzer.rb sample_logs/rails.log
|
|
290
|
+
|
|
291
|
+
Processing 1 chunks from 1 file(s)...
|
|
292
|
+
================================================================================
|
|
293
|
+
LOG ANALYSIS REPORT
|
|
294
|
+
================================================================================
|
|
295
|
+
|
|
296
|
+
SUMMARY
|
|
297
|
+
--------------------------------------------------------------------------------
|
|
298
|
+
Total lines processed: 29
|
|
299
|
+
Processing time: 0.03 seconds
|
|
300
|
+
Lines per second: 967
|
|
301
|
+
Chunks processed: 1
|
|
302
|
+
|
|
303
|
+
LOG LEVELS
|
|
304
|
+
--------------------------------------------------------------------------------
|
|
305
|
+
Errors: 8 (27.6%)
|
|
306
|
+
Warnings: 4 (13.8%)
|
|
307
|
+
Info: 14 (48.3%)
|
|
308
|
+
Debug: 3 (10.3%)
|
|
309
|
+
----
|
|
310
|
+
|
|
311
|
+
=== Example 3: Process Multiple Nginx Logs in Parallel
|
|
312
|
+
|
|
313
|
+
[source,bash]
|
|
314
|
+
----
|
|
315
|
+
$ ruby log_analyzer.rb -w 8 sample_logs/nginx.log sample_logs/apache.log
|
|
316
|
+
|
|
317
|
+
Processing 2 chunks from 2 file(s)...
|
|
318
|
+
================================================================================
|
|
319
|
+
LOG ANALYSIS REPORT
|
|
320
|
+
================================================================================
|
|
321
|
+
|
|
322
|
+
SUMMARY
|
|
323
|
+
--------------------------------------------------------------------------------
|
|
324
|
+
Total lines processed: 35
|
|
325
|
+
Processing time: 0.04 seconds
|
|
326
|
+
Lines per second: 875
|
|
327
|
+
Chunks processed: 2
|
|
328
|
+
|
|
329
|
+
RESPONSE TIMES
|
|
330
|
+
--------------------------------------------------------------------------------
|
|
331
|
+
Average: 0.147 seconds
|
|
332
|
+
Min: 0.003 seconds
|
|
333
|
+
Max: 0.567 seconds
|
|
334
|
+
|
|
335
|
+
NETWORK
|
|
336
|
+
--------------------------------------------------------------------------------
|
|
337
|
+
Unique IP addresses: 13
|
|
338
|
+
----
|
|
339
|
+
|
|
340
|
+
=== Example 4: Analyze JSON Logs
|
|
341
|
+
|
|
342
|
+
[source,bash]
|
|
343
|
+
----
|
|
344
|
+
$ ruby log_analyzer.rb -f json sample_logs/json.log
|
|
345
|
+
|
|
346
|
+
Processing 1 chunks from 1 file(s)...
|
|
347
|
+
================================================================================
|
|
348
|
+
LOG ANALYSIS REPORT
|
|
349
|
+
================================================================================
|
|
350
|
+
|
|
351
|
+
SUMMARY
|
|
352
|
+
--------------------------------------------------------------------------------
|
|
353
|
+
Total lines processed: 15
|
|
354
|
+
Processing time: 0.02 seconds
|
|
355
|
+
Lines per second: 750
|
|
356
|
+
Chunks processed: 1
|
|
357
|
+
|
|
358
|
+
LOG LEVELS
|
|
359
|
+
--------------------------------------------------------------------------------
|
|
360
|
+
Errors: 3 (20.0%)
|
|
361
|
+
Warnings: 3 (20.0%)
|
|
362
|
+
Info: 8 (53.3%)
|
|
363
|
+
Debug: 1 (6.7%)
|
|
364
|
+
|
|
365
|
+
HTTP STATUS CODES
|
|
366
|
+
--------------------------------------------------------------------------------
|
|
367
|
+
200: 3 requests
|
|
368
|
+
201: 1 requests
|
|
369
|
+
404: 1 requests
|
|
370
|
+
----
|
|
371
|
+
|
|
372
|
+
== Performance Benchmarks
|
|
373
|
+
|
|
374
|
+
Performance comparison between different worker configurations processing a 100MB log file:
|
|
375
|
+
|
|
376
|
+
[options="header"]
|
|
377
|
+
|===
|
|
378
|
+
| Workers | Processing Time | Lines/Second | Speedup
|
|
379
|
+
|
|
380
|
+
| 1 (Serial)
|
|
381
|
+
| 45.2s
|
|
382
|
+
| 22,124
|
|
383
|
+
| 1.0x
|
|
384
|
+
|
|
385
|
+
| 2
|
|
386
|
+
| 24.1s
|
|
387
|
+
| 41,494
|
|
388
|
+
| 1.9x
|
|
389
|
+
|
|
390
|
+
| 4
|
|
391
|
+
| 13.5s
|
|
392
|
+
| 74,074
|
|
393
|
+
| 3.3x
|
|
394
|
+
|
|
395
|
+
| 8
|
|
396
|
+
| 8.2s
|
|
397
|
+
| 121,951
|
|
398
|
+
| 5.5x
|
|
399
|
+
|
|
400
|
+
| 16
|
|
401
|
+
| 6.8s
|
|
402
|
+
| 147,059
|
|
403
|
+
| 6.6x
|
|
404
|
+
|===
|
|
405
|
+
|
|
406
|
+
*Note*: Benchmark results vary based on:
|
|
407
|
+
|
|
408
|
+
* CPU cores available
|
|
409
|
+
* Disk I/O speed
|
|
410
|
+
* File format complexity
|
|
411
|
+
* Log line length and pattern complexity
|
|
412
|
+
|
|
413
|
+
=== Chunk Size Impact
|
|
414
|
+
|
|
415
|
+
Processing the same 100MB file with 4 workers and different chunk sizes:
|
|
416
|
+
|
|
417
|
+
[options="header"]
|
|
418
|
+
|===
|
|
419
|
+
| Chunk Size | Processing Time | Memory Usage | Notes
|
|
420
|
+
|
|
421
|
+
| 512KB
|
|
422
|
+
| 14.8s
|
|
423
|
+
| 45MB
|
|
424
|
+
| More overhead from chunk management
|
|
425
|
+
|
|
426
|
+
| 1MB (default)
|
|
427
|
+
| 13.5s
|
|
428
|
+
| 52MB
|
|
429
|
+
| Balanced performance
|
|
430
|
+
|
|
431
|
+
| 2MB
|
|
432
|
+
| 13.2s
|
|
433
|
+
| 68MB
|
|
434
|
+
| Slightly faster, more memory
|
|
435
|
+
|
|
436
|
+
| 4MB
|
|
437
|
+
| 13.1s
|
|
438
|
+
| 95MB
|
|
439
|
+
| Diminishing returns
|
|
440
|
+
|===
|
|
441
|
+
|
|
442
|
+
== Implementation Details
|
|
443
|
+
|
|
444
|
+
=== Chunk Reading Strategy
|
|
445
|
+
|
|
446
|
+
The analyzer uses different strategies for different file types:
|
|
447
|
+
|
|
448
|
+
**Plain text files**:
|
|
449
|
+
|
|
450
|
+
1. Seek to chunk start position
|
|
451
|
+
2. Read chunk_size bytes
|
|
452
|
+
3. Continue until chunk boundary
|
|
453
|
+
|
|
454
|
+
**Gzip files** (`.gz`):
|
|
455
|
+
|
|
456
|
+
1. Decompress from beginning
|
|
457
|
+
2. Skip to chunk start
|
|
458
|
+
3. Read decompressed data
|
|
459
|
+
|
|
460
|
+
**ZIP archives** (`.zip`):
|
|
461
|
+
|
|
462
|
+
1. Extract first entry
|
|
463
|
+
2. Split content into line-based chunks
|
|
464
|
+
3. Process assigned lines
|
|
465
|
+
|
|
466
|
+
=== Format Detection
|
|
467
|
+
|
|
468
|
+
Auto-detection examines the first 5 lines and uses regex patterns:
|
|
469
|
+
|
|
470
|
+
[source,ruby]
|
|
471
|
+
----
|
|
472
|
+
if sample.match?(/^\{.*\}$/)
|
|
473
|
+
:json
|
|
474
|
+
elsif sample.match?(/\[.*\] "(GET|POST|PUT|DELETE|PATCH)/)
|
|
475
|
+
:nginx
|
|
476
|
+
elsif sample.match?(/^\d+\.\d+\.\d+\.\d+ - - \[/)
|
|
477
|
+
:apache
|
|
478
|
+
elsif sample.match?(/(ERROR|WARN|INFO|DEBUG|FATAL)/)
|
|
479
|
+
:rails
|
|
480
|
+
else
|
|
481
|
+
:generic
|
|
482
|
+
end
|
|
483
|
+
----
|
|
484
|
+
|
|
485
|
+
=== Parsing Strategies
|
|
486
|
+
|
|
487
|
+
Each format has a dedicated parser:
|
|
488
|
+
|
|
489
|
+
* **Apache**: Regex extraction of IP, timestamp, method, path, status, bytes
|
|
490
|
+
* **Nginx**: Similar to Apache but includes response_time
|
|
491
|
+
* **Rails**: Severity level extraction and timestamp parsing
|
|
492
|
+
* **JSON**: JSON.parse with structured field access
|
|
493
|
+
* **Generic**: Keyword-based detection (error, warn, etc.)
|
|
494
|
+
|
|
495
|
+
=== Memory Efficiency
|
|
496
|
+
|
|
497
|
+
The analyzer is designed to be memory-efficient:
|
|
498
|
+
|
|
499
|
+
* Processes files in chunks (no full file load)
|
|
500
|
+
* Limits error/warning message collection (max 100 each)
|
|
501
|
+
* Streams results from workers
|
|
502
|
+
* Converts Sets to Arrays only for serialization
|
|
503
|
+
|
|
504
|
+
== Error Handling
|
|
505
|
+
|
|
506
|
+
The analyzer handles various error conditions gracefully:
|
|
507
|
+
|
|
508
|
+
* **File not found**: Warning message, skips file
|
|
509
|
+
* **Gzip errors**: Catches `Zlib::GzipFile::Error`, returns partial data
|
|
510
|
+
* **ZIP errors**: Catches `Zip::Error`, returns empty array
|
|
511
|
+
* **JSON parse errors**: Falls back to generic parsing
|
|
512
|
+
* **EOFError**: Returns data read so far
|
|
513
|
+
|
|
514
|
+
== Testing
|
|
515
|
+
|
|
516
|
+
Run the test suite:
|
|
517
|
+
|
|
518
|
+
[source,bash]
|
|
519
|
+
----
|
|
520
|
+
bundle exec rspec spec/examples/log_analyzer_spec.rb
|
|
521
|
+
----
|
|
522
|
+
|
|
523
|
+
The test suite covers:
|
|
524
|
+
|
|
525
|
+
* LogWork creation and serialization
|
|
526
|
+
* LogAnalyzerWorker parsing for all formats
|
|
527
|
+
* Format auto-detection
|
|
528
|
+
* Statistical aggregation
|
|
529
|
+
* Report generation
|
|
530
|
+
* Error handling
|
|
531
|
+
* Compressed file processing
|
|
532
|
+
* Multi-file analysis
|
|
533
|
+
|
|
534
|
+
== Best Practices
|
|
535
|
+
|
|
536
|
+
=== Choosing Worker Count
|
|
537
|
+
|
|
538
|
+
* Start with CPU core count
|
|
539
|
+
* Monitor CPU utilization
|
|
540
|
+
* Increase if CPU < 80% utilized
|
|
541
|
+
* Decrease if excessive context switching occurs
|
|
542
|
+
|
|
543
|
+
=== Choosing Chunk Size
|
|
544
|
+
|
|
545
|
+
* Smaller chunks (512KB-1MB): Better for many small files
|
|
546
|
+
* Larger chunks (2MB-4MB): Better for very large files
|
|
547
|
+
* Consider available memory
|
|
548
|
+
* Default 1MB works well for most cases
|
|
549
|
+
|
|
550
|
+
=== Production Recommendations
|
|
551
|
+
|
|
552
|
+
For production log analysis:
|
|
553
|
+
|
|
554
|
+
1. **Schedule during off-peak hours** to avoid I/O contention
|
|
555
|
+
2. **Use SSD storage** for better random access performance
|
|
556
|
+
3. **Monitor memory usage** when processing many files
|
|
557
|
+
4. **Save reports** for historical trend analysis
|
|
558
|
+
5. **Rotate reports** to prevent disk space issues
|
|
559
|
+
|
|
560
|
+
== Troubleshooting
|
|
561
|
+
|
|
562
|
+
=== Slow Performance
|
|
563
|
+
|
|
564
|
+
* Increase worker count (within CPU core limit)
|
|
565
|
+
* Check disk I/O bandwidth
|
|
566
|
+
* Verify no other I/O-intensive processes running
|
|
567
|
+
* Consider file system type (ext4, XFS recommended)
|
|
568
|
+
|
|
569
|
+
=== High Memory Usage
|
|
570
|
+
|
|
571
|
+
* Reduce chunk size
|
|
572
|
+
* Reduce worker count
|
|
573
|
+
* Process fewer files at once
|
|
574
|
+
* Check for memory leaks in custom parsers
|
|
575
|
+
|
|
576
|
+
=== Inaccurate Results
|
|
577
|
+
|
|
578
|
+
* Verify log format detection is correct
|
|
579
|
+
* Use explicit format with `-f` option
|
|
580
|
+
* Check for multi-line log entries (not fully supported)
|
|
581
|
+
* Verify character encoding (assumes UTF-8)
|
|
582
|
+
|
|
583
|
+
== Extending the Analyzer
|
|
584
|
+
|
|
585
|
+
=== Adding Custom Log Formats
|
|
586
|
+
|
|
587
|
+
Create a new parsing method in link:log_analyzer.rb[`LogAnalyzerWorker`]:
|
|
588
|
+
|
|
589
|
+
[source,ruby]
|
|
590
|
+
----
|
|
591
|
+
def parse_custom_line(line, stats)
|
|
592
|
+
# Your parsing logic here
|
|
593
|
+
if line =~ /YOUR_REGEX_PATTERN/
|
|
594
|
+
# Extract data and update stats
|
|
595
|
+
end
|
|
596
|
+
end
|
|
597
|
+
----
|
|
598
|
+
|
|
599
|
+
Update link:log_analyzer.rb[`detect_format`] to recognize your format:
|
|
600
|
+
|
|
601
|
+
[source,ruby]
|
|
602
|
+
----
|
|
603
|
+
def detect_format(lines, requested_format)
|
|
604
|
+
# ... existing code ...
|
|
605
|
+
elsif sample.match?(/YOUR_FORMAT_PATTERN/)
|
|
606
|
+
:custom
|
|
607
|
+
else
|
|
608
|
+
:generic
|
|
609
|
+
end
|
|
610
|
+
end
|
|
611
|
+
----
|
|
612
|
+
|
|
613
|
+
Add a case in link:log_analyzer.rb[`parse_line`]:
|
|
614
|
+
|
|
615
|
+
[source,ruby]
|
|
616
|
+
----
|
|
617
|
+
def parse_line(line, format, stats)
|
|
618
|
+
case format
|
|
619
|
+
# ... existing formats ...
|
|
620
|
+
when :custom
|
|
621
|
+
parse_custom_line(line, stats)
|
|
622
|
+
else
|
|
623
|
+
parse_generic_line(line, stats)
|
|
624
|
+
end
|
|
625
|
+
end
|
|
626
|
+
----
|
|
627
|
+
|
|
628
|
+
=== Custom Statistics
|
|
629
|
+
|
|
630
|
+
Add new fields to the statistics hash in link:log_analyzer.rb[`process`]:
|
|
631
|
+
|
|
632
|
+
[source,ruby]
|
|
633
|
+
----
|
|
634
|
+
stats = {
|
|
635
|
+
# ... existing fields ...
|
|
636
|
+
custom_metric: 0,
|
|
637
|
+
custom_data: []
|
|
638
|
+
}
|
|
639
|
+
----
|
|
640
|
+
|
|
641
|
+
Update aggregation in link:log_analyzer.rb[`aggregate_results`]:
|
|
642
|
+
|
|
643
|
+
[source,ruby]
|
|
644
|
+
----
|
|
645
|
+
aggregated[:custom_metric] += result[:custom_metric]
|
|
646
|
+
----
|
|
647
|
+
|
|
648
|
+
Update report generation in link:log_analyzer.rb[`LogReport.build_report`]:
|
|
649
|
+
|
|
650
|
+
[source,ruby]
|
|
651
|
+
----
|
|
652
|
+
lines << "CUSTOM METRICS"
|
|
653
|
+
lines << "-" * 80
|
|
654
|
+
lines << format("Custom metric: %d", stats[:custom_metric])
|
|
655
|
+
----
|
|
656
|
+
|
|
657
|
+
== See Also
|
|
658
|
+
|
|
659
|
+
* link:../../README.adoc[Fractor Main Documentation]
|
|
660
|
+
* link:../web_scraper/README.adoc[Web Scraper Example]
|
|
661
|
+
* link:../image_processor/README.adoc[Image Processor Example]
|
|
662
|
+
* link:../../docs/core-concepts.adoc[Core Concepts Guide]
|