logler-1.0.7-cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logler/__init__.py +22 -0
- logler/bootstrap.py +57 -0
- logler/cache.py +75 -0
- logler/cli.py +589 -0
- logler/helpers.py +282 -0
- logler/investigate.py +3962 -0
- logler/llm_cli.py +1426 -0
- logler/log_reader.py +267 -0
- logler/parser.py +207 -0
- logler/safe_regex.py +124 -0
- logler/terminal.py +252 -0
- logler/tracker.py +138 -0
- logler/tree_formatter.py +807 -0
- logler/watcher.py +55 -0
- logler/web/__init__.py +3 -0
- logler/web/app.py +810 -0
- logler/web/static/css/tailwind.css +1 -0
- logler/web/static/css/tailwind.input.css +3 -0
- logler/web/static/logler-logo.png +0 -0
- logler/web/tailwind.config.cjs +9 -0
- logler/web/templates/index.html +1454 -0
- logler-1.0.7.dist-info/METADATA +584 -0
- logler-1.0.7.dist-info/RECORD +28 -0
- logler-1.0.7.dist-info/WHEEL +4 -0
- logler-1.0.7.dist-info/entry_points.txt +2 -0
- logler-1.0.7.dist-info/licenses/LICENSE +21 -0
- logler_rs/__init__.py +5 -0
- logler_rs/logler_rs.cp311-win_amd64.pyd +0 -0
logler/investigate.py
ADDED
@@ -0,0 +1,3962 @@
"""
LLM Investigation Module - High-performance log investigation powered by Rust

This module provides fast log parsing, searching, and investigation capabilities
specifically designed for LLM agents like Claude.

Example Usage:
    import logler.investigate as investigate

    # Search for errors
    results = investigate.search(
        files=["app.log"],
        query="database timeout",
        level="ERROR",
        limit=10
    )

    # Follow a thread
    timeline = investigate.follow_thread(
        files=["app.log"],
        thread_id="worker-1"
    )

    # Find patterns
    patterns = investigate.find_patterns(
        files=["app.log"],
        min_occurrences=3
    )
"""

import json
import re
import warnings
from typing import List, Optional, Dict, Any, Tuple
from datetime import datetime
from collections import defaultdict

from .safe_regex import try_compile

try:
    import logler_rs

    RUST_AVAILABLE = True
except ImportError:
    try:
        from .bootstrap import ensure_rust_backend

        if ensure_rust_backend():
            import logler_rs  # type: ignore

            RUST_AVAILABLE = True
        else:
            RUST_AVAILABLE = False
            warnings.warn("Rust backend not available. Using Python fallback.", stacklevel=2)
    except (ImportError, AttributeError, OSError):
        RUST_AVAILABLE = False
        warnings.warn("Rust backend not available. Using Python fallback.", stacklevel=2)
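
# Illustrative only: a minimal entry dict of the shape the normalizers below
# operate on, with field names taken from the code in this module (real entries
# from the Rust backend may carry additional fields):
#
#     entry = {"raw": "<13>web-01 app: user login ok", "level": None, "format": None}
#     _normalize_entry(entry)
#     # entry["format"] == "Syslog" (leading "<digit"), entry["level"] == "INFO" (13 & 0x07 == 5)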


def _normalize_entry(entry: Dict[str, Any]) -> None:
    """Normalize a single log entry in-place (e.g., ensure uppercase levels)."""
    if not isinstance(entry, dict):
        return
    level = entry.get("level")
    if isinstance(level, str):
        entry["level"] = level.upper()
    raw = entry.get("raw") or ""
    # Always detect and set format based on raw content
    if isinstance(raw, str):
        stripped = raw.lstrip()
        if stripped.startswith("{"):
            entry["format"] = "Json"
        elif stripped.startswith("<") and stripped[1:2].isdigit():
            entry["format"] = "Syslog"
        elif "level=" in raw or " msg=" in raw or raw.startswith("level="):
            entry["format"] = "Logfmt"
        elif entry.get("format") is None:
            entry["format"] = "PlainText"
    if entry.get("level") is None and isinstance(raw, str):
        inferred = _infer_syslog_level(raw)
        entry["level"] = inferred or "UNKNOWN"


def _normalize_entries(entries: List[Dict[str, Any]]) -> None:
    for entry in entries or []:
        _normalize_entry(entry)


def _normalize_search_result_levels(result: Dict[str, Any]) -> None:
    """Ensure search results and their contexts use consistent level casing."""
    for item in result.get("results", []) or []:
        _normalize_entry(item.get("entry", {}))
        for ctx in item.get("context_before", []) or []:
            _normalize_entry(ctx)
        for ctx in item.get("context_after", []) or []:
            _normalize_entry(ctx)


def _apply_custom_regex_to_results(result: Dict[str, Any], pattern: Optional[str]) -> None:
    """Apply a user-provided regex to fill missing fields like timestamp/level."""
    if not pattern:
        return
    regex = try_compile(pattern)
    if regex is None:
        return

    for item in result.get("results", []) or []:
        _apply_custom_regex_to_entry(item.get("entry", {}), regex)
        for ctx in item.get("context_before", []) or []:
            _apply_custom_regex_to_entry(ctx, regex)
        for ctx in item.get("context_after", []) or []:
            _apply_custom_regex_to_entry(ctx, regex)


def _apply_custom_regex_to_entry(entry: Dict[str, Any], regex: re.Pattern[str]) -> None:
    if not isinstance(entry, dict):
        return
    raw = entry.get("raw") or entry.get("message") or ""
    match = regex.match(raw)
    if not match:
        return

    groups = match.groupdict()
    ts_val = groups.get("timestamp")
    if ts_val and not entry.get("timestamp"):
        parsed = _parse_timestamp_flex(ts_val)
        if parsed:
            entry["timestamp"] = parsed
    if groups.get("level") and not entry.get("level"):
        entry["level"] = groups["level"].upper()
    if groups.get("message") and entry.get("message") == raw:
        entry["message"] = groups["message"]
    if groups.get("thread") and not entry.get("thread_id"):
        entry["thread_id"] = groups["thread"]
    if groups.get("correlation_id") and not entry.get("correlation_id"):
        entry["correlation_id"] = groups["correlation_id"]
    entry["format"] = "Custom"


def _normalize_pattern_examples(result: Dict[str, Any]) -> None:
    """Normalize example entries inside pattern detection results."""
    for pattern in result.get("patterns", []) or []:
        for example in pattern.get("examples", []) or []:
            _normalize_entry(example)


def _infer_syslog_level(raw: str) -> Optional[str]:
    match = re.match(r"<(?P<priority>\d+)>", raw.strip())
    if not match:
        return None
    try:
        priority = int(match.group("priority"))
    except ValueError:
        return None
    severity = priority & 0x07
    if severity == 0:
        return "FATAL"
    if severity <= 3:
        return "ERROR"
    if severity == 4:
        return "WARN"
    if severity <= 6:
        return "INFO"
    return "DEBUG"
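
# Worked example (illustrative): a syslog line starting with "<165>" encodes
# priority 165 = facility 20 * 8 + severity 5; 165 & 0x07 == 5 falls through
# the checks above into the <= 6 branch, so the inferred level is "INFO".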


def _parse_timestamp_flex(value: str) -> Optional[str]:
    for fmt in ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%d %H:%M:%S", "%d-%m-%Y %H:%M:%S"):
        try:
            dt = datetime.strptime(value.replace("Z", "+0000"), fmt)
            return dt.isoformat()
        except Exception:
            continue
    try:
        return datetime.fromisoformat(value.replace("Z", "+00:00")).isoformat()
    except Exception:
        return None
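
# Illustrative behavior: a trailing "Z" is rewritten so that both strptime
# ("+0000") and fromisoformat ("+00:00") accept it, e.g.
#
#     _parse_timestamp_flex("2024-01-15T10:00:00Z")  # -> "2024-01-15T10:00:00+00:00"
#     _parse_timestamp_flex("15-01-2024 10:00:00")   # -> "2024-01-15T10:00:00"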


def _normalize_context_payload(payload: Dict[str, Any]) -> None:
    """Normalize context payload returned from Rust backend."""
    _normalize_entry(payload.get("target", {}))
    _normalize_entries(payload.get("context_before", []))
    _normalize_entries(payload.get("context_after", []))


def search(
    files: List[str],
    query: Optional[str] = None,
    level: Optional[str] = None,
    thread_id: Optional[str] = None,
    correlation_id: Optional[str] = None,
    limit: Optional[int] = None,
    context_lines: int = 3,
    output_format: str = "full",
    parser_format: Optional[str] = None,
    custom_regex: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Search logs with filters.

    Args:
        files: List of log file paths
        query: Search query string
        level: Filter by log level (ERROR, WARN, INFO, etc.)
        thread_id: Filter by thread ID
        correlation_id: Filter by correlation ID
        limit: Maximum number of results
        context_lines: Number of context lines before/after each result
        output_format: Output format - "full", "summary", "count", or "compact"
            - "full": Complete log entries (default)
            - "summary": Aggregated summary with examples
            - "count": Just counts, no log content
            - "compact": Essential fields only (no raw logs)
        parser_format: Optional log format hint
        custom_regex: Optional custom parsing regex

    Returns:
        Dictionary with search results (format depends on output_format):

        For "full":
            {
                "results": [...],  # Full entries
                "total_matches": 123,
                "search_time_ms": 45
            }

        For "summary":
            {
                "total_matches": 123,
                "unique_messages": 15,
                "log_levels": {"ERROR": 100, "WARN": 23},
                "top_messages": [
                    {"message": "...", "count": 50, "first_seen": "...", "last_seen": "..."},
                    ...
                ],
                "sample_entries": [...]  # 3-5 examples
            }

        For "count":
            {
                "total_matches": 123,
                "by_level": {"ERROR": 100, "WARN": 23},
                "by_file": {"app.log": 80, "api.log": 43},
                "time_range": {"start": "...", "end": "..."}
            }

        For "compact":
            {
                "matches": [
                    {"time": "...", "level": "ERROR", "msg": "...", "thread": "..."},
                    ...
                ],
                "total": 123
            }
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    investigator = logler_rs.PyInvestigator()
    _load_files_with_config(investigator, files, parser_format, custom_regex)

    # Build query
    filters = {"levels": []}
    if level:
        level_map = {
            "trace": "Trace",
            "debug": "Debug",
            "info": "Info",
            "warn": "Warn",
            "warning": "Warn",
            "error": "Error",
            "fatal": "Fatal",
            "critical": "Fatal",
        }
        normalized_level = level_map.get(level.lower())
        if not normalized_level:
            raise ValueError(f"Unknown log level: {level}")
        filters["levels"] = [normalized_level]
    if thread_id:
        filters["thread_id"] = thread_id
    if correlation_id:
        filters["correlation_id"] = correlation_id

    query_dict = {
        "files": files,
        "query": query,
        "filters": filters,
        "limit": limit,
        "context_lines": context_lines,
    }

    # Call Rust engine with the full query payload
    result_json = investigator.search(json.dumps(query_dict))
    result = json.loads(result_json)
    _normalize_search_result_levels(result)
    _apply_custom_regex_to_results(result, custom_regex)

    # Transform based on output_format
    if output_format == "full":
        return result
    elif output_format == "summary":
        return _format_as_summary(result)
    elif output_format == "count":
        return _format_as_count(result)
    elif output_format == "compact":
        return _format_as_compact(result)
    else:
        return result
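
# Usage sketch (assumes a local "app.log" and an installed Rust backend):
#
#     counts = search(files=["app.log"], level="error", output_format="count")
#     print(counts["total_matches"], counts["by_level"])
#
# Level names are case-insensitive ("error", "Error", "ERROR" all normalize to
# "Error"); an unrecognized level raises ValueError before the engine runs.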


def follow_thread(
    files: List[str],
    thread_id: Optional[str] = None,
    correlation_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    parser_format: Optional[str] = None,
    custom_regex: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Follow a thread/correlation/trace through log files.

    Args:
        files: List of log file paths
        thread_id: Thread ID to follow
        correlation_id: Correlation ID to follow
        trace_id: Trace ID to follow
        parser_format: Optional log format hint
        custom_regex: Optional custom parsing regex

    Returns:
        Dictionary with timeline:
            {
                "entries": [...],
                "total_entries": 42,
                "duration_ms": 1523,
                "unique_spans": [...]
            }
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    # Use Investigator when custom parsing is requested so parsing honors the config.
    if parser_format or custom_regex:
        inv = Investigator()
        inv.load_files(files, parser_format=parser_format, custom_regex=custom_regex)
        return inv.follow_thread(
            thread_id=thread_id, correlation_id=correlation_id, trace_id=trace_id
        )

    result_json = logler_rs.follow_thread(files, thread_id, correlation_id, trace_id)
    result = json.loads(result_json)
    _normalize_entries(result.get("entries", []))
    return result
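
# Usage sketch (illustrative): any one of the three identifiers is enough;
# all are forwarded to the Rust engine as-is.
#
#     timeline = follow_thread(files=["app.log"], correlation_id="req-123")
#     print(timeline["total_entries"], timeline.get("duration_ms"))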


def get_context(
    file: str,
    line_number: int,
    lines_before: int = 10,
    lines_after: int = 10,
) -> Dict[str, Any]:
    """
    Get context around a specific log line.

    Args:
        file: Log file path
        line_number: Line number to get context for
        lines_before: Number of lines before
        lines_after: Number of lines after

    Returns:
        Dictionary with context:
            {
                "target": {...},
                "context_before": [...],
                "context_after": [...],
            }
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    # Use Investigator class for more complex operations
    investigator = logler_rs.PyInvestigator()
    investigator.load_files([file])
    result_json = investigator.get_context(file, line_number, lines_before, lines_after, False)
    result = json.loads(result_json)
    _normalize_context_payload(result)
    return result
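
# Usage sketch (illustrative line number):
#
#     ctx = get_context("app.log", line_number=1042, lines_before=5, lines_after=5)
#     print(ctx["target"].get("raw"))
#     print(len(ctx["context_before"]), len(ctx["context_after"]))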


def follow_thread_hierarchy(
    files: List[str],
    root_identifier: str,
    max_depth: Optional[int] = None,
    use_naming_patterns: bool = True,
    use_temporal_inference: bool = True,
    min_confidence: float = 0.0,
    parser_format: Optional[str] = None,
    custom_regex: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Build hierarchical tree of threads/spans showing parent-child relationships.

    This detects sub-threads and nested operations using:
    - Explicit parent_span_id fields (OpenTelemetry)
    - Naming patterns (worker-1.task-a, main:subtask-1)
    - Temporal inference (time-based proximity)

    Args:
        files: List of log file paths
        root_identifier: Root thread ID, correlation ID, or span ID
        max_depth: Maximum depth of hierarchy tree (default: unlimited)
        use_naming_patterns: Enable naming pattern detection (default: True)
        use_temporal_inference: Enable time-based inference (default: True)
        min_confidence: Minimum confidence score (0.0-1.0, default: 0.0)
        parser_format: Optional log format hint
        custom_regex: Optional custom parsing regex

    Returns:
        Dictionary with hierarchical structure:
            {
                "roots": [
                    {
                        "id": "main-thread",
                        "node_type": "Thread" | "Span" | "CorrelationGroup",
                        "name": "Main Request Handler",
                        "parent_id": null,
                        "children": [
                            {
                                "id": "worker-1.db-query",
                                "node_type": "Span",
                                "name": "Database Query",
                                "parent_id": "main-thread",
                                "children": [],
                                "entry_ids": [...],
                                "start_time": "2024-01-15T10:00:00Z",
                                "end_time": "2024-01-15T10:00:02Z",
                                "duration_ms": 2000,
                                "entry_count": 15,
                                "error_count": 0,
                                "level_counts": {"INFO": 12, "DEBUG": 3},
                                "depth": 1,
                                "confidence": 0.8,
                                "relationship_evidence": ["Naming pattern: worker-1.db-query"]
                            }
                        ],
                        "entry_ids": [...],
                        "start_time": "2024-01-15T10:00:00Z",
                        "end_time": "2024-01-15T10:00:05Z",
                        "duration_ms": 5000,
                        "entry_count": 42,
                        "error_count": 2,
                        "level_counts": {"INFO": 35, "ERROR": 2, "DEBUG": 5},
                        "depth": 0,
                        "confidence": 1.0,
                        "relationship_evidence": []
                    }
                ],
                "total_nodes": 8,
                "max_depth": 3,
                "total_duration_ms": 5000,
                "concurrent_count": 2,
                "bottleneck": {
                    "node_id": "worker-1.db-query",
                    "duration_ms": 2000,
                    "percentage": 40.0,
                    "depth": 1
                },
                "error_nodes": ["worker-2.api-call"],
                "detection_method": "ExplicitParentId" | "NamingPattern" | "TemporalInference" | "Mixed"
            }

    Example:
        # Detect OpenTelemetry trace hierarchy
        hierarchy = follow_thread_hierarchy(
            files=["app.log"],
            root_identifier="trace-abc123",
            min_confidence=0.8
        )

        # Print tree structure
        for root in hierarchy['roots']:
            print_tree(root, indent=0)

        # Find bottleneck
        if hierarchy['bottleneck']:
            print(f"Bottleneck: {hierarchy['bottleneck']['node_id']} ({hierarchy['bottleneck']['duration_ms']}ms)")
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    # Use Investigator when custom parsing is requested
    if parser_format or custom_regex:
        inv = Investigator()
        inv.load_files(files, parser_format=parser_format, custom_regex=custom_regex)
        return inv.build_hierarchy(
            root_identifier=root_identifier,
            max_depth=max_depth,
            use_naming_patterns=use_naming_patterns,
            use_temporal_inference=use_temporal_inference,
            min_confidence=min_confidence,
        )

    # Call Rust directly for better performance
    result_json = logler_rs.build_hierarchy(
        files,
        root_identifier,
        max_depth,
        use_naming_patterns,
        use_temporal_inference,
        min_confidence,
    )
    return json.loads(result_json)
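
# Note: print_tree in the docstring example above is not defined in this module;
# a minimal sketch of such a helper (hypothetical, not part of the package):
#
#     def print_tree(node, indent=0):
#         print(" " * indent + f"{node['id']} ({node.get('entry_count', 0)} entries)")
#         for child in node.get("children", []):
#             print_tree(child, indent + 2)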


def get_hierarchy_summary(hierarchy: Dict[str, Any]) -> str:
    """
    Generate a human-readable summary of a thread hierarchy.

    Args:
        hierarchy: Hierarchy dictionary from follow_thread_hierarchy()

    Returns:
        Formatted text summary

    Example:
        hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
        summary = get_hierarchy_summary(hierarchy)
        print(summary)
    """
    lines = []

    # Overview
    lines.append("=== Thread Hierarchy Summary ===")
    lines.append(f"Total nodes: {hierarchy.get('total_nodes', 0)}")
    lines.append(f"Max depth: {hierarchy.get('max_depth', 0)}")
    lines.append(f"Detection method: {hierarchy.get('detection_method', 'Unknown')}")

    # Duration
    total_duration = hierarchy.get("total_duration_ms")
    if total_duration:
        lines.append(f"Total duration: {total_duration}ms ({total_duration/1000:.2f}s)")

    # Concurrent operations
    concurrent = hierarchy.get("concurrent_count", 0)
    if concurrent > 1:
        lines.append(f"Concurrent operations: {concurrent}")

    # Bottleneck
    bottleneck = hierarchy.get("bottleneck")
    if bottleneck:
        lines.append("")
        lines.append("⚠️ BOTTLENECK DETECTED:")
        lines.append(f" Node: {bottleneck.get('node_id')}")
        lines.append(
            f" Duration: {bottleneck.get('duration_ms')}ms ({bottleneck.get('percentage', 0):.1f}% of total)"
        )
        lines.append(f" Depth: {bottleneck.get('depth')}")

    # Errors
    error_nodes = hierarchy.get("error_nodes", [])
    if error_nodes:
        lines.append("")
        lines.append(f"❌ Errors in {len(error_nodes)} node(s):")
        for node_id in error_nodes[:5]:  # Show first 5
            lines.append(f" - {node_id}")
        if len(error_nodes) > 5:
            lines.append(f" ... and {len(error_nodes) - 5} more")

    # Tree structure preview
    roots = hierarchy.get("roots", [])
    if roots:
        lines.append("")
        lines.append("Tree Structure:")
        for root in roots[:3]:  # Show first 3 roots
            lines.append(
                f" 📁 {root.get('id')} ({root.get('entry_count', 0)} entries, {len(root.get('children', []))} children)"
            )
            _append_tree_preview(root, lines, depth=1, max_depth=2)
        if len(roots) > 3:
            lines.append(f" ... and {len(roots) - 3} more root(s)")

    return "\n".join(lines)


def _append_tree_preview(node: Dict[str, Any], lines: List[str], depth: int, max_depth: int):
    """Helper to append tree preview to lines"""
    if depth >= max_depth:
        return

    children = node.get("children", [])
    for i, child in enumerate(children[:3]):  # Show first 3 children
        is_last = i == len(children) - 1
        prefix = " " * depth + ("└─ " if is_last else "├─ ")

        error_marker = "❌ " if child.get("error_count", 0) > 0 else ""
        duration = child.get("duration_ms", 0)
        duration_str = f" ({duration}ms)" if duration > 0 else ""

        lines.append(
            f"{prefix}{error_marker}{child.get('id')} ({child.get('entry_count', 0)} entries){duration_str}"
        )
        _append_tree_preview(child, lines, depth + 1, max_depth)

    if len(children) > 3:
        prefix = " " * depth + "└─ "
        lines.append(f"{prefix}... and {len(children) - 3} more")
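
# Illustrative output shape of get_hierarchy_summary (values hypothetical):
#
#     === Thread Hierarchy Summary ===
#     Total nodes: 8
#     Max depth: 3
#     Detection method: NamingPattern
#     Total duration: 5000ms (5.00s)
#
#     Tree Structure:
#      📁 main-thread (42 entries, 2 children)
#      ├─ worker-1.db-query (15 entries) (2000ms)
#      └─ worker-2.api-call (12 entries) (1200ms)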


def analyze_error_flow(
    hierarchy: Dict[str, Any],
    include_context: bool = True,
) -> Dict[str, Any]:
    """
    Analyze error propagation through a hierarchy to identify root causes and cascading failures.

    This function traces errors through parent-child relationships to find:
    - Root cause: The first/originating error in the chain
    - Propagation chain: How errors cascaded through the system
    - Affected nodes: All nodes impacted by the error
    - Impact assessment: Severity and scope of the failure

    Args:
        hierarchy: Hierarchy dictionary from follow_thread_hierarchy()
        include_context: Include sample error messages (default: True)

    Returns:
        Dictionary with error flow analysis:
            {
                "has_errors": bool,
                "total_error_nodes": int,
                "root_causes": [
                    {
                        "node_id": "redis-write",
                        "node_type": "Span",
                        "error_count": 1,
                        "depth": 3,
                        "timestamp": "2024-01-15T10:00:01.020Z",
                        "path": ["api-gateway", "product-service", "cache-update", "redis-write"],
                        "is_leaf": True,
                        "confidence": 0.95
                    }
                ],
                "propagation_chains": [
                    {
                        "root_cause": "redis-write",
                        "chain": [
                            {"node_id": "redis-write", "error_count": 1, "depth": 3},
                            {"node_id": "cache-update", "error_count": 1, "depth": 2},
                            {"node_id": "product-service", "error_count": 1, "depth": 1}
                        ],
                        "total_affected": 3,
                        "propagation_type": "upward"  # errors bubbled up to parent
                    }
                ],
                "impact_summary": {
                    "total_affected_nodes": 5,
                    "affected_percentage": 35.7,
                    "max_propagation_depth": 3,
                    "concurrent_failures": 2
                },
                "recommendations": [
                    "Investigate redis-write first - it appears to be the root cause",
                    "Consider adding retry logic for cache operations",
                    "3 nodes show cascading failures from a single source"
                ]
            }

    Example:
        hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
        error_analysis = analyze_error_flow(hierarchy)

        if error_analysis['has_errors']:
            print(f"Root cause: {error_analysis['root_causes'][0]['node_id']}")
            for rec in error_analysis['recommendations']:
                print(f" - {rec}")
    """
    result = {
        "has_errors": False,
        "total_error_nodes": 0,
        "root_causes": [],
        "propagation_chains": [],
        "impact_summary": {
            "total_affected_nodes": 0,
            "affected_percentage": 0.0,
            "max_propagation_depth": 0,
            "concurrent_failures": 0,
        },
        "recommendations": [],
    }

    error_nodes = hierarchy.get("error_nodes", [])
    if not error_nodes:
        return result

    result["has_errors"] = True
    result["total_error_nodes"] = len(error_nodes)

    # Build node lookup and parent mapping
    all_nodes = {}
    parent_map = {}  # child_id -> parent_id

    def collect_nodes(node: Dict[str, Any], parent_id: Optional[str] = None):
        node_id = node.get("id")
        if node_id:
            all_nodes[node_id] = node
            if parent_id:
                parent_map[node_id] = parent_id
        for child in node.get("children", []):
            collect_nodes(child, node_id)

    for root in hierarchy.get("roots", []):
        collect_nodes(root)

    # Find root causes (errors at leaf nodes or deepest error in each chain)
    error_node_data = []
    for node_id in error_nodes:
        node = all_nodes.get(node_id)
        if node:
            error_node_data.append(
                {
                    "node_id": node_id,
                    "node_type": node.get("node_type", "Unknown"),
                    "error_count": node.get("error_count", 0),
                    "depth": node.get("depth", 0),
                    "timestamp": node.get("start_time"),
                    "is_leaf": len(node.get("children", [])) == 0,
                    "children_with_errors": sum(
                        1 for c in node.get("children", []) if c.get("error_count", 0) > 0
                    ),
                }
            )

    # Sort by depth (deepest first) and timestamp (earliest first)
    error_node_data.sort(key=lambda x: (-x["depth"], x["timestamp"] or ""))

    # Identify root causes - errors that didn't come from children
    root_causes = []

    for error_node in error_node_data:
        node_id = error_node["node_id"]

        # Build path from root to this node
        path = []
        current = node_id
        while current:
            path.insert(0, current)
            current = parent_map.get(current)

        # Check if this is a root cause (no child errors, or leaf node)
        if error_node["children_with_errors"] == 0:
            # Calculate confidence based on evidence
            confidence = 1.0 if error_node["is_leaf"] else 0.85

            root_causes.append(
                {
                    "node_id": node_id,
                    "node_type": error_node["node_type"],
                    "error_count": error_node["error_count"],
                    "depth": error_node["depth"],
                    "timestamp": error_node["timestamp"],
                    "path": path,
                    "is_leaf": error_node["is_leaf"],
                    "confidence": confidence,
                }
            )

    result["root_causes"] = root_causes

    # Build propagation chains (trace errors upward from root causes)
    propagation_chains = []

    for root_cause in root_causes:
        chain = []
        current_id = root_cause["node_id"]

        # Walk up the tree
        while current_id:
            node = all_nodes.get(current_id)
            if node:
                chain.append(
                    {
                        "node_id": current_id,
                        "error_count": node.get("error_count", 0),
                        "depth": node.get("depth", 0),
                    }
                )
            current_id = parent_map.get(current_id)

        # Only include chains where errors actually propagated
        if len(chain) > 1:
            # Check if parent nodes also have errors
            propagated_chain = [c for c in chain if c["error_count"] > 0]
            if len(propagated_chain) > 1:
                propagation_chains.append(
                    {
                        "root_cause": root_cause["node_id"],
                        "chain": propagated_chain,
                        "total_affected": len(propagated_chain),
                        "propagation_type": "upward",
                    }
                )

    result["propagation_chains"] = propagation_chains

    # Calculate impact summary
    total_nodes = hierarchy.get("total_nodes", 1)
    affected_nodes = len(set(error_nodes))
    max_depth = max((rc["depth"] for rc in root_causes), default=0)

    # Count concurrent failures (root causes at same depth)
    depth_counts = defaultdict(int)
    for rc in root_causes:
        depth_counts[rc["depth"]] += 1
    concurrent = max(depth_counts.values(), default=0)

    result["impact_summary"] = {
        "total_affected_nodes": affected_nodes,
        "affected_percentage": (affected_nodes / total_nodes * 100) if total_nodes > 0 else 0,
        "max_propagation_depth": max_depth,
        "concurrent_failures": concurrent if concurrent > 1 else 0,
    }

    # Generate recommendations
    recommendations = []

    if root_causes:
        primary_cause = root_causes[0]
        recommendations.append(
            f"Investigate {primary_cause['node_id']} first - it appears to be the root cause"
        )

        if primary_cause["is_leaf"]:
            recommendations.append(
                f"Error originated at leaf node (depth {primary_cause['depth']}) - check external dependencies"
            )

    if len(propagation_chains) > 0:
        total_propagated = sum(c["total_affected"] for c in propagation_chains)
        recommendations.append(
            f"{total_propagated} nodes show cascading failures - consider adding circuit breakers"
        )

    if concurrent > 1:
        recommendations.append(
            f"{concurrent} concurrent failures detected - possible systemic issue"
        )

    if result["impact_summary"]["affected_percentage"] > 50:
        recommendations.append(
            "High impact failure (>50% of nodes affected) - prioritize investigation"
        )

    result["recommendations"] = recommendations

    return result
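
# Worked example (illustrative): in a chain api-gateway -> product-service ->
# cache-update -> redis-write where all four nodes report errors, only the leaf
# redis-write has children_with_errors == 0, so it alone is flagged as a root
# cause (confidence 1.0 for a leaf); the other three appear in its upward
# propagation chain.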


def format_error_flow(
    error_analysis: Dict[str, Any],
    show_chains: bool = True,
    show_recommendations: bool = True,
) -> str:
    """
    Format error flow analysis as human-readable text.

    Args:
        error_analysis: Error analysis from analyze_error_flow()
        show_chains: Show propagation chains (default: True)
        show_recommendations: Show recommendations (default: True)

    Returns:
        Formatted error flow string

    Example:
        hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
        error_analysis = analyze_error_flow(hierarchy)
        print(format_error_flow(error_analysis))
    """
    lines = []

    if not error_analysis.get("has_errors"):
        return "✅ No errors detected in hierarchy"

    # Header
    lines.append("=" * 70)
    lines.append("🔍 ERROR FLOW ANALYSIS")
    lines.append("=" * 70)
    lines.append("")

    # Summary
    total = error_analysis.get("total_error_nodes", 0)
    impact = error_analysis.get("impact_summary", {})
    lines.append(f"Total error nodes: {total}")
    lines.append(f"Affected: {impact.get('affected_percentage', 0):.1f}% of hierarchy")

    if impact.get("concurrent_failures", 0) > 1:
        lines.append(f"Concurrent failures: {impact['concurrent_failures']}")

    lines.append("")

    # Root Causes
    root_causes = error_analysis.get("root_causes", [])
    if root_causes:
        lines.append("-" * 70)
        lines.append("🔴 ROOT CAUSE(S)")
        lines.append("-" * 70)

        for i, cause in enumerate(root_causes, 1):
            confidence_pct = int(cause.get("confidence", 0) * 100)
            leaf_marker = " (leaf node)" if cause.get("is_leaf") else ""

            lines.append(f"\n {i}. {cause['node_id']}{leaf_marker}")
            lines.append(f" Type: {cause.get('node_type', 'Unknown')}")
            lines.append(f" Errors: {cause.get('error_count', 0)}")
            lines.append(f" Depth: {cause.get('depth', 0)}")
            lines.append(f" Confidence: {confidence_pct}%")

            if cause.get("timestamp"):
                lines.append(f" Time: {cause['timestamp']}")

            if cause.get("path"):
                path_str = " → ".join(cause["path"])
                lines.append(f" Path: {path_str}")

    # Propagation Chains
    if show_chains:
        chains = error_analysis.get("propagation_chains", [])
        if chains:
            lines.append("")
            lines.append("-" * 70)
            lines.append("📈 ERROR PROPAGATION")
            lines.append("-" * 70)

            for chain_data in chains:
                lines.append(f"\n From: {chain_data['root_cause']}")
                lines.append(f" Affected nodes: {chain_data['total_affected']}")
                lines.append(" Chain:")

                chain = chain_data.get("chain", [])
                for j, node in enumerate(chain):
                    is_last = j == len(chain) - 1
                    prefix = " └─" if is_last else " ├─"
                    arrow = " ← ROOT CAUSE" if j == 0 else ""
                    lines.append(
                        f"{prefix} {node['node_id']} ({node['error_count']} errors){arrow}"
                    )

    # Recommendations
    if show_recommendations:
        recommendations = error_analysis.get("recommendations", [])
        if recommendations:
            lines.append("")
            lines.append("-" * 70)
            lines.append("💡 RECOMMENDATIONS")
            lines.append("-" * 70)

            for rec in recommendations:
                lines.append(f" • {rec}")

    lines.append("")
    lines.append("=" * 70)

    return "\n".join(lines)


def detect_correlation_chains(
    files: List[str],
    root_correlation_id: Optional[str] = None,
    chain_patterns: Optional[List[str]] = None,
    parser_format: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Detect correlation ID chaining where one request spawns sub-requests.

    This function identifies parent-child relationships between correlation IDs
    by analyzing log messages for patterns like:
    - "spawning request {child_id}"
    - "child_correlation_id": "xxx"
    - "parent_request_id": "xxx"

    Args:
        files: List of log file paths to analyze
        root_correlation_id: Optional root correlation ID to start from
        chain_patterns: Optional custom regex patterns for detecting chains
        parser_format: Optional log format hint

    Returns:
        Dictionary with correlation chain information:
            {
                "chains": [
                    {
                        "parent_correlation_id": "req-123",
                        "child_correlation_id": "subreq-456",
                        "evidence": "Spawning sub-request subreq-456",
                        "timestamp": "2024-01-15T10:00:00Z",
                        "confidence": 0.9
                    }
                ],
                "root_ids": ["req-123"],
                "hierarchy": {
                    "req-123": ["subreq-456", "subreq-789"],
                    "subreq-456": ["subreq-456-a"]
                },
                "total_chains": 3
            }

    Example:
        chains = detect_correlation_chains(
            files=["app.log", "service.log"],
            root_correlation_id="req-main-001"
        )
        for chain in chains['chains']:
            print(f"{chain['parent_correlation_id']} -> {chain['child_correlation_id']}")
    """
    # Default patterns to detect correlation chaining
    default_patterns = [
        # Explicit field patterns
        r'child_correlation_id["\s:=]+([a-zA-Z0-9_-]+)',
        r'parent_correlation_id["\s:=]+([a-zA-Z0-9_-]+)',
        r'parent_request_id["\s:=]+([a-zA-Z0-9_-]+)',
        r'spawned_request["\s:=]+([a-zA-Z0-9_-]+)',
        # Message patterns
        r"[Ss]pawning (?:sub-?)?request[:\s]+([a-zA-Z0-9_-]+)",
        r"[Cc]reating child request[:\s]+([a-zA-Z0-9_-]+)",
        r"[Ff]orked to[:\s]+([a-zA-Z0-9_-]+)",
        r"[Dd]elegating to[:\s]+([a-zA-Z0-9_-]+)",
        r"[Ss]ub-?request[:\s]+([a-zA-Z0-9_-]+)",
    ]

    patterns = chain_patterns or default_patterns
    compiled_patterns = [re.compile(p) for p in patterns]

    # Read and parse logs
    entries = []
    if RUST_AVAILABLE:
        for file_path in files:
            result_json = logler_rs.search(
                [file_path],
                "",  # No query filter
                None,  # level
                None,  # thread_id
                None,  # correlation_id
                None,  # trace_id
                None,  # start_time
                None,  # end_time
                10000,  # limit - get many entries
                0,  # offset
            )
            result = json.loads(result_json)
            entries.extend(result.get("entries", []))
    else:
        # Fallback to Python parsing
        from .parser import LogParser

        parser = LogParser()
        for file_path in files:
            with open(file_path, "r") as f:
                for line in f:
                    entry = parser.parse_line(line)
                    if entry:
                        entries.append(entry.__dict__ if hasattr(entry, "__dict__") else entry)

    # Detect chains
    chains = []
    hierarchy = defaultdict(list)
    all_correlation_ids = set()

    for entry in entries:
        correlation_id = entry.get("correlation_id")
        message = entry.get("message", "")
        timestamp = entry.get("timestamp")
        fields = entry.get("fields", {})

        if correlation_id:
            all_correlation_ids.add(correlation_id)

        # Check explicit fields first
        child_id = fields.get("child_correlation_id") or fields.get("spawned_request")
        parent_id = fields.get("parent_correlation_id") or fields.get("parent_request_id")

        if child_id and correlation_id:
            chains.append(
                {
                    "parent_correlation_id": correlation_id,
                    "child_correlation_id": child_id,
                    "evidence": f"Explicit field: child_correlation_id={child_id}",
                    "timestamp": timestamp,
                    "confidence": 1.0,
                }
            )
            hierarchy[correlation_id].append(child_id)
            all_correlation_ids.add(child_id)

        if parent_id and correlation_id:
            chains.append(
                {
                    "parent_correlation_id": parent_id,
                    "child_correlation_id": correlation_id,
                    "evidence": f"Explicit field: parent_correlation_id={parent_id}",
                    "timestamp": timestamp,
                    "confidence": 1.0,
                }
            )
            hierarchy[parent_id].append(correlation_id)
            all_correlation_ids.add(parent_id)

        # Check message patterns
        for pattern in compiled_patterns:
            match = pattern.search(message)
            if match and correlation_id:
                detected_id = match.group(1)
                if detected_id != correlation_id:
                    # Determine if it's a parent or child reference
                    if "parent" in pattern.pattern.lower():
                        chains.append(
                            {
                                "parent_correlation_id": detected_id,
                                "child_correlation_id": correlation_id,
                                "evidence": f"Pattern match in message: {match.group(0)}",
                                "timestamp": timestamp,
                                "confidence": 0.85,
                            }
                        )
                        hierarchy[detected_id].append(correlation_id)
                    else:
                        chains.append(
                            {
                                "parent_correlation_id": correlation_id,
                                "child_correlation_id": detected_id,
                                "evidence": f"Pattern match in message: {match.group(0)}",
                                "timestamp": timestamp,
                                "confidence": 0.85,
                            }
                        )
                        hierarchy[correlation_id].append(detected_id)
                    all_correlation_ids.add(detected_id)

    # Deduplicate chains
    seen = set()
    unique_chains = []
    for chain in chains:
        key = (chain["parent_correlation_id"], chain["child_correlation_id"])
        if key not in seen:
            seen.add(key)
            unique_chains.append(chain)

    # Find root IDs (correlation IDs that are never a child)
    all_children = set()
    for children in hierarchy.values():
        all_children.update(children)

    root_ids = [cid for cid in all_correlation_ids if cid not in all_children]

    # Filter by root_correlation_id if specified
    if root_correlation_id:
        # Build the tree from root
        def get_descendants(cid: str, seen: set) -> set:
            if cid in seen:
                return set()
            seen.add(cid)
            result = {cid}
            for child in hierarchy.get(cid, []):
                result.update(get_descendants(child, seen))
            return result

        relevant_ids = get_descendants(root_correlation_id, set())
        unique_chains = [
            c
            for c in unique_chains
            if c["parent_correlation_id"] in relevant_ids
            or c["child_correlation_id"] in relevant_ids
        ]
        root_ids = [root_correlation_id] if root_correlation_id in root_ids else []

    # Convert hierarchy to regular dict
    hierarchy_dict = {k: list(set(v)) for k, v in hierarchy.items()}

    return {
        "chains": unique_chains,
        "root_ids": sorted(root_ids),
        "hierarchy": hierarchy_dict,
        "total_chains": len(unique_chains),
        "total_correlation_ids": len(all_correlation_ids),
    }
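
# Illustrative messages the default patterns above would match (hypothetical
# log content):
#
#     "Spawning sub-request: subreq-456"     -> subreq-456 becomes a child
#     'child_correlation_id="subreq-456"'    -> explicit child field
#     "Delegating to: billing-call-7"        -> billing-call-7 becomes a child
#
# Message-pattern matches are recorded with confidence 0.85; explicit
# parent/child fields with confidence 1.0.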


def build_hierarchy_with_correlation_chains(
    files: List[str],
    root_identifier: str,
    include_correlation_chains: bool = True,
    max_depth: Optional[int] = None,
    use_naming_patterns: bool = True,
    use_temporal_inference: bool = True,
    min_confidence: float = 0.0,
) -> Dict[str, Any]:
    """
    Build hierarchy that includes correlation ID chaining relationships.

    This extends follow_thread_hierarchy by also detecting when one correlation ID
    spawns sub-requests with different correlation IDs.

    Args:
        files: List of log file paths
        root_identifier: Root correlation ID, thread ID, or span ID
        include_correlation_chains: Whether to detect correlation chaining (default: True)
        max_depth: Maximum hierarchy depth
        use_naming_patterns: Enable naming pattern detection
        use_temporal_inference: Enable temporal inference
        min_confidence: Minimum confidence score

    Returns:
        Hierarchy dictionary with additional correlation chain information

    Example:
        hierarchy = build_hierarchy_with_correlation_chains(
            files=["api.log", "worker.log"],
            root_identifier="req-main-001",
            include_correlation_chains=True
        )
        # hierarchy now includes sub-requests spawned by req-main-001
    """
    # First build the regular hierarchy
    hierarchy = follow_thread_hierarchy(
        files=files,
        root_identifier=root_identifier,
        max_depth=max_depth,
        use_naming_patterns=use_naming_patterns,
        use_temporal_inference=use_temporal_inference,
        min_confidence=min_confidence,
    )

    if not include_correlation_chains:
        return hierarchy

    # Detect correlation chains
    chains = detect_correlation_chains(files=files, root_correlation_id=root_identifier)

    # Add chain information to hierarchy
    hierarchy["correlation_chains"] = chains["chains"]
    hierarchy["chained_correlation_ids"] = list(chains["hierarchy"].keys())

    # If there are chained correlation IDs, we could optionally merge their hierarchies.
    # For now, just add metadata about them.
    if chains["total_chains"] > 0:
        hierarchy["has_correlation_chains"] = True
        hierarchy["correlation_chain_count"] = chains["total_chains"]

        # Add note about additional correlation IDs that could be explored
        child_ids = set()
        for chain in chains["chains"]:
            child_ids.add(chain["child_correlation_id"])

        hierarchy["related_correlation_ids"] = sorted(child_ids)

    return hierarchy


def analyze_bottlenecks(
    hierarchy: Dict[str, Any],
    threshold_percentage: float = 20.0,
) -> Dict[str, Any]:
    """
    AI-powered bottleneck detection with optimization suggestions.

    Analyzes hierarchy to identify:
    - Primary bottleneck (longest duration)
    - Secondary bottlenecks
    - Potential parallelization opportunities
    - Caching opportunities
    - Circuit breaker recommendations

    Args:
        hierarchy: Hierarchy from follow_thread_hierarchy()
        threshold_percentage: Minimum % of total time to be considered significant

    Returns:
        Dictionary with bottleneck analysis:
            {
                "primary_bottleneck": {...},
                "secondary_bottlenecks": [...],
                "optimization_suggestions": [...],
                "parallelization_opportunities": [...],
                "estimated_improvement_ms": float
            }

    Example:
        hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
        analysis = analyze_bottlenecks(hierarchy)
        for suggestion in analysis['optimization_suggestions']:
            print(f" - {suggestion}")
    """
    result = {
        "primary_bottleneck": None,
        "secondary_bottlenecks": [],
        "optimization_suggestions": [],
        "parallelization_opportunities": [],
        "caching_opportunities": [],
        "estimated_improvement_ms": 0,
    }

    total_duration = hierarchy.get("total_duration_ms", 0)
    if total_duration <= 0:
        return result

    bottleneck = hierarchy.get("bottleneck")
    if bottleneck:
        result["primary_bottleneck"] = bottleneck

    # Collect all nodes with timing
    all_nodes = []

    def collect_nodes(node: Dict[str, Any]):
        duration = node.get("duration_ms", 0)
        if duration and duration > 0:
            percentage = (duration / total_duration) * 100
            all_nodes.append(
                {
                    "id": node.get("id"),
                    "duration_ms": duration,
                    "percentage": percentage,
                    "depth": node.get("depth", 0),
                    "children_count": len(node.get("children", [])),
                    "is_leaf": len(node.get("children", [])) == 0,
                    "error_count": node.get("error_count", 0),
                }
            )
        for child in node.get("children", []):
            collect_nodes(child)

    for root in hierarchy.get("roots", []):
        collect_nodes(root)

    # Sort by duration
    all_nodes.sort(key=lambda x: -x["duration_ms"])

    # Find secondary bottlenecks
    for node in all_nodes[1:5]:  # Top 5 excluding primary
        if node["percentage"] >= threshold_percentage:
            result["secondary_bottlenecks"].append(node)

    # Generate optimization suggestions
    suggestions = []

    # Check for parallelization opportunities
    # Look for siblings at same depth with no dependencies
    depth_groups = defaultdict(list)
    for node in all_nodes:
        depth_groups[node["depth"]].append(node)

    for depth, nodes in depth_groups.items():
        if len(nodes) >= 2:
            total_sibling_time = sum(n["duration_ms"] for n in nodes)
            max_sibling_time = max(n["duration_ms"] for n in nodes)
            savings = total_sibling_time - max_sibling_time

            if savings > total_duration * 0.1:  # >10% potential savings
                sibling_names = [n["id"] for n in nodes[:3]]
                result["parallelization_opportunities"].append(
                    {
                        "depth": depth,
                        "nodes": sibling_names,
                        "potential_savings_ms": savings,
                    }
                )
                suggestions.append(
                    f"Parallelize operations at depth {depth} ({', '.join(sibling_names[:2])}) - "
                    f"potential savings: {savings:.0f}ms"
                )

    # Check for caching opportunities (repeated patterns)
    leaf_nodes = [n for n in all_nodes if n["is_leaf"]]
    if len(leaf_nodes) > 3:
        avg_leaf_time = sum(n["duration_ms"] for n in leaf_nodes) / len(leaf_nodes)
        slow_leaves = [n for n in leaf_nodes if n["duration_ms"] > avg_leaf_time * 2]
        if slow_leaves:
            suggestions.append(
                f"Consider caching for slow leaf operations: {', '.join(n['id'] for n in slow_leaves[:3])}"
            )
            result["caching_opportunities"] = [n["id"] for n in slow_leaves[:3]]

    # Primary bottleneck specific suggestions
    if bottleneck:
        percentage = bottleneck.get("percentage", 0)
        if percentage > 50:
            suggestions.append(
                f"CRITICAL: {bottleneck['node_id']} takes {percentage:.0f}% of total time - prioritize optimization"
            )
        elif percentage > 30:
            suggestions.append(
                f"IMPORTANT: Consider optimizing {bottleneck['node_id']} ({percentage:.0f}% of time)"
            )

        if bottleneck.get("depth", 0) > 2:
            suggestions.append(
                f"Bottleneck is deep in call stack (depth {bottleneck['depth']}) - consider moving to async"
            )

    # Check for error-prone bottlenecks
    error_nodes = [n for n in all_nodes if n["error_count"] > 0 and n["percentage"] > 10]
    for node in error_nodes:
        suggestions.append(
            f"Add circuit breaker for {node['id']} - errors detected and {node['percentage']:.0f}% of time"
        )

    result["optimization_suggestions"] = suggestions

    # Estimate potential improvement
    if result["parallelization_opportunities"]:
        result["estimated_improvement_ms"] = sum(
            p["potential_savings_ms"] for p in result["parallelization_opportunities"]
        )

    return result
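
# Worked arithmetic (illustrative): three siblings at one depth taking 800ms,
# 600ms and 600ms run for 2000ms sequentially but only max(...) = 800ms in
# parallel, so potential_savings_ms = 2000 - 800 = 1200, which is reported
# whenever it exceeds 10% of total_duration_ms.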

def diff_hierarchies(
    hierarchy_a: Dict[str, Any],
    hierarchy_b: Dict[str, Any],
    label_a: str = "Before",
    label_b: str = "After",
) -> Dict[str, Any]:
    """
    Compare two hierarchies to identify performance changes.

    Useful for before/after deployment comparisons, A/B testing,
    or debugging performance regressions.

    Args:
        hierarchy_a: First hierarchy (baseline)
        hierarchy_b: Second hierarchy (comparison)
        label_a: Label for first hierarchy
        label_b: Label for second hierarchy

    Returns:
        Dictionary with comparison results:
        {
            "summary": {
                "total_duration_change_ms": float,
                "total_duration_change_pct": float,
                "node_count_change": int,
                "new_errors": int,
                "resolved_errors": int
            },
            "improved_nodes": [...],
            "degraded_nodes": [...],
            "new_nodes": [...],
            "removed_nodes": [...],
            "error_changes": {...}
        }

    Example:
        before = follow_thread_hierarchy(files=["before.log"], root_identifier="req-123")
        after = follow_thread_hierarchy(files=["after.log"], root_identifier="req-123")
        diff = diff_hierarchies(before, after)
        print(f"Performance change: {diff['summary']['total_duration_change_pct']:.1f}%")
    """
    result = {
        "label_a": label_a,
        "label_b": label_b,
        "summary": {
            "total_duration_change_ms": 0,
            "total_duration_change_pct": 0,
            "node_count_change": 0,
            "new_errors": 0,
            "resolved_errors": 0,
        },
        "improved_nodes": [],
        "degraded_nodes": [],
        "new_nodes": [],
        "removed_nodes": [],
        "error_changes": {
            "new_errors": [],
            "resolved_errors": [],
        },
    }

    # Collect nodes from both hierarchies
    def collect_nodes(hierarchy: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        nodes = {}

        def walk(node: Dict[str, Any]):
            node_id = node.get("id")
            if node_id:
                nodes[node_id] = {
                    "duration_ms": node.get("duration_ms", 0),
                    "error_count": node.get("error_count", 0),
                    "entry_count": node.get("entry_count", 0),
                }
            for child in node.get("children", []):
                walk(child)

        for root in hierarchy.get("roots", []):
            walk(root)

        return nodes

    nodes_a = collect_nodes(hierarchy_a)
    nodes_b = collect_nodes(hierarchy_b)

    # Duration changes
    duration_a = hierarchy_a.get("total_duration_ms", 0)
    duration_b = hierarchy_b.get("total_duration_ms", 0)

    result["summary"]["total_duration_change_ms"] = duration_b - duration_a
    if duration_a > 0:
        result["summary"]["total_duration_change_pct"] = (
            (duration_b - duration_a) / duration_a * 100
        )

    # Node count changes
    result["summary"]["node_count_change"] = len(nodes_b) - len(nodes_a)

    # Compare individual nodes
    all_node_ids = set(nodes_a.keys()) | set(nodes_b.keys())

    for node_id in all_node_ids:
        in_a = node_id in nodes_a
        in_b = node_id in nodes_b

        if in_a and not in_b:
            result["removed_nodes"].append(
                {
                    "id": node_id,
                    "duration_ms": nodes_a[node_id]["duration_ms"],
                }
            )
        elif in_b and not in_a:
            result["new_nodes"].append(
                {
                    "id": node_id,
                    "duration_ms": nodes_b[node_id]["duration_ms"],
                }
            )
        else:
            # Both exist - compare
            dur_a = nodes_a[node_id]["duration_ms"]
            dur_b = nodes_b[node_id]["duration_ms"]
            change_ms = dur_b - dur_a
            change_pct = ((dur_b - dur_a) / dur_a * 100) if dur_a > 0 else 0

            if change_ms < -10:  # >10ms improvement
                result["improved_nodes"].append(
                    {
                        "id": node_id,
                        "before_ms": dur_a,
                        "after_ms": dur_b,
                        "change_ms": change_ms,
                        "change_pct": change_pct,
                    }
                )
            elif change_ms > 10:  # >10ms degradation
                result["degraded_nodes"].append(
                    {
                        "id": node_id,
                        "before_ms": dur_a,
                        "after_ms": dur_b,
                        "change_ms": change_ms,
                        "change_pct": change_pct,
                    }
                )

            # Error changes
            err_a = nodes_a[node_id]["error_count"]
            err_b = nodes_b[node_id]["error_count"]

            if err_a == 0 and err_b > 0:
                result["error_changes"]["new_errors"].append(node_id)
                result["summary"]["new_errors"] += 1
            elif err_a > 0 and err_b == 0:
                result["error_changes"]["resolved_errors"].append(node_id)
                result["summary"]["resolved_errors"] += 1

    # Sort by impact
    result["improved_nodes"].sort(key=lambda x: x["change_ms"])
    result["degraded_nodes"].sort(key=lambda x: -x["change_ms"])

    return result


def format_hierarchy_diff(diff: Dict[str, Any]) -> str:
    """
    Format hierarchy diff as human-readable text.

    Args:
        diff: Diff from diff_hierarchies()

    Returns:
        Formatted diff string
    """
    lines = []

    lines.append("=" * 70)
    lines.append("📊 HIERARCHY COMPARISON")
    lines.append(f"   {diff['label_a']} vs {diff['label_b']}")
    lines.append("=" * 70)

    summary = diff["summary"]
    change_ms = summary["total_duration_change_ms"]
    change_pct = summary["total_duration_change_pct"]

    direction = "⬇️ IMPROVED" if change_ms < 0 else "⬆️ DEGRADED" if change_ms > 0 else "➡️ UNCHANGED"
    lines.append(f"\n{direction}: {abs(change_ms):.0f}ms ({abs(change_pct):.1f}%)")

    if summary["new_errors"] > 0:
        lines.append(f"❌ New errors: {summary['new_errors']}")
    if summary["resolved_errors"] > 0:
        lines.append(f"✅ Resolved errors: {summary['resolved_errors']}")

    if diff["improved_nodes"]:
        lines.append("\n" + "-" * 70)
        lines.append("✅ IMPROVED NODES")
        for node in diff["improved_nodes"][:5]:
            lines.append(
                f"  • {node['id']}: {node['before_ms']:.0f}ms → {node['after_ms']:.0f}ms "
                f"({node['change_pct']:.1f}%)"
            )

    if diff["degraded_nodes"]:
        lines.append("\n" + "-" * 70)
        lines.append("⚠️ DEGRADED NODES")
        for node in diff["degraded_nodes"][:5]:
            lines.append(
                f"  • {node['id']}: {node['before_ms']:.0f}ms → {node['after_ms']:.0f}ms "
                f"(+{node['change_pct']:.1f}%)"
            )

    if diff["new_nodes"]:
        lines.append("\n" + "-" * 70)
        lines.append("🆕 NEW NODES")
        for node in diff["new_nodes"][:5]:
            lines.append(f"  • {node['id']}: {node['duration_ms']:.0f}ms")

    if diff["removed_nodes"]:
        lines.append("\n" + "-" * 70)
        lines.append("🗑️ REMOVED NODES")
        for node in diff["removed_nodes"][:5]:
            lines.append(f"  • {node['id']}: was {node['duration_ms']:.0f}ms")

    lines.append("\n" + "=" * 70)

    return "\n".join(lines)

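
# --- Illustrative sketch added by the editor; not part of the released module. ---
# Minimal end-to-end use of diff_hierarchies() + format_hierarchy_diff(). The two
# hierarchies are hand-built stand-ins for follow_thread_hierarchy() output; only
# "roots", "total_duration_ms", and the per-node fields read by collect_nodes()
# are populated, with invented timings.
def _example_hierarchy_diff() -> str:
    node = {"id": "db.query", "duration_ms": 400, "error_count": 0, "entry_count": 4, "children": []}
    before = {
        "total_duration_ms": 500,
        "roots": [{"id": "handler", "duration_ms": 500, "error_count": 0,
                   "entry_count": 10, "children": [node]}],
    }
    after = {
        "total_duration_ms": 320,
        "roots": [{"id": "handler", "duration_ms": 320, "error_count": 0,
                   "entry_count": 10, "children": [dict(node, duration_ms=220)]}],
    }
    diff = diff_hierarchies(before, after, label_a="v1.0", label_b="v1.1")
    return format_hierarchy_diff(diff)  # both nodes land in improved_nodes
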

def export_to_jaeger(
    hierarchy: Dict[str, Any],
    service_name: str = "logler-export",
) -> Dict[str, Any]:
    """
    Export hierarchy to Jaeger-compatible format.

    The output follows the Jaeger JSON format and can be imported
    into Jaeger UI for visualization.

    Args:
        hierarchy: Hierarchy from follow_thread_hierarchy()
        service_name: Name of the service for Jaeger

    Returns:
        Dictionary in Jaeger trace format

    Example:
        hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
        jaeger_trace = export_to_jaeger(hierarchy, service_name="my-service")

        with open("trace.json", "w") as f:
            json.dump(jaeger_trace, f)

        # Import with: jaeger-query --grpc.host-port=localhost:16685
    """
    import uuid
    from datetime import datetime

    trace_id = uuid.uuid4().hex[:32]
    spans = []

    def convert_node(node: Dict[str, Any], parent_span_id: Optional[str] = None):
        span_id = uuid.uuid4().hex[:16]

        # Parse timestamps
        start_time = node.get("start_time")
        if start_time:
            if isinstance(start_time, str):
                try:
                    dt = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
                    start_us = int(dt.timestamp() * 1_000_000)
                except Exception:
                    start_us = 0
            else:
                start_us = 0
        else:
            start_us = 0

        duration_us = int((node.get("duration_ms", 0) or 0) * 1000)

        span = {
            "traceID": trace_id,
            "spanID": span_id,
            "operationName": node.get("id", "unknown"),
            "references": [],
            "startTime": start_us,
            "duration": duration_us,
            "tags": [
                {"key": "node_type", "type": "string", "value": node.get("node_type", "unknown")},
                {"key": "entry_count", "type": "int64", "value": node.get("entry_count", 0)},
                {"key": "error_count", "type": "int64", "value": node.get("error_count", 0)},
            ],
            "logs": [],
            "processID": "p1",
            "warnings": [],
        }

        if parent_span_id:
            span["references"].append(
                {
                    "refType": "CHILD_OF",
                    "traceID": trace_id,
                    "spanID": parent_span_id,
                }
            )

        if node.get("error_count", 0) > 0:
            span["tags"].append({"key": "error", "type": "bool", "value": True})

        spans.append(span)

        # Process children
        for child in node.get("children", []):
            convert_node(child, span_id)

    # Convert all roots
    for root in hierarchy.get("roots", []):
        convert_node(root)

    return {
        "data": [
            {
                "traceID": trace_id,
                "spans": spans,
                "processes": {
                    "p1": {
                        "serviceName": service_name,
                        "tags": [
                            {"key": "exported_by", "type": "string", "value": "logler"},
                        ],
                    }
                },
                "warnings": [],
            }
        ]
    }

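
# --- Illustrative sketch added by the editor; not part of the released module. ---
# The Jaeger payload encodes the tree through CHILD_OF references rather than
# nesting, so consumers have to re-link spans themselves. This rebuilds a
# parent-span-id -> child-span-ids mapping from an export_to_jaeger() result;
# root spans group under the None key.
def _example_jaeger_children(jaeger_trace: Dict[str, Any]) -> Dict[Optional[str], List[str]]:
    children: Dict[Optional[str], List[str]] = defaultdict(list)
    for span in jaeger_trace["data"][0]["spans"]:
        refs = span.get("references", [])
        parent = refs[0]["spanID"] if refs else None
        children[parent].append(span["spanID"])
    return dict(children)
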

def export_to_zipkin(
    hierarchy: Dict[str, Any],
    service_name: str = "logler-export",
) -> List[Dict[str, Any]]:
    """
    Export hierarchy to Zipkin-compatible format.

    Args:
        hierarchy: Hierarchy from follow_thread_hierarchy()
        service_name: Name of the service

    Returns:
        List of spans in Zipkin V2 format

    Example:
        hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
        zipkin_spans = export_to_zipkin(hierarchy)

        # POST to Zipkin: curl -X POST http://localhost:9411/api/v2/spans -H 'Content-Type: application/json' -d '@spans.json'
    """
    import uuid
    from datetime import datetime

    trace_id = uuid.uuid4().hex[:32]
    spans = []

    def convert_node(node: Dict[str, Any], parent_id: Optional[str] = None):
        span_id = uuid.uuid4().hex[:16]

        # Parse timestamp
        start_time = node.get("start_time")
        timestamp_us = 0
        if start_time:
            if isinstance(start_time, str):
                try:
                    dt = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
                    timestamp_us = int(dt.timestamp() * 1_000_000)
                except Exception:
                    pass

        duration_us = int((node.get("duration_ms", 0) or 0) * 1000)

        span = {
            "traceId": trace_id,
            "id": span_id,
            "name": node.get("id", "unknown"),
            "timestamp": timestamp_us,
            "duration": duration_us,
            "localEndpoint": {
                "serviceName": service_name,
            },
            "tags": {
                "node_type": node.get("node_type", "unknown"),
                "entry_count": str(node.get("entry_count", 0)),
            },
        }

        if parent_id:
            span["parentId"] = parent_id

        if node.get("error_count", 0) > 0:
            span["tags"]["error"] = "true"

        spans.append(span)

        for child in node.get("children", []):
            convert_node(child, span_id)

    for root in hierarchy.get("roots", []):
        convert_node(root)

    return spans

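
# --- Illustrative sketch added by the editor; not part of the released module. ---
# Sending the exported spans to a collector over the standard Zipkin v2
# ingestion endpoint (the same URL as the docstring's curl example). The
# localhost address is an assumption for illustration; a healthy collector
# answers 202 Accepted.
def _example_post_to_zipkin(spans: List[Dict[str, Any]]) -> int:
    import urllib.request

    req = urllib.request.Request(
        "http://localhost:9411/api/v2/spans",
        data=json.dumps(spans).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return resp.status
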

def find_patterns(
    files: List[str],
    min_occurrences: int = 3,
    parser_format: Optional[str] = None,
    custom_regex: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Find repeated patterns and anomalies in logs.

    Args:
        files: List of log file paths
        min_occurrences: Minimum number of occurrences to consider a pattern
        parser_format: Optional parser format override; routes through Investigator
        custom_regex: Optional custom parsing regex; routes through Investigator

    Returns:
        Dictionary with patterns:
        {
            "patterns": [
                {
                    "pattern": "...",
                    "occurrences": 15,
                    "first_seen": "...",
                    "last_seen": "...",
                    "affected_threads": [...],
                    "examples": [...]
                }
            ]
        }
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    if parser_format or custom_regex:
        inv = Investigator()
        inv.load_files(files, parser_format=parser_format, custom_regex=custom_regex)
        return inv.find_patterns(min_occurrences=min_occurrences)

    result_json = logler_rs.find_patterns(files, min_occurrences)
    result = json.loads(result_json)
    _normalize_pattern_examples(result)
    _apply_custom_regex_to_results(result, custom_regex)
    return result

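
# --- Illustrative sketch added by the editor; not part of the released module. ---
# Typical consumption of find_patterns(): surface the noisiest repeated
# messages first. Field names follow the docstring above; "app.log" is a
# placeholder path.
def _example_top_patterns(limit: int = 5) -> List[str]:
    report = find_patterns(files=["app.log"], min_occurrences=3)
    patterns = sorted(report.get("patterns", []), key=lambda p: -p["occurrences"])
    return [f"{p['occurrences']:>5}x {p['pattern']}" for p in patterns[:limit]]
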

def get_metadata(files: List[str]) -> Dict[str, Any]:
    """
    Get metadata about log files.

    Args:
        files: List of log file paths

    Returns:
        List of file metadata:
        [
            {
                "path": "...",
                "size_bytes": 12345,
                "lines": 5000,
                "format": "json",
                "time_range": {...},
                "available_fields": [...],
                "unique_threads": 8,
                "unique_correlation_ids": 123,
                "log_levels": {...}
            }
        ]
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    result_json = logler_rs.get_metadata(files)
    return json.loads(result_json)


# Advanced API using Investigator class
class Investigator:
    """
    Advanced investigation API with persistent index.

    Use this when you need to perform multiple operations on the same files
    for better performance.

    Example:
        investigator = Investigator()
        investigator.load_files(["app.log", "api.log"])

        results = investigator.search(query="error", limit=10)
        patterns = investigator.find_patterns(min_occurrences=5)
        metadata = investigator.get_metadata()
    """

    def __init__(self):
        if not RUST_AVAILABLE:
            raise RuntimeError("Rust backend not available")
        self._investigator = logler_rs.PyInvestigator()
        self._files = []
        self._custom_regex = None

    def load_files(
        self,
        files: List[str],
        parser_format: Optional[str] = None,
        custom_regex: Optional[str] = None,
    ):
        """Load log files and build index."""
        _load_files_with_config(self._investigator, files, parser_format, custom_regex)
        self._files = files
        self._custom_regex = custom_regex

    def search(
        self,
        query: Optional[str] = None,
        level: Optional[str] = None,
        thread_id: Optional[str] = None,
        correlation_id: Optional[str] = None,
        limit: Optional[int] = None,
        context_lines: int = 3,
    ) -> Dict[str, Any]:
        """Search loaded files."""
        filters = {"levels": []}
        if level:
            filters["levels"] = [level.upper()]
        if thread_id:
            filters["thread_id"] = thread_id
        if correlation_id:
            filters["correlation_id"] = correlation_id

        query_dict = {
            "files": self._files,
            "query": query,
            "filters": filters,
            "limit": limit,
            "context_lines": context_lines,
        }

        result_json = self._investigator.search(json.dumps(query_dict))
        result = json.loads(result_json)
        _normalize_search_result_levels(result)
        _apply_custom_regex_to_results(result, self._custom_regex)
        return result

    def follow_thread(
        self,
        thread_id: Optional[str] = None,
        correlation_id: Optional[str] = None,
        trace_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Follow thread in loaded files."""
        result_json = self._investigator.follow_thread(
            self._files, thread_id, correlation_id, trace_id
        )
        result = json.loads(result_json)
        _normalize_entries(result.get("entries", []))
        return result

    def find_patterns(self, min_occurrences: int = 3) -> Dict[str, Any]:
        """Find patterns in loaded files."""
        result_json = self._investigator.find_patterns(self._files, min_occurrences)
        result = json.loads(result_json)
        _normalize_pattern_examples(result)
        return result

    def get_metadata(self) -> Dict[str, Any]:
        """Get metadata for loaded files."""
        result_json = self._investigator.get_metadata(self._files)
        return json.loads(result_json)

    def get_context(
        self,
        file: str,
        line_number: int,
        lines_before: int = 10,
        lines_after: int = 10,
    ) -> Dict[str, Any]:
        """Get context around a line."""
        result_json = self._investigator.get_context(
            file, line_number, lines_before, lines_after, False
        )
        result = json.loads(result_json)
        _normalize_context_payload(result)
        return result

    def sql_query(self, query: str) -> List[Dict[str, Any]]:
        """
        Execute SQL query on loaded logs (requires 'sql' feature).

        Args:
            query: SQL query string

        Returns:
            List of result rows as dictionaries

        Example:
            results = investigator.sql_query(\"\"\"
                SELECT level, COUNT(*) as count
                FROM logs
                GROUP BY level
                ORDER BY count DESC
            \"\"\")
        """
        if not hasattr(self._investigator, "sql_query"):
            raise RuntimeError("SQL feature not available. Build with --features sql")
        result_json = self._investigator.sql_query(query)
        return json.loads(result_json)

    def sql_tables(self) -> List[str]:
        """Get list of available SQL tables (requires 'sql' feature)."""
        if not hasattr(self._investigator, "sql_tables"):
            raise RuntimeError("SQL feature not available. Build with --features sql")
        return self._investigator.sql_tables()

    def sql_schema(self, table: str) -> List[Dict[str, Any]]:
        """Get schema for a SQL table (requires 'sql' feature)."""
        if not hasattr(self._investigator, "sql_schema"):
            raise RuntimeError("SQL feature not available. Build with --features sql")
        result_json = self._investigator.sql_schema(table)
        return json.loads(result_json)

    def build_hierarchy(
        self,
        root_identifier: str,
        max_depth: Optional[int] = None,
        use_naming_patterns: bool = True,
        use_temporal_inference: bool = True,
        min_confidence: float = 0.0,
    ) -> Dict[str, Any]:
        """
        Build hierarchical tree of threads/spans from loaded files.

        Args:
            root_identifier: Root thread ID, correlation ID, or span ID
            max_depth: Maximum depth of hierarchy tree
            use_naming_patterns: Enable naming pattern detection
            use_temporal_inference: Enable time-based inference
            min_confidence: Minimum confidence score (0.0-1.0)

        Returns:
            Hierarchy dictionary (see follow_thread_hierarchy for structure)

        Example:
            inv = Investigator()
            inv.load_files(["app.log"])
            hierarchy = inv.build_hierarchy(root_identifier="req-123")
            summary = get_hierarchy_summary(hierarchy)
            print(summary)
        """
        result_json = self._investigator.build_hierarchy(
            self._files,
            root_identifier,
            max_depth,
            use_naming_patterns,
            use_temporal_inference,
            min_confidence,
        )
        return json.loads(result_json)

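
# --- Illustrative sketch added by the editor; not part of the released module. ---
# Reusing one Investigator index for several operations, and degrading
# gracefully when the optional 'sql' feature is not compiled in. "app.log"
# is a placeholder path.
def _example_reuse_index() -> Dict[str, Any]:
    inv = Investigator()
    inv.load_files(["app.log"])
    out: Dict[str, Any] = {
        "errors": inv.search(level="ERROR", limit=5),
        "patterns": inv.find_patterns(min_occurrences=5),
    }
    try:
        out["level_counts"] = inv.sql_query(
            "SELECT level, COUNT(*) AS count FROM logs GROUP BY level ORDER BY count DESC"
        )
    except RuntimeError:
        out["level_counts"] = None  # SQL feature unavailable in this build
    return out
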

# Advanced LLM-optimized features


def cross_service_timeline(
    files: Dict[str, List[str]],
    time_window: Optional[Tuple[str, str]] = None,
    correlation_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    limit: Optional[int] = None,
    parser_format: Optional[str] = None,
    custom_regex: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Create a unified timeline across multiple services/log files.

    This is perfect for investigating distributed systems where a single
    request flows through multiple services (API Gateway → Auth → Database → Cache).

    Args:
        files: Dictionary mapping service names to log file lists
            e.g., {"api": ["api.log"], "database": ["db.log"], "cache": ["cache.log"]}
        time_window: Optional tuple of (start_time, end_time) in ISO format
        correlation_id: Filter to specific correlation ID
        trace_id: Filter to specific trace ID
        limit: Maximum number of entries to return

    Returns:
        Dictionary with unified timeline:
        {
            "timeline": [
                {
                    "service": "api",
                    "timestamp": "2024-01-01T10:30:15.123Z",
                    "entry": {...},
                    "relative_time_ms": 0
                },
                {
                    "service": "database",
                    "timestamp": "2024-01-01T10:30:15.456Z",
                    "entry": {...},
                    "relative_time_ms": 333
                },
                ...
            ],
            "services": ["api", "database", "cache"],
            "total_entries": 42,
            "duration_ms": 1523,
            "service_breakdown": {
                "api": 15,
                "database": 20,
                "cache": 7
            }
        }

    Example:
        # Investigate a failed request across services
        timeline = cross_service_timeline(
            files={
                "api": ["logs/api.log"],
                "auth": ["logs/auth.log"],
                "db": ["logs/db.log"]
            },
            correlation_id="req-12345"
        )

        # See the flow
        for entry in timeline['timeline']:
            print(f"[{entry['service']:10s}] +{entry['relative_time_ms']:4d}ms: {entry['entry']['message']}")
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    # Collect entries from all services
    all_entries = []
    service_counts = defaultdict(int)

    for service_name, service_files in files.items():
        if correlation_id:
            result = follow_thread(service_files, correlation_id=correlation_id, trace_id=trace_id)
            entries = result.get("entries", [])
        elif trace_id:
            result = follow_thread(service_files, trace_id=trace_id)
            entries = result.get("entries", [])
        else:
            # Get all entries
            result = search(
                service_files, limit=None, parser_format=parser_format, custom_regex=custom_regex
            )
            entries = [r["entry"] for r in result.get("results", [])]

        # Add service label to each entry
        for entry in entries:
            # Parse timestamp if present
            timestamp_str = entry.get("timestamp")
            if timestamp_str:
                try:
                    timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
                except (ValueError, TypeError):
                    timestamp = None
            else:
                timestamp = None

            all_entries.append(
                {
                    "service": service_name,
                    "timestamp": timestamp,
                    "timestamp_str": timestamp_str,
                    "entry": entry,
                }
            )
            service_counts[service_name] += 1

    # Filter by time window if specified
    if time_window:
        start_time, end_time = time_window
        try:
            start_dt = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
            end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00"))
            all_entries = [
                e for e in all_entries if e["timestamp"] and start_dt <= e["timestamp"] <= end_dt
            ]
        except Exception as e:
            warnings.warn(f"Could not parse time window: {e}", stacklevel=2)

    # Sort by timestamp; entries without timestamps sort last (keying on
    # `timestamp or datetime.min` would compare aware and naive datetimes)
    all_entries.sort(key=lambda e: (e["timestamp"] is None, e["timestamp"] or datetime.min))

    # Calculate relative times
    if all_entries and all_entries[0]["timestamp"]:
        start_time = all_entries[0]["timestamp"]
        for entry in all_entries:
            if entry["timestamp"]:
                delta = entry["timestamp"] - start_time
                entry["relative_time_ms"] = int(delta.total_seconds() * 1000)
            else:
                entry["relative_time_ms"] = None
    else:
        for entry in all_entries:
            entry["relative_time_ms"] = None

    # Apply limit if specified
    if limit:
        all_entries = all_entries[:limit]

    # Calculate duration
    duration_ms = None
    if len(all_entries) >= 2 and all_entries[0]["timestamp"] and all_entries[-1]["timestamp"]:
        duration = all_entries[-1]["timestamp"] - all_entries[0]["timestamp"]
        duration_ms = int(duration.total_seconds() * 1000)

    # Clean up entries for output (remove internal timestamp objects)
    timeline = []
    for e in all_entries:
        timeline.append(
            {
                "service": e["service"],
                "timestamp": e["timestamp_str"],
                "entry": e["entry"],
                "relative_time_ms": e["relative_time_ms"],
            }
        )

    return {
        "timeline": timeline,
        "services": list(files.keys()),
        "total_entries": len(timeline),
        "duration_ms": duration_ms,
        "service_breakdown": dict(service_counts),
    }

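
# --- Illustrative sketch added by the editor; not part of the released module. ---
# Building the service -> files mapping from a directory layout like
# logs/<service>.log before calling cross_service_timeline(). The layout and
# the correlation ID are assumptions for illustration.
def _example_timeline_from_dir(log_dir: str = "logs") -> Dict[str, Any]:
    from pathlib import Path

    service_files = {p.stem: [str(p)] for p in Path(log_dir).glob("*.log")}
    return cross_service_timeline(files=service_files, correlation_id="req-12345")
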

def compare_threads(
    files: List[str],
    thread_a: Optional[str] = None,
    thread_b: Optional[str] = None,
    correlation_a: Optional[str] = None,
    correlation_b: Optional[str] = None,
    trace_a: Optional[str] = None,
    trace_b: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Compare two threads/requests to find differences.

    Perfect for root cause analysis: "What's different between the successful
    request and the failed one?"

    Args:
        files: List of log file paths
        thread_a: First thread ID to compare
        thread_b: Second thread ID to compare
        correlation_a: First correlation ID to compare
        correlation_b: Second correlation ID to compare
        trace_a: First trace ID to compare
        trace_b: Second trace ID to compare

    Returns:
        Dictionary with comparison:
        {
            "thread_a": {
                "id": "...",
                "entries": [...],
                "duration_ms": 1523,
                "error_count": 0,
                "log_levels": {"INFO": 15, "ERROR": 0},
                "unique_messages": 15,
                "services": [...]
            },
            "thread_b": {...},
            "differences": {
                "duration_diff_ms": 2341,  # B took 2341ms longer
                "error_diff": 5,  # B had 5 more errors
                "only_in_a": ["cache hit", ...],  # Messages only in A
                "only_in_b": ["cache miss", "timeout", ...],  # Messages only in B
                "level_changes": {"ERROR": +5, "WARN": +2}
            },
            "summary": "Thread B took 2.3s longer and had 5 errors (cache miss, timeout)"
        }

    Example:
        # Compare successful vs failed request
        diff = compare_threads(
            files=["app.log"],
            correlation_a="req-success-123",
            correlation_b="req-failed-456"
        )
        print(diff['summary'])
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    # Get both threads
    timeline_a = follow_thread(
        files, thread_id=thread_a, correlation_id=correlation_a, trace_id=trace_a
    )
    timeline_b = follow_thread(
        files, thread_id=thread_b, correlation_id=correlation_b, trace_id=trace_b
    )

    # Analyze thread A
    entries_a = timeline_a.get("entries", [])
    analysis_a = _analyze_thread(entries_a, thread_a or correlation_a or trace_a or "Thread A")

    # Analyze thread B
    entries_b = timeline_b.get("entries", [])
    analysis_b = _analyze_thread(entries_b, thread_b or correlation_b or trace_b or "Thread B")

    # Compare
    differences = _compute_differences(analysis_a, analysis_b)

    # Generate summary
    summary = _generate_comparison_summary(analysis_a, analysis_b, differences)

    return {
        "thread_a": analysis_a,
        "thread_b": analysis_b,
        "differences": differences,
        "summary": summary,
    }

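
# --- Illustrative sketch added by the editor; not part of the released module. ---
# Picking the two correlation IDs straight from a search instead of hard-coding
# them: the first ERROR hit vs. the first INFO hit. Assumes entries carry a
# correlation_id field, as elsewhere in this module.
def _example_compare_first_failure(files: List[str]) -> Optional[Dict[str, Any]]:
    failed = search(files, level="ERROR", limit=1).get("results", [])
    ok = search(files, level="INFO", limit=1).get("results", [])
    if not failed or not ok:
        return None
    return compare_threads(
        files,
        correlation_a=ok[0]["entry"].get("correlation_id"),
        correlation_b=failed[0]["entry"].get("correlation_id"),
    )
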

def compare_time_periods(
    files: List[str],
    period_a_start: str,
    period_a_end: str,
    period_b_start: str,
    period_b_end: str,
) -> Dict[str, Any]:
    """
    Compare two time periods to find what changed.

    Perfect for questions like: "What changed after the deployment?"
    or "Why did error rates spike at 3pm?"

    Args:
        files: List of log file paths
        period_a_start: Start time for period A (ISO format)
        period_a_end: End time for period A (ISO format)
        period_b_start: Start time for period B (ISO format)
        period_b_end: End time for period B (ISO format)

    Returns:
        Dictionary with comparison:
        {
            "period_a": {
                "start": "...",
                "end": "...",
                "total_logs": 1523,
                "error_rate": 0.02,
                "log_levels": {...},
                "top_errors": [...],
                "unique_threads": 45
            },
            "period_b": {...},
            "changes": {
                "log_volume_change_pct": 150,  # 150% increase
                "error_rate_change": 10.5,  # 10.5x more errors
                "new_errors": ["OutOfMemoryError", ...],
                "resolved_errors": [],
                "new_threads": 23
            },
            "summary": "Period B had 150% more logs and 10.5x error rate. New errors: OutOfMemoryError"
        }

    Example:
        # Compare before/after deployment
        diff = compare_time_periods(
            files=["app.log"],
            period_a_start="2024-01-01T14:00:00Z",
            period_a_end="2024-01-01T15:00:00Z",
            period_b_start="2024-01-01T15:00:00Z",
            period_b_end="2024-01-01T16:00:00Z"
        )
        print(diff['summary'])
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    # One full scan is enough; both periods filter the same result set
    results = search(files, limit=None)

    # Filter by time
    entries_a = [
        r["entry"]
        for r in results.get("results", [])
        if _in_time_range(r["entry"], period_a_start, period_a_end)
    ]
    entries_b = [
        r["entry"]
        for r in results.get("results", [])
        if _in_time_range(r["entry"], period_b_start, period_b_end)
    ]

    # Analyze periods
    analysis_a = _analyze_period(entries_a, period_a_start, period_a_end)
    analysis_b = _analyze_period(entries_b, period_b_start, period_b_end)

    # Compute changes
    changes = _compute_period_changes(analysis_a, analysis_b)

    # Generate summary
    summary = _generate_period_summary(analysis_a, analysis_b, changes)

    return {"period_a": analysis_a, "period_b": analysis_b, "changes": changes, "summary": summary}

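
# --- Illustrative sketch added by the editor; not part of the released module. ---
# How compare_time_periods() quantifies an error spike: the multiplier is a
# ratio of error *rates*, so it stays meaningful even when raw traffic also
# changed between the periods. Figures invented to show the arithmetic.
def _example_error_rate_multiplier() -> float:
    rate_a = 3 / 1500   # period A: 3 errors in 1,500 logs -> 0.002
    rate_b = 42 / 2000  # period B: 42 errors in 2,000 logs -> 0.021
    return rate_b / rate_a  # 10.5x, matching the docstring example
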

# Helper functions for comparison


def _analyze_thread(entries: List[Dict], thread_id: str) -> Dict[str, Any]:
    """Analyze a single thread's entries"""
    if not entries:
        return {
            "id": thread_id,
            "entries": [],
            "entry_count": 0,
            "duration_ms": 0,
            "error_count": 0,
            "log_levels": {},
            "unique_messages": 0,
            "messages": [],
            "services": [],
        }

    # Count log levels
    level_counts = defaultdict(int)
    error_count = 0
    messages = []
    services = set()

    for entry in entries:
        level = entry.get("level", "INFO")
        level_counts[level] += 1
        if level in ["ERROR", "FATAL"]:
            error_count += 1

        message = entry.get("message", "")
        messages.append(message)

        service = entry.get("service") or entry.get("service_name")
        if service:
            services.add(service)

    # Calculate duration
    duration_ms = 0
    if len(entries) >= 2:
        try:
            start = datetime.fromisoformat(entries[0].get("timestamp", "").replace("Z", "+00:00"))
            end = datetime.fromisoformat(entries[-1].get("timestamp", "").replace("Z", "+00:00"))
            duration_ms = int((end - start).total_seconds() * 1000)
        except (ValueError, TypeError, AttributeError):
            pass  # Skip if timestamps are missing or invalid

    return {
        "id": thread_id,
        "entries": entries,
        "entry_count": len(entries),
        "duration_ms": duration_ms,
        "error_count": error_count,
        "log_levels": dict(level_counts),
        "unique_messages": len(set(messages)),
        "messages": messages,
        "services": list(services),
    }


def _compute_differences(analysis_a: Dict, analysis_b: Dict) -> Dict[str, Any]:
    """Compute differences between two thread analyses"""
    # Duration difference
    duration_diff_ms = analysis_b["duration_ms"] - analysis_a["duration_ms"]

    # Error difference
    error_diff = analysis_b["error_count"] - analysis_a["error_count"]

    # Message differences
    messages_a = set(analysis_a["messages"])
    messages_b = set(analysis_b["messages"])
    only_in_a = list(messages_a - messages_b)
    only_in_b = list(messages_b - messages_a)

    # Log level changes
    level_changes = {}
    all_levels = set(list(analysis_a["log_levels"].keys()) + list(analysis_b["log_levels"].keys()))
    for level in all_levels:
        count_a = analysis_a["log_levels"].get(level, 0)
        count_b = analysis_b["log_levels"].get(level, 0)
        if count_a != count_b:
            level_changes[level] = count_b - count_a

    return {
        "duration_diff_ms": duration_diff_ms,
        "error_diff": error_diff,
        "only_in_a": only_in_a[:10],  # Limit to 10
        "only_in_b": only_in_b[:10],
        "level_changes": level_changes,
        "entry_count_diff": analysis_b["entry_count"] - analysis_a["entry_count"],
    }


def _generate_comparison_summary(analysis_a: Dict, analysis_b: Dict, differences: Dict) -> str:
    """Generate human-readable summary of comparison"""
    parts = []

    # Duration
    duration_diff = differences["duration_diff_ms"]
    if abs(duration_diff) > 100:
        if duration_diff > 0:
            parts.append(f"Thread B took {duration_diff}ms longer")
        else:
            parts.append(f"Thread B was {-duration_diff}ms faster")

    # Errors
    error_diff = differences["error_diff"]
    if error_diff > 0:
        parts.append(f"Thread B had {error_diff} more error(s)")
        if differences["only_in_b"]:
            examples = differences["only_in_b"][:3]
            parts.append(f"including: {', '.join(examples)}")
    elif error_diff < 0:
        parts.append(f"Thread B had {-error_diff} fewer error(s)")

    # New messages in B
    if differences["only_in_b"] and error_diff == 0:
        parts.append(f"Thread B had unique messages: {', '.join(differences['only_in_b'][:3])}")

    if not parts:
        parts.append("Threads are similar")

    return ". ".join(parts)


def _analyze_period(entries: List[Dict], start: str, end: str) -> Dict[str, Any]:
    """Analyze a time period's entries"""
    level_counts = defaultdict(int)
    error_messages = []
    threads = set()

    for entry in entries:
        level = entry.get("level", "INFO")
        level_counts[level] += 1

        if level in ["ERROR", "FATAL"]:
            error_messages.append(entry.get("message", ""))

        thread = entry.get("thread_id") or entry.get("correlation_id")
        if thread:
            threads.add(thread)

    total = len(entries)
    error_count = level_counts.get("ERROR", 0) + level_counts.get("FATAL", 0)
    error_rate = error_count / total if total > 0 else 0

    return {
        "start": start,
        "end": end,
        "total_logs": total,
        "error_count": error_count,
        "error_rate": error_rate,
        "log_levels": dict(level_counts),
        "top_errors": list(set(error_messages))[:10],
        "unique_threads": len(threads),
    }


def _compute_period_changes(analysis_a: Dict, analysis_b: Dict) -> Dict[str, Any]:
    """Compute changes between two time periods"""
    # Volume change
    if analysis_a["total_logs"] > 0:
        volume_change_pct = (
            (analysis_b["total_logs"] - analysis_a["total_logs"]) / analysis_a["total_logs"]
        ) * 100
    else:
        volume_change_pct = 100 if analysis_b["total_logs"] > 0 else 0

    # Error rate change
    if analysis_a["error_rate"] > 0:
        error_rate_multiplier = analysis_b["error_rate"] / analysis_a["error_rate"]
    else:
        error_rate_multiplier = float("inf") if analysis_b["error_rate"] > 0 else 1.0

    # New vs resolved errors
    errors_a = set(analysis_a["top_errors"])
    errors_b = set(analysis_b["top_errors"])
    new_errors = list(errors_b - errors_a)
    resolved_errors = list(errors_a - errors_b)

    return {
        "log_volume_change_pct": volume_change_pct,
        "error_rate_multiplier": error_rate_multiplier,
        "error_count_change": analysis_b["error_count"] - analysis_a["error_count"],
        "new_errors": new_errors[:10],
        "resolved_errors": resolved_errors[:10],
        "thread_count_change": analysis_b["unique_threads"] - analysis_a["unique_threads"],
    }


def _generate_period_summary(analysis_a: Dict, analysis_b: Dict, changes: Dict) -> str:
    """Generate human-readable summary of period comparison"""
    parts = []

    # Volume
    vol_change = changes["log_volume_change_pct"]
    if abs(vol_change) > 20:
        parts.append(
            f"Log volume {'increased' if vol_change > 0 else 'decreased'} by {abs(vol_change):.1f}%"
        )

    # Error rate
    err_mult = changes["error_rate_multiplier"]
    if err_mult > 1.5:
        parts.append(f"Error rate increased {err_mult:.1f}x")
    elif 0 < err_mult < 0.7:
        parts.append(f"Error rate decreased to {err_mult:.1f}x")

    # New errors
    if changes["new_errors"]:
        parts.append(f"New errors: {', '.join(changes['new_errors'][:3])}")

    if not parts:
        parts.append("Periods are similar")

    return ". ".join(parts)


def _in_time_range(entry: Dict, start: str, end: str) -> bool:
    """Check if entry timestamp is within range"""
    timestamp_str = entry.get("timestamp")
    if not timestamp_str:
        return False

    try:
        timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
        start_dt = datetime.fromisoformat(start.replace("Z", "+00:00"))
        end_dt = datetime.fromisoformat(end.replace("Z", "+00:00"))
        return start_dt <= timestamp <= end_dt
    except (ValueError, TypeError, AttributeError):
        return False

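
# --- Illustrative sketch added by the editor; not part of the released module. ---
# Exercising the period helpers directly with synthetic analyses; only the keys
# the helpers actually read are filled in, with invented numbers. The expected
# summary mentions the volume jump, the 10.5x error rate, and OutOfMemoryError.
def _example_period_summary() -> str:
    analysis_a = {"total_logs": 1000, "error_rate": 0.002, "error_count": 2,
                  "top_errors": ["Timeout"], "unique_threads": 40}
    analysis_b = {"total_logs": 2500, "error_rate": 0.021, "error_count": 52,
                  "top_errors": ["Timeout", "OutOfMemoryError"], "unique_threads": 55}
    changes = _compute_period_changes(analysis_a, analysis_b)
    return _generate_period_summary(analysis_a, analysis_b, changes)
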

# Token-efficient output formatters


def _format_as_summary(result: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert full search results to token-efficient summary format.

    Instead of returning all log entries, groups them by message and
    provides aggregated statistics with a few examples.
    """
    results = result.get("results", [])
    if not results:
        return {
            "total_matches": 0,
            "unique_messages": 0,
            "log_levels": {},
            "top_messages": [],
            "sample_entries": [],
        }

    # Group by message
    message_groups = defaultdict(
        lambda: {
            "count": 0,
            "first_seen": None,
            "last_seen": None,
            "levels": defaultdict(int),
            "examples": [],
        }
    )

    level_counts = defaultdict(int)
    file_counts = defaultdict(int)

    for item in results:
        entry = item.get("entry", {})
        message = entry.get("message", "").strip()
        level = entry.get("level", "INFO")
        timestamp = entry.get("timestamp")
        file_path = entry.get("file", "")

        # Update level counts
        level_counts[level] += 1
        file_counts[file_path] += 1

        # Update message group
        group = message_groups[message]
        group["count"] += 1
        group["levels"][level] += 1

        if group["first_seen"] is None or (timestamp and timestamp < group["first_seen"]):
            group["first_seen"] = timestamp

        if group["last_seen"] is None or (timestamp and timestamp > group["last_seen"]):
            group["last_seen"] = timestamp

        # Keep up to 2 examples per message
        if len(group["examples"]) < 2:
            group["examples"].append(
                {
                    "file": file_path,
                    "line": entry.get("line_number"),
                    "timestamp": timestamp,
                    "level": level,
                }
            )

    # Convert to sorted list (most frequent first)
    top_messages = []
    for message, data in sorted(message_groups.items(), key=lambda x: x[1]["count"], reverse=True)[
        :20
    ]:
        top_messages.append(
            {
                "message": message[:200],  # Truncate long messages
                "count": data["count"],
                "first_seen": data["first_seen"],
                "last_seen": data["last_seen"],
                "levels": dict(data["levels"]),
                "examples": data["examples"],
            }
        )

    # Sample entries (diverse selection)
    sample_entries = _select_diverse_samples(results, max_samples=5)

    return {
        "total_matches": len(results),
        "unique_messages": len(message_groups),
        "log_levels": dict(level_counts),
        "by_file": dict(file_counts),
        "top_messages": top_messages,
        "sample_entries": sample_entries,
        "full_results_available": True,
    }


def _format_as_count(result: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert full search results to count-only format (minimal tokens).

    Returns only statistics, no actual log content.
    """
    results = result.get("results", [])
    if not results:
        return {"total_matches": 0, "by_level": {}, "by_file": {}, "time_range": None}

    level_counts = defaultdict(int)
    file_counts = defaultdict(int)
    timestamps = []

    for item in results:
        entry = item.get("entry", {})
        level = entry.get("level", "INFO")
        file_path = entry.get("file", "")
        timestamp = entry.get("timestamp")

        level_counts[level] += 1
        file_counts[file_path] += 1

        if timestamp:
            timestamps.append(timestamp)

    # Time range
    time_range = None
    if timestamps:
        timestamps.sort()
        time_range = {"start": timestamps[0], "end": timestamps[-1]}

    return {
        "total_matches": len(results),
        "by_level": dict(level_counts),
        "by_file": dict(file_counts),
        "time_range": time_range,
    }


def _format_as_compact(result: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert full search results to compact format.

    Returns only essential fields, removing raw logs and extra context.
    """
    results = result.get("results", [])
    if not results:
        return {"matches": [], "total": 0}

    compact_matches = []
    for item in results:
        entry = item.get("entry", {})
        compact_matches.append(
            {
                "time": entry.get("timestamp"),
                "level": entry.get("level"),
                "msg": entry.get("message", "")[:150],  # Truncate messages
                "thread": entry.get("thread_id") or entry.get("correlation_id"),
                "file": entry.get("file", "").split("/")[-1],  # Just filename
                "line": entry.get("line_number"),
            }
        )

    return {"matches": compact_matches, "total": len(results)}


def _select_diverse_samples(results: List[Dict], max_samples: int = 5) -> List[Dict]:
    """
    Select a diverse set of sample entries.

    Tries to include:
    - First and last entry
    - Different log levels
    - Different files
    - Errors if present
    """
    if not results:
        return []

    if len(results) <= max_samples:
        return [r.get("entry", {}) for r in results]

    samples = []
    indices_used = set()

    # Always include first and last
    samples.append(results[0].get("entry", {}))
    indices_used.add(0)

    if len(results) > 1:
        samples.append(results[-1].get("entry", {}))
        indices_used.add(len(results) - 1)

    # Find first error
    for i, item in enumerate(results):
        if i in indices_used:
            continue
        entry = item.get("entry", {})
        if entry.get("level") in ["ERROR", "FATAL"]:
            samples.append(entry)
            indices_used.add(i)
            break

    # Fill remaining slots with evenly spaced entries
    remaining = max_samples - len(samples)
    if remaining > 0 and len(results) > len(indices_used):
        step = len(results) // (remaining + 1)
        for i in range(1, remaining + 1):
            idx = min(i * step, len(results) - 1)
            if idx not in indices_used:
                samples.append(results[idx].get("entry", {}))
                indices_used.add(idx)

    return samples[:max_samples]

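
# --- Illustrative sketch added by the editor; not part of the released module. ---
# Comparing how the three formatters trade detail for tokens on one raw search
# result. The shape of `raw` mirrors the "results" list consumed above; the
# entries themselves are invented.
def _example_formatter_sizes() -> Dict[str, int]:
    raw = {"results": [
        {"entry": {"message": "db timeout", "level": "ERROR", "file": "app.log",
                   "line_number": 7, "timestamp": "2024-01-01T10:00:00Z"}},
        {"entry": {"message": "request ok", "level": "INFO", "file": "app.log",
                   "line_number": 9, "timestamp": "2024-01-01T10:00:01Z"}},
    ]}
    formatters = {"summary": _format_as_summary, "count": _format_as_count,
                  "compact": _format_as_compact}
    return {name: len(json.dumps(fmt(raw))) for name, fmt in formatters.items()}
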

# Investigation Session Management


class InvestigationSession:
    """
    Track investigation state and history for multi-step analysis.

    This allows LLMs to:
    - Track what they've already investigated
    - Undo/redo operations
    - Save and resume investigations
    - Generate reports of their investigation process

    Example:
        session = InvestigationSession(files=["app.log"])

        # Perform investigation
        session.search(level="ERROR")
        session.follow_thread(correlation_id="req-123")
        session.find_patterns()

        # Review history
        history = session.get_history()

        # Undo last operation
        session.undo()

        # Save for later
        session.save("incident_2024_01_15.json")

        # Resume later
        session2 = InvestigationSession.load("incident_2024_01_15.json")
    """

    def __init__(self, files: Optional[List[str]] = None, name: Optional[str] = None):
        self.files = files or []
        self.name = name or f"investigation_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        self.history = []
        self.current_index = -1
        self.metadata = {}

        if files:
            self._add_to_history("init", "Initialize investigation", {"files": files}, None)

    def search(
        self,
        query: Optional[str] = None,
        level: Optional[str] = None,
        output_format: str = "summary",
        **kwargs,
    ) -> Dict[str, Any]:
        """Perform search and track in history"""
        params = {"query": query, "level": level, "output_format": output_format, **kwargs}
        result = search(self.files, query=query, level=level, output_format=output_format, **kwargs)

        self._add_to_history(
            "search",
            f"Search for {level or 'all'} logs" + (f" matching '{query}'" if query else ""),
            params,
            result,
        )

        return result

    def follow_thread(
        self,
        thread_id: Optional[str] = None,
        correlation_id: Optional[str] = None,
        trace_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Follow thread and track in history"""
        params = {"thread_id": thread_id, "correlation_id": correlation_id, "trace_id": trace_id}
        result = follow_thread(
            self.files, thread_id=thread_id, correlation_id=correlation_id, trace_id=trace_id
        )

        thread_desc = thread_id or correlation_id or trace_id
        self._add_to_history("follow_thread", f"Follow thread: {thread_desc}", params, result)

        return result

    def find_patterns(self, min_occurrences: int = 3) -> Dict[str, Any]:
        """Find patterns and track in history"""
        params = {"min_occurrences": min_occurrences}
        result = find_patterns(self.files, min_occurrences=min_occurrences)

        self._add_to_history(
            "find_patterns", f"Find patterns (min {min_occurrences} occurrences)", params, result
        )

        return result

    def compare_threads(self, **kwargs) -> Dict[str, Any]:
        """Compare threads and track in history"""
        result = compare_threads(self.files, **kwargs)

        desc = f"Compare {kwargs.get('correlation_a', 'A')} vs {kwargs.get('correlation_b', 'B')}"
        self._add_to_history("compare_threads", desc, kwargs, result)

        return result

    def cross_service_timeline(
        self, service_files: Dict[str, List[str]], **kwargs
    ) -> Dict[str, Any]:
        """Create cross-service timeline and track in history"""
        result = cross_service_timeline(service_files, **kwargs)

        desc = f"Cross-service timeline for {list(service_files.keys())}"
        self._add_to_history(
            "cross_service_timeline", desc, {"service_files": service_files, **kwargs}, result
        )

        return result

    def add_note(self, note: str):
        """Add a text note to the investigation"""
        preview = note[:50] + ("..." if len(note) > 50 else "")
        self._add_to_history("note", f"Note: {preview}", {"note": note}, None)

    def _add_to_history(
        self,
        operation_type: str,
        description: str,
        params: Dict[str, Any],
        result: Optional[Dict[str, Any]],
    ):
        """Add operation to history"""
        # Remove any operations after current index (for undo/redo)
        self.history = self.history[: self.current_index + 1]
|
|
3005
|
+
|
|
3006
|
+
entry = {
|
|
3007
|
+
"timestamp": datetime.now().isoformat(),
|
|
3008
|
+
"operation": operation_type,
|
|
3009
|
+
"description": description,
|
|
3010
|
+
"params": params,
|
|
3011
|
+
"result_summary": self._summarize_result(result) if result else None,
|
|
3012
|
+
}
|
|
3013
|
+
|
|
3014
|
+
self.history.append(entry)
|
|
3015
|
+
self.current_index = len(self.history) - 1
|
|
3016
|
+
|
|
3017
|
+
def _summarize_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
|
|
3018
|
+
"""Create a compact summary of operation result"""
|
|
3019
|
+
if not result:
|
|
3020
|
+
return {}
|
|
3021
|
+
|
|
3022
|
+
summary = {}
|
|
3023
|
+
|
|
3024
|
+
# Common fields
|
|
3025
|
+
if "total_matches" in result:
|
|
3026
|
+
summary["total_matches"] = result["total_matches"]
|
|
3027
|
+
if "total_entries" in result:
|
|
3028
|
+
summary["total_entries"] = result["total_entries"]
|
|
3029
|
+
if "duration_ms" in result:
|
|
3030
|
+
summary["duration_ms"] = result["duration_ms"]
|
|
3031
|
+
if "summary" in result:
|
|
3032
|
+
summary["summary"] = result["summary"]
|
|
3033
|
+
|
|
3034
|
+
# Pattern results
|
|
3035
|
+
if "patterns" in result:
|
|
3036
|
+
summary["pattern_count"] = len(result["patterns"])
|
|
3037
|
+
|
|
3038
|
+
# Timeline results
|
|
3039
|
+
if "timeline" in result:
|
|
3040
|
+
summary["timeline_length"] = len(result["timeline"])
|
|
3041
|
+
|
|
3042
|
+
return summary
|
|
3043
|
+
|
|
3044
|
+
def get_history(self, include_results: bool = False) -> List[Dict[str, Any]]:
|
|
3045
|
+
"""Get investigation history"""
|
|
3046
|
+
if include_results:
|
|
3047
|
+
return self.history
|
|
3048
|
+
else:
|
|
3049
|
+
# Return without full results (more token-efficient)
|
|
3050
|
+
return [
|
|
3051
|
+
{
|
|
3052
|
+
"timestamp": h["timestamp"],
|
|
3053
|
+
"operation": h["operation"],
|
|
3054
|
+
"description": h["description"],
|
|
3055
|
+
"result_summary": h.get("result_summary"),
|
|
3056
|
+
}
|
|
3057
|
+
for h in self.history
|
|
3058
|
+
]
|
|
3059
|
+
|
|
3060
|
+
def undo(self) -> bool:
|
|
3061
|
+
"""Undo last operation"""
|
|
3062
|
+
if self.current_index > 0:
|
|
3063
|
+
self.current_index -= 1
|
|
3064
|
+
return True
|
|
3065
|
+
return False
|
|
3066
|
+
|
|
3067
|
+
def redo(self) -> bool:
|
|
3068
|
+
"""Redo previously undone operation"""
|
|
3069
|
+
if self.current_index < len(self.history) - 1:
|
|
3070
|
+
self.current_index += 1
|
|
3071
|
+
return True
|
|
3072
|
+
return False
|
|
3073
|
+
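    # Illustrative sketch of the undo/redo contract (hypothetical calls):
    # _add_to_history() truncates everything past current_index, so a new
    # operation issued after undo() discards the undone branch, editor-style.
    #
    #     session = InvestigationSession(files=["app.log"])
    #     session.search(level="ERROR")  # history: [init, search]
    #     session.undo()                 # focus moves back to init
    #     session.find_patterns()        # the undone search is dropped
    #     assert len(session.get_history()) == 2  # [init, find_patterns]
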
    def get_current_focus(self) -> Optional[Dict[str, Any]]:
        """Get the current operation being focused on"""
        if 0 <= self.current_index < len(self.history):
            return self.history[self.current_index]
        return None

    def save(self, filepath: str):
        """Save session to file"""
        data = {
            "name": self.name,
            "files": self.files,
            "history": self.history,
            "current_index": self.current_index,
            "metadata": self.metadata,
            "saved_at": datetime.now().isoformat(),
        }

        with open(filepath, "w") as f:
            json.dump(data, f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> "InvestigationSession":
        """Load session from file"""
        with open(filepath, "r") as f:
            data = json.load(f)

        session = cls(files=data["files"], name=data["name"])
        session.history = data["history"]
        session.current_index = data["current_index"]
        session.metadata = data.get("metadata", {})

        return session

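    # Round-trip sketch (illustrative; "incident.json" is a placeholder path):
    #
    #     session.save("incident.json")
    #     resumed = InvestigationSession.load("incident.json")
    #     assert resumed.name == session.name
    #     assert resumed.get_current_focus() == session.get_current_focus()
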
    def get_summary(self) -> str:
        """Get a human-readable summary of the investigation"""
        if not self.history:
            return "No investigation steps yet"

        lines = [
            f"Investigation: {self.name}",
            f"Steps completed: {len(self.history)}",
            "",
            "Timeline:",
        ]

        for i, entry in enumerate(self.history):
            marker = "→" if i == self.current_index else " "
            lines.append(f"  {marker} {i+1}. {entry['description']}")
            if entry.get("result_summary"):
                for key, value in entry["result_summary"].items():
                    lines.append(f"      {key}: {value}")

        return "\n".join(lines)

    def generate_report(self, format: str = "markdown", include_evidence: bool = True) -> str:
        """
        Generate a comprehensive investigation report.

        Args:
            format: Output format - "markdown", "text", or "json"
            include_evidence: Include example log entries as evidence
                (currently only honored by the "json" format)

        Returns:
            Formatted investigation report string
        """
        if format == "markdown":
            return self._generate_markdown_report(include_evidence)
        elif format == "text":
            return self._generate_text_report(include_evidence)
        elif format == "json":
            return json.dumps(self._generate_json_report(include_evidence), indent=2)
        else:
            return self._generate_markdown_report(include_evidence)

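    # Report sketch (illustrative): the same history renders three ways.
    #
    #     md = session.generate_report()                # Markdown (default)
    #     txt = session.generate_report(format="text")  # plain text
    #     js = session.generate_report(format="json", include_evidence=False)
    #     print(md)
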
    def _generate_markdown_report(self, include_evidence: bool) -> str:
        """Generate Markdown format report"""
        lines = [
            f"# Investigation Report: {self.name}",
            "",
            f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"**Files Analyzed:** {', '.join(self.files)}",
            f"**Steps Completed:** {len(self.history)}",
            "",
            "---",
            "",
            "## Executive Summary",
            "",
        ]

        # Try to extract key findings
        error_counts = []
        patterns_found = []
        key_insights = []

        for entry in self.history:
            summary = entry.get("result_summary") or {}
            if "total_matches" in summary and entry["operation"] == "search":
                error_counts.append(
                    f"- Found {summary['total_matches']} matches in {entry['description']}"
                )
            if "pattern_count" in summary:
                patterns_found.append(f"- Identified {summary['pattern_count']} repeated patterns")
            if "summary" in summary:
                key_insights.append(f"- {summary['summary']}")

        if error_counts:
            lines.extend(error_counts)
        if patterns_found:
            lines.extend(patterns_found)
        if key_insights:
            lines.append("")
            lines.append("### Key Findings")
            lines.extend(key_insights)

        lines.extend(["", "---", "", "## Investigation Timeline", ""])

        # Add detailed timeline
        for i, entry in enumerate(self.history):
            timestamp = entry.get("timestamp", "Unknown time")
            desc = entry["description"]
            operation = entry["operation"]

            lines.append(f"### Step {i+1}: {desc}")
            lines.append("")
            lines.append(f"- **Time:** {timestamp}")
            lines.append(f"- **Operation:** `{operation}`")

            # Add results
            if entry.get("result_summary"):
                lines.append("- **Results:**")
                for key, value in entry["result_summary"].items():
                    lines.append(f"  - {key}: {value}")

            lines.append("")

        lines.extend(
            [
                "---",
                "",
                "## Conclusions",
                "",
                "Based on the investigation steps above, review the key findings and error patterns.",
                "",
                "## Next Steps",
                "",
                "- [ ] Review identified error patterns",
                "- [ ] Investigate root causes",
                "- [ ] Implement fixes",
                "- [ ] Monitor for recurrence",
                "",
            ]
        )

        return "\n".join(lines)

    def _generate_text_report(self, include_evidence: bool) -> str:
        """Generate plain text format report"""
        lines = [
            "=" * 70,
            f"INVESTIGATION REPORT: {self.name}",
            "=" * 70,
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Files: {', '.join(self.files)}",
            f"Steps: {len(self.history)}",
            "=" * 70,
            "",
            "TIMELINE:",
            "",
        ]

        for i, entry in enumerate(self.history):
            timestamp = entry.get("timestamp", "Unknown")
            lines.append(f"{i+1}. [{timestamp}] {entry['description']}")

            if entry.get("result_summary"):
                for key, value in entry["result_summary"].items():
                    lines.append(f"   - {key}: {value}")
            lines.append("")

        lines.extend(["=" * 70, "END OF REPORT", "=" * 70])

        return "\n".join(lines)

    def _generate_json_report(self, include_evidence: bool) -> Dict[str, Any]:
        """Generate JSON format report"""
        return {
            "name": self.name,
            "generated_at": datetime.now().isoformat(),
            "files": self.files,
            "steps_completed": len(self.history),
            "timeline": (
                self.history if include_evidence else self.get_history(include_results=False)
            ),
            "metadata": self.metadata,
        }


# Smart Sampling


def smart_sample(
    files: List[str],
    level: Optional[str] = None,
    strategy: str = "representative",
    sample_size: int = 50,
) -> Dict[str, Any]:
    """
    Get a smart sample of log entries that represents the full dataset.

    Instead of random sampling, this uses intelligent strategies to ensure
    the sample is informative and diverse.

    Args:
        files: List of log file paths
        level: Optional log level filter
        strategy: Sampling strategy:
            - "representative": Balanced mix of levels, times, and patterns
            - "diverse": Maximum diversity (different messages, threads, etc.)
            - "chronological": Evenly spaced across time
            - "errors_focused": Prioritize errors with context
        sample_size: Target number of entries (default 50)

    Returns:
        Dictionary with sampled entries:
        {
            "samples": [...],  # Selected log entries
            "total_population": 15230,
            "sample_size": 50,
            "strategy": "representative",
            "coverage": {
                "time_coverage": 0.95,  # fraction of time range covered
                "level_distribution": {"ERROR": 10, "INFO": 35, "WARN": 5},
                "unique_threads_in_sample": 23,
                "unique_threads_in_population": 41,
                "thread_coverage_pct": 0.56
            }
        }

    Example:
        # Get representative sample of 100 entries
        sample = smart_sample(
            files=["app.log"],
            strategy="representative",
            sample_size=100
        )

        # Analyze the sample (much faster than full dataset)
        for entry in sample['samples']:
            print(entry['message'])
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    # Get all entries
    results = search(files, level=level, limit=None)
    all_entries = [r["entry"] for r in results.get("results", [])]

    if not all_entries:
        return {
            "samples": [],
            "total_population": 0,
            "sample_size": 0,
            "strategy": strategy,
            "coverage": {},
        }

    # Apply sampling strategy
    if strategy == "representative":
        samples = _sample_representative(all_entries, sample_size)
    elif strategy == "diverse":
        samples = _sample_diverse(all_entries, sample_size)
    elif strategy == "chronological":
        samples = _sample_chronological(all_entries, sample_size)
    elif strategy == "errors_focused":
        samples = _sample_errors_focused(all_entries, sample_size)
    else:
        # Default to representative
        samples = _sample_representative(all_entries, sample_size)

    # Calculate coverage
    coverage = _calculate_coverage(all_entries, samples)

    return {
        "samples": samples,
        "total_population": len(all_entries),
        "sample_size": len(samples),
        "strategy": strategy,
        "coverage": coverage,
    }


def _sample_representative(entries: List[Dict], size: int) -> List[Dict]:
    """Sample to represent overall distribution"""
    if len(entries) <= size:
        return entries

    samples = []

    # Group by level
    by_level = defaultdict(list)
    for entry in entries:
        level = entry.get("level", "INFO")
        by_level[level].append(entry)

    # Calculate proportional samples per level
    for level, level_entries in by_level.items():
        proportion = len(level_entries) / len(entries)
        level_sample_size = max(1, int(size * proportion))

        # Sample evenly across time
        if level_sample_size >= len(level_entries):
            samples.extend(level_entries)
        else:
            step = len(level_entries) / level_sample_size
            indices = [int(i * step) for i in range(level_sample_size)]
            samples.extend([level_entries[i] for i in indices])

    # If we have too many, trim to size
    if len(samples) > size:
        step = len(samples) / size
        indices = [int(i * step) for i in range(size)]
        samples = [samples[i] for i in indices]

    return samples[:size]

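# Worked example of the proportional allocation above (hypothetical counts):
# with 900 INFO and 100 ERROR entries and size=50, INFO gets
# max(1, int(50 * 0.9)) = 45 slots and ERROR gets max(1, int(50 * 0.1)) = 5,
# each drawn at an even stride across that level's entries.
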

def _sample_diverse(entries: List[Dict], size: int) -> List[Dict]:
    """Sample for maximum diversity"""
    if len(entries) <= size:
        return entries

    samples = []
    used_messages = set()
    used_threads = set()

    # First pass: unique messages
    for entry in entries:
        if len(samples) >= size:
            break

        message = entry.get("message", "")
        if message and message not in used_messages:
            samples.append(entry)
            used_messages.add(message)
            thread = entry.get("thread_id") or entry.get("correlation_id")
            if thread:
                used_threads.add(thread)

    # Second pass: unique threads
    if len(samples) < size:
        for entry in entries:
            if len(samples) >= size:
                break

            thread = entry.get("thread_id") or entry.get("correlation_id")
            if thread and thread not in used_threads:
                samples.append(entry)
                used_threads.add(thread)

    # Third pass: fill remaining with evenly spaced entries
    if len(samples) < size:
        remaining = size - len(samples)
        step = len(entries) / remaining
        for i in range(remaining):
            idx = int(i * step)
            if idx < len(entries):
                samples.append(entries[idx])

    return samples[:size]


def _sample_chronological(entries: List[Dict], size: int) -> List[Dict]:
    """Sample evenly across time"""
    if len(entries) <= size:
        return entries

    # Sort by timestamp
    sorted_entries = sorted(entries, key=lambda e: e.get("timestamp", ""))

    # Sample evenly
    step = len(sorted_entries) / size
    indices = [int(i * step) for i in range(size)]
    return [sorted_entries[i] for i in indices]


def _sample_errors_focused(entries: List[Dict], size: int) -> List[Dict]:
    """Sample focusing on errors with context"""
    if len(entries) <= size:
        return entries

    samples = []
    error_indices = []
    non_error_indices = []

    # Separate errors from non-errors
    for i, entry in enumerate(entries):
        level = entry.get("level", "INFO")
        if level in ["ERROR", "FATAL"]:
            error_indices.append(i)
        else:
            non_error_indices.append(i)

    # Allocate 70% to errors, 30% to context
    error_budget = int(size * 0.7)

    # Sample errors
    if error_indices:
        if len(error_indices) <= error_budget:
            # All errors + some context
            for idx in error_indices:
                samples.append(entries[idx])
                # Add the entry just before each error for context
                if idx > 0:
                    samples.append(entries[idx - 1])
        else:
            # Sample errors evenly
            step = len(error_indices) / error_budget
            for i in range(error_budget):
                idx = error_indices[int(i * step)]
                samples.append(entries[idx])

    # Sample non-errors for context
    if non_error_indices and len(samples) < size:
        remaining = size - len(samples)
        step = len(non_error_indices) / remaining
        for i in range(remaining):
            idx = non_error_indices[min(int(i * step), len(non_error_indices) - 1)]
            samples.append(entries[idx])

    # Sort by original order
    entry_to_index = {id(e): i for i, e in enumerate(entries)}
    samples.sort(key=lambda e: entry_to_index.get(id(e), 0))

    return samples[:size]

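# Budget arithmetic for the errors_focused strategy (hypothetical numbers):
# with size=50, error_budget = int(50 * 0.7) = 35. If only 10 errors exist,
# each is kept along with the line before it; leftover slots are filled with
# evenly spaced non-error entries, and the result is re-sorted into original
# log order before the final [:size] trim.
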

def _calculate_coverage(population: List[Dict], sample: List[Dict]) -> Dict[str, Any]:
    """Calculate how well the sample covers the population"""
    # Time coverage
    pop_times = [e.get("timestamp") for e in population if e.get("timestamp")]
    sample_times = [e.get("timestamp") for e in sample if e.get("timestamp")]

    time_coverage = 0.0
    if pop_times and sample_times:
        pop_times.sort()
        sample_times.sort()
        # Simple coverage: sample span / population span
        try:
            pop_start = datetime.fromisoformat(pop_times[0].replace("Z", "+00:00"))
            pop_end = datetime.fromisoformat(pop_times[-1].replace("Z", "+00:00"))
            sample_start = datetime.fromisoformat(sample_times[0].replace("Z", "+00:00"))
            sample_end = datetime.fromisoformat(sample_times[-1].replace("Z", "+00:00"))

            pop_duration = (pop_end - pop_start).total_seconds()
            sample_duration = (sample_end - sample_start).total_seconds()

            if pop_duration > 0:
                time_coverage = min(1.0, sample_duration / pop_duration)
        except (ValueError, TypeError, AttributeError):
            pass  # Skip if timestamps are invalid

    # Level coverage
    level_coverage = defaultdict(int)
    for entry in sample:
        level = entry.get("level", "INFO")
        level_coverage[level] += 1

    # Thread coverage
    pop_threads = set()
    sample_threads = set()
    for entry in population:
        thread = entry.get("thread_id") or entry.get("correlation_id")
        if thread:
            pop_threads.add(thread)
    for entry in sample:
        thread = entry.get("thread_id") or entry.get("correlation_id")
        if thread:
            sample_threads.add(thread)

    # Despite the "pct" name, this is a fraction in [0, 1]
    thread_coverage_pct = len(sample_threads) / len(pop_threads) if pop_threads else 0

    return {
        "time_coverage": time_coverage,
        "level_distribution": dict(level_coverage),
        "unique_threads_in_sample": len(sample_threads),
        "unique_threads_in_population": len(pop_threads),
        "thread_coverage_pct": thread_coverage_pct,
    }

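# Worked example of the span-ratio heuristic above (hypothetical times): if
# the population runs 10:00:00-10:10:00 (600s) and the sample's first and
# last timestamps are 10:00:30 and 10:09:30 (540s), time_coverage is
# 540 / 600 = 0.9. The ratio measures span only, not density within it.
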

# Automatic Insights and Suggestions


def analyze_with_insights(
    files: List[str], level: Optional[str] = None, auto_investigate: bool = True
) -> Dict[str, Any]:
    """
    Analyze logs and automatically generate insights and suggestions.

    This is the "smart mode" that does the thinking for you - perfect for
    LLMs that want quick actionable information.

    Args:
        files: List of log file paths
        level: Optional log level filter (default: analyzes all levels)
        auto_investigate: Automatically perform follow-up investigations

    Returns:
        Dictionary with insights:
        {
            "overview": {...},  # Quick stats
            "insights": [
                {
                    "type": "error_spike",
                    "severity": "high",
                    "description": "Error rate 10x higher than normal",
                    "evidence": [...],
                    "suggestion": "Check database connections"
                },
                ...
            ],
            "suggestions": [
                "Follow thread req-12345 - appears to be failing consistently",
                "Check database connection pool size",
                ...
            ],
            "next_steps": [...]  # Recommended next investigation steps
        }

    Example:
        # One-shot analysis with insights
        result = analyze_with_insights(files=["app.log"])

        for insight in result['insights']:
            print(f"[{insight['severity']}] {insight['description']}")
            print(f"  → {insight['suggestion']}")
    """
    if not RUST_AVAILABLE:
        raise RuntimeError("Rust backend not available")

    insights = []
    suggestions = []
    next_steps = []

    # Get overview
    metadata = get_metadata(files)
    search_results = search(files, level=level, output_format="summary")

    # Insight 1: Error rate analysis
    total = search_results.get("total_matches", 0)
    levels = search_results.get("log_levels", {})
    error_count = levels.get("ERROR", 0) + levels.get("FATAL", 0)

    if total > 0:
        error_rate = error_count / total
        if error_rate > 0.1:  # More than 10% errors
            insights.append(
                {
                    "type": "high_error_rate",
                    "severity": "high",
                    "description": f"High error rate: {error_rate:.1%} ({error_count}/{total})",
                    "evidence": {"error_count": error_count, "total": total, "rate": error_rate},
                    "suggestion": "Investigate most common errors first",
                }
            )
            next_steps.append("Run: find_patterns(files, min_occurrences=3)")

    # Insight 2: Pattern detection
    if auto_investigate and error_count > 0:
        patterns = find_patterns(files, min_occurrences=2)
        if patterns.get("patterns"):
            pattern_count = len(patterns["patterns"])
            insights.append(
                {
                    "type": "repeated_patterns",
                    "severity": "medium",
                    "description": f"Found {pattern_count} repeated error patterns",
                    "evidence": patterns["patterns"][:3],  # Top 3
                    "suggestion": "These errors are systematic, not random",
                }
            )

            # Suggest investigating the most frequent pattern
            if patterns["patterns"]:
                top_pattern = patterns["patterns"][0]
                suggestions.append(
                    f"Investigate pattern: '{top_pattern.get('pattern', '')[:50]}...'"
                )

    # Insight 3: Check for cascading failures
    if error_count > 5:
        # Look for timing patterns
        top_messages = search_results.get("top_messages", [])
        if top_messages:
            # Heuristic: heavily repeated messages suggest errors arriving
            # in quick succession rather than in isolation
            time_clustered = any(msg.get("count", 0) > 3 for msg in top_messages)
            if time_clustered:
                insights.append(
                    {
                        "type": "possible_cascade",
                        "severity": "high",
                        "description": "Errors may be cascading (multiple errors in short time)",
                        "evidence": top_messages[:2],
                        "suggestion": "Look for root cause - later errors may be symptoms",
                    }
                )
                suggestions.append("Check timestamps - investigate earliest error first")

    # Insight 4: Thread analysis
    for meta in metadata:
        unique_correlations = meta.get("unique_correlation_ids", 0)

        if error_count > 0 and unique_correlations > 0:
            # Some threads are failing
            insights.append(
                {
                    "type": "thread_failures",
                    "severity": "medium",
                    "description": f"Errors across {unique_correlations} different requests",
                    "evidence": {"unique_correlations": unique_correlations},
                    "suggestion": "Compare successful vs failed requests",
                }
            )
            next_steps.append("Use: compare_threads() to find differences")

    # Generate suggestions based on insights
    if not suggestions:
        if error_count > 0:
            suggestions.append("Start by examining the first error - it may be the root cause")
            suggestions.append("Use follow_thread() to see full request flow")
        else:
            suggestions.append("No errors found - logs look healthy")

    # Overview
    overview = {
        "total_logs": total,
        "error_count": error_count,
        "error_rate": error_count / total if total > 0 else 0,
        "files_analyzed": len(files),
        "log_levels": levels,
    }

    return {
        "overview": overview,
        "insights": insights,
        "suggestions": suggestions,
        "next_steps": next_steps,
        "investigated_automatically": auto_investigate,
    }


def explain(
    entry: Optional[Dict[str, Any]] = None,
    error_message: Optional[str] = None,
    context: str = "general",
) -> str:
    """
    Explain a log entry or error message in simple terms.

    Perfect for when you encounter cryptic errors or need to understand
    what's happening. Provides human-friendly explanations and next steps.

    Args:
        entry: Log entry dictionary to explain
        error_message: Or just provide an error message string
        context: Context for explanation ("production", "development", "general")

    Returns:
        Human-friendly explanation string

    Example:
        # Explain a log entry
        explanation = explain(
            entry=error_entry,
            context="production"
        )
        print(explanation)

        # Explain just a message
        explanation = explain(
            error_message="Connection pool exhausted",
            context="production"
        )
    """
    if entry:
        message = entry.get("message", "")
        level = entry.get("level", "INFO")
    elif error_message:
        message = error_message
        level = "ERROR"
    else:
        return "No entry or message provided to explain"

    # Build explanation
    lines = []

    # What happened
    lines.append("## What This Means\n")

    # Pattern matching for common errors
    message_lower = message.lower()

    if "timeout" in message_lower or "timed out" in message_lower:
        lines.append("A timeout means an operation took too long and was cancelled.")
        lines.append("\n**Common causes:**")
        lines.append("- Database query is too slow")
        lines.append("- Network latency issues")
        lines.append("- Service is overloaded")
        lines.append("- Deadlock or infinite loop")
        lines.append("\n**Next steps:**")
        lines.append("1. Check what operation was timing out")
        lines.append("2. Look at the service being called - is it slow or down?")
        lines.append("3. Review timeout configuration - is it too short?")

    elif "connection" in message_lower and (
        "refused" in message_lower or "failed" in message_lower
    ):
        lines.append("A connection failure means the application couldn't reach another service.")
        lines.append("\n**Common causes:**")
        lines.append("- Service is down or not responding")
        lines.append("- Network connectivity issues")
        lines.append("- Firewall blocking the connection")
        lines.append("- Wrong hostname/port configuration")
        lines.append("\n**Next steps:**")
        lines.append("1. Check if the target service is running")
        lines.append("2. Verify network connectivity")
        lines.append("3. Check configuration (hostname, port, etc.)")

    elif "pool exhausted" in message_lower or "too many connections" in message_lower:
        lines.append("The connection pool is exhausted - all available connections are in use.")
        lines.append("\n**Common causes:**")
        lines.append("- Traffic spike overwhelming the system")
        lines.append("- Connection leaks (not closing connections)")
        lines.append("- Pool size too small for the load")
        lines.append("- Slow queries holding connections too long")
        lines.append("\n**Next steps:**")
        lines.append("1. Check connection pool size configuration")
        lines.append("2. Look for connection leaks in code")
        lines.append("3. Identify slow operations holding connections")
        lines.append("4. Consider increasing pool size if load is legitimate")

    elif "out of memory" in message_lower or "outofmemoryerror" in message_lower:
        lines.append("The application ran out of available memory.")
        lines.append("\n**Common causes:**")
        lines.append("- Memory leak (memory not being freed)")
        lines.append("- Processing too much data at once")
        lines.append("- Insufficient memory allocated")
        lines.append("- Caching too aggressively")
        lines.append("\n**Next steps:**")
        lines.append("1. Check memory allocation settings")
        lines.append("2. Look for memory leaks")
        lines.append("3. Review data processing - can it be batched/streamed?")
        lines.append("4. Check garbage collection logs")

    elif "null" in message_lower and ("pointer" in message_lower or "reference" in message_lower):
        lines.append("Tried to use something that doesn't exist (null/None).")
        lines.append("\n**Common causes:**")
        lines.append("- Missing input validation")
        lines.append("- Unexpected missing data")
        lines.append("- Race condition")
        lines.append("- API returned unexpected null")
        lines.append("\n**Next steps:**")
        lines.append("1. Check the stack trace to find where it happened")
        lines.append("2. Add null checks and validation")
        lines.append("3. Review why the value was null")

    elif (
        "permission" in message_lower
        or "access denied" in message_lower
        or "forbidden" in message_lower
    ):
        lines.append("The application doesn't have permission to perform this action.")
        lines.append("\n**Common causes:**")
        lines.append("- Incorrect file/resource permissions")
        lines.append("- Wrong user/service account")
        lines.append("- Missing IAM roles or policies")
        lines.append("- Authentication token expired")
        lines.append("\n**Next steps:**")
        lines.append("1. Check file/resource permissions")
        lines.append("2. Verify the application is running as the correct user")
        lines.append("3. Review access control policies")

    else:
        # Generic explanation
        if level in ["ERROR", "FATAL"]:
            lines.append("This is an error that prevented normal operation.")
            lines.append(f"\nError message: `{message}`")
            lines.append("\n**Next steps:**")
            lines.append("1. Look at the full stack trace if available")
            lines.append("2. Check what operation was being performed")
            lines.append("3. Look for similar errors - is this a pattern?")
            lines.append("4. Check if there were recent changes (deployment, config)")
        elif level == "WARN":
            lines.append("This is a warning - not critical but worth investigating.")
            lines.append(f"\nMessage: `{message}`")
            lines.append("\n**Next steps:**")
            lines.append("1. Determine if this warning is expected")
            lines.append("2. Check if it's happening frequently")
            lines.append("3. Consider if it could become a problem")
        else:
            lines.append("This is an informational message.")
            lines.append(f"\nMessage: `{message}`")

    # Context-specific advice
    if context == "production":
        lines.append("\n**Production Context:**")
        lines.append("- Check monitoring dashboards for patterns")
        lines.append("- Review recent deployments")
        lines.append("- Consider impact on users")
        lines.append("- Prepare rollback plan if needed")

    return "\n".join(lines)


def suggest_next_action(
    current_results: Dict[str, Any], investigation_context: Optional[Dict] = None
) -> List[str]:
    """
    Suggest what to investigate next based on current results.

    Args:
        current_results: Results from previous operation (search, pattern finding, etc.)
        investigation_context: Optional context about what's been investigated so far

    Returns:
        List of suggested next actions with example code
    """
    suggestions = []

    # Based on search results
    if "total_matches" in current_results:
        total = current_results["total_matches"]
        if total == 0:
            suggestions.append("No matches found. Try:")
            suggestions.append("  - Broaden search (remove filters)")
            suggestions.append("  - Check different log files")
            suggestions.append("  - Verify time range")
        elif total > 1000:
            suggestions.append(f"Large result set ({total} matches). Consider:")
            suggestions.append("  - Use output_format='summary' for token efficiency")
            suggestions.append("  - Add more filters (level, time range, thread_id)")
            suggestions.append("  - Use smart_sample() to get representative sample")
        elif total > 0:
            # Good result size, suggest next steps
            if "top_messages" in current_results:
                top_msg = (
                    current_results["top_messages"][0] if current_results["top_messages"] else None
                )
                if top_msg and top_msg.get("count", 0) > 3:
                    suggestions.append("Repeated errors detected. Next:")
                    suggestions.append("  - find_patterns(files, min_occurrences=3)")
                    suggestions.append("  - Follow one of these threads to see full context")

    # Based on patterns
    if "patterns" in current_results:
        pattern_count = len(current_results.get("patterns", []))
        if pattern_count > 0:
            suggestions.append(f"Found {pattern_count} patterns. Next:")
            suggestions.append("  - Compare successful vs failed requests")
            suggestions.append("  - Check timestamps - are they clustered?")

    # Default suggestions
    if not suggestions:
        suggestions.append("Continue investigation:")
        suggestions.append("  - analyze_with_insights(files) - Get automatic insights")
        suggestions.append("  - find_patterns(files) - Find repeated issues")
        suggestions.append("  - compare_time_periods() - Before/after analysis")

    return suggestions


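# Usage sketch (illustrative): chain suggest_next_action() onto any result.
#
#     results = search(["app.log"], level="ERROR", output_format="summary")
#     for tip in suggest_next_action(results):
#         print(tip)
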
def _load_files_with_config(
    inv: Any,
    files: List[str],
    parser_format: Optional[str] = None,
    custom_regex: Optional[str] = None,
):
    """Load files with optional parser config; falls back to plain load if config not supported."""
    try:
        if parser_format or custom_regex:
            return inv.load_files_with_config(files, parser_format, custom_regex)
    except Exception:
        # Fall back silently to default loader if enhanced path fails
        pass
    return inv.load_files(files)