logler-1.0.7-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
logler/investigate.py ADDED
@@ -0,0 +1,3962 @@
+ """
+ LLM Investigation Module - High-performance log investigation powered by Rust
+
+ This module provides fast log parsing, searching, and investigation capabilities
+ specifically designed for LLM agents like Claude.
+
+ Example Usage:
+     import logler.investigate as investigate
+
+     # Search for errors
+     results = investigate.search(
+         files=["app.log"],
+         query="database timeout",
+         level="ERROR",
+         limit=10
+     )
+
+     # Follow a thread
+     timeline = investigate.follow_thread(
+         files=["app.log"],
+         thread_id="worker-1"
+     )
+
+     # Find patterns
+     patterns = investigate.find_patterns(
+         files=["app.log"],
+         min_occurrences=3
+     )
+ """
+
+ import json
+ import re
+ import warnings
+ from typing import List, Optional, Dict, Any, Tuple
+ from datetime import datetime
+ from collections import defaultdict
+
+ from .safe_regex import try_compile
+
+ try:
+     import logler_rs
+
+     RUST_AVAILABLE = True
+ except ImportError:
+     try:
+         from .bootstrap import ensure_rust_backend
+
+         if ensure_rust_backend():
+             import logler_rs  # type: ignore
+
+             RUST_AVAILABLE = True
+         else:
+             RUST_AVAILABLE = False
+             warnings.warn("Rust backend not available. Using Python fallback.", stacklevel=2)
+     except (ImportError, AttributeError, OSError):
+         RUST_AVAILABLE = False
+         warnings.warn("Rust backend not available. Using Python fallback.", stacklevel=2)
+
+
+ def _normalize_entry(entry: Dict[str, Any]) -> None:
+     """Normalize a single log entry in-place (e.g., ensure uppercase levels)."""
+     if not isinstance(entry, dict):
+         return
+     level = entry.get("level")
+     if isinstance(level, str):
+         entry["level"] = level.upper()
+     raw = entry.get("raw") or ""
+     # Detect the format from the raw content; fall back to PlainText only when unset
+     if isinstance(raw, str):
+         stripped = raw.lstrip()
+         if stripped.startswith("{"):
+             entry["format"] = "Json"
+         elif stripped.startswith("<") and stripped[1:2].isdigit():
+             entry["format"] = "Syslog"
+         elif "level=" in raw or " msg=" in raw:
+             entry["format"] = "Logfmt"
+         elif entry.get("format") is None:
+             entry["format"] = "PlainText"
+     if entry.get("level") is None and isinstance(raw, str):
+         inferred = _infer_syslog_level(raw)
+         entry["level"] = inferred or "UNKNOWN"
+
+
+ def _normalize_entries(entries: List[Dict[str, Any]]) -> None:
+     for entry in entries or []:
+         _normalize_entry(entry)
+
+
+ def _normalize_search_result_levels(result: Dict[str, Any]) -> None:
+     """Ensure search results and their contexts use consistent level casing."""
+     for item in result.get("results", []) or []:
+         _normalize_entry(item.get("entry", {}))
+         for ctx in item.get("context_before", []) or []:
+             _normalize_entry(ctx)
+         for ctx in item.get("context_after", []) or []:
+             _normalize_entry(ctx)
+
+
+ def _apply_custom_regex_to_results(result: Dict[str, Any], pattern: Optional[str]) -> None:
+     """Apply a user-provided regex to fill missing fields like timestamp/level."""
+     if not pattern:
+         return
+     regex = try_compile(pattern)
+     if regex is None:
+         return
+
+     for item in result.get("results", []) or []:
+         _apply_custom_regex_to_entry(item.get("entry", {}), regex)
+         for ctx in item.get("context_before", []) or []:
+             _apply_custom_regex_to_entry(ctx, regex)
+         for ctx in item.get("context_after", []) or []:
+             _apply_custom_regex_to_entry(ctx, regex)
+
+
+ def _apply_custom_regex_to_entry(entry: Dict[str, Any], regex: re.Pattern[str]) -> None:
+     if not isinstance(entry, dict):
+         return
+     raw = entry.get("raw") or entry.get("message") or ""
+     match = regex.match(raw)
+     if not match:
+         return
+
+     groups = match.groupdict()
+     ts_val = groups.get("timestamp")
+     if ts_val and not entry.get("timestamp"):
+         parsed = _parse_timestamp_flex(ts_val)
+         if parsed:
+             entry["timestamp"] = parsed
+     if groups.get("level") and not entry.get("level"):
+         entry["level"] = groups["level"].upper()
+     if groups.get("message") and entry.get("message") == raw:
+         entry["message"] = groups["message"]
+     if groups.get("thread") and not entry.get("thread_id"):
+         entry["thread_id"] = groups["thread"]
+     if groups.get("correlation_id") and not entry.get("correlation_id"):
+         entry["correlation_id"] = groups["correlation_id"]
+     entry["format"] = "Custom"
+
+
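+ # Illustrative note: the named groups consumed above are timestamp, level,
+ # message, thread, and correlation_id. A hypothetical custom_regex for lines
+ # like "2024-01-15 10:00:00 [worker-1] ERROR something broke" (shown only for
+ # the expected group names, not taken from this package's docs):
+ #
+ #     r"(?P<timestamp>\S+ \S+) \[(?P<thread>[^\]]+)\] (?P<level>\w+) (?P<message>.*)"
+
+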
+ def _normalize_pattern_examples(result: Dict[str, Any]) -> None:
+     """Normalize example entries inside pattern detection results."""
+     for pattern in result.get("patterns", []) or []:
+         for example in pattern.get("examples", []) or []:
+             _normalize_entry(example)
+
+
+ def _infer_syslog_level(raw: str) -> Optional[str]:
+     match = re.match(r"<(?P<priority>\d+)>", raw.strip())
+     if not match:
+         return None
+     try:
+         priority = int(match.group("priority"))
+     except ValueError:
+         return None
+     severity = priority & 0x07
+     if severity == 0:
+         return "FATAL"
+     if severity <= 3:
+         return "ERROR"
+     if severity == 4:
+         return "WARN"
+     if severity <= 6:
+         return "INFO"
+     return "DEBUG"
+
+
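+ # Worked example of the severity math above: syslog encodes priority as
+ # facility * 8 + severity, so the low three bits recover severity.
+ # "<134>" -> 134 & 0x07 == 6 -> "INFO"; "<11>" -> 11 & 0x07 == 3 -> "ERROR".
+
+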
+ def _parse_timestamp_flex(value: str) -> Optional[str]:
+     for fmt in ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%d %H:%M:%S", "%d-%m-%Y %H:%M:%S"):
+         try:
+             dt = datetime.strptime(value.replace("Z", "+0000"), fmt)
+             return dt.isoformat()
+         except Exception:
+             continue
+     try:
+         return datetime.fromisoformat(value.replace("Z", "+00:00")).isoformat()
+     except Exception:
+         return None
+
+
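+ # Illustrative inputs for the flexible parser above: "2024-01-15T10:00:00Z"
+ # matches the first format once "Z" is rewritten to "+0000", and
+ # "15-01-2024 10:00:00" falls through to the day-first format; anything
+ # unrecognized yields None rather than raising.
+
+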
+ def _normalize_context_payload(payload: Dict[str, Any]) -> None:
+     """Normalize context payload returned from Rust backend."""
+     _normalize_entry(payload.get("target", {}))
+     _normalize_entries(payload.get("context_before", []))
+     _normalize_entries(payload.get("context_after", []))
+
+
+ def search(
+     files: List[str],
+     query: Optional[str] = None,
+     level: Optional[str] = None,
+     thread_id: Optional[str] = None,
+     correlation_id: Optional[str] = None,
+     limit: Optional[int] = None,
+     context_lines: int = 3,
+     output_format: str = "full",
+     parser_format: Optional[str] = None,
+     custom_regex: Optional[str] = None,
+ ) -> Dict[str, Any]:
+     """
+     Search logs with filters.
+
+     Args:
+         files: List of log file paths
+         query: Search query string
+         level: Filter by log level (ERROR, WARN, INFO, etc.)
+         thread_id: Filter by thread ID
+         correlation_id: Filter by correlation ID
+         limit: Maximum number of results
+         context_lines: Number of context lines before/after each result
+         output_format: Output format - "full", "summary", "count", or "compact"
+             - "full": Complete log entries (default)
+             - "summary": Aggregated summary with examples
+             - "count": Just counts, no log content
+             - "compact": Essential fields only (no raw logs)
+         parser_format: Optional log format hint
+         custom_regex: Optional custom parsing regex
+
+     Returns:
+         Dictionary with search results (format depends on output_format):
+
+         For "full":
+             {
+                 "results": [...],  # Full entries
+                 "total_matches": 123,
+                 "search_time_ms": 45
+             }
+
+         For "summary":
+             {
+                 "total_matches": 123,
+                 "unique_messages": 15,
+                 "log_levels": {"ERROR": 100, "WARN": 23},
+                 "top_messages": [
+                     {"message": "...", "count": 50, "first_seen": "...", "last_seen": "..."},
+                     ...
+                 ],
+                 "sample_entries": [...]  # 3-5 examples
+             }
+
+         For "count":
+             {
+                 "total_matches": 123,
+                 "by_level": {"ERROR": 100, "WARN": 23},
+                 "by_file": {"app.log": 80, "api.log": 43},
+                 "time_range": {"start": "...", "end": "..."}
+             }
+
+         For "compact":
+             {
+                 "matches": [
+                     {"time": "...", "level": "ERROR", "msg": "...", "thread": "..."},
+                     ...
+                 ],
+                 "total": 123
+             }
+     """
+     if not RUST_AVAILABLE:
+         raise RuntimeError("Rust backend not available")
+
+     investigator = logler_rs.PyInvestigator()
+     _load_files_with_config(investigator, files, parser_format, custom_regex)
+
+     # Build query
+     filters = {"levels": []}
+     if level:
+         level_map = {
+             "trace": "Trace",
+             "debug": "Debug",
+             "info": "Info",
+             "warn": "Warn",
+             "warning": "Warn",
+             "error": "Error",
+             "fatal": "Fatal",
+             "critical": "Fatal",
+         }
+         normalized_level = level_map.get(level.lower())
+         if not normalized_level:
+             raise ValueError(f"Unknown log level: {level}")
+         filters["levels"] = [normalized_level]
+     if thread_id:
+         filters["thread_id"] = thread_id
+     if correlation_id:
+         filters["correlation_id"] = correlation_id
+
+     query_dict = {
+         "files": files,
+         "query": query,
+         "filters": filters,
+         "limit": limit,
+         "context_lines": context_lines,
+     }
+
+     # Call Rust engine with the full query payload
+     result_json = investigator.search(json.dumps(query_dict))
+     result = json.loads(result_json)
+     _normalize_search_result_levels(result)
+     _apply_custom_regex_to_results(result, custom_regex)
+
+     # Transform based on output_format
+     if output_format == "full":
+         return result
+     elif output_format == "summary":
+         return _format_as_summary(result)
+     elif output_format == "count":
+         return _format_as_count(result)
+     elif output_format == "compact":
+         return _format_as_compact(result)
+     else:
+         return result
+
+
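+ # Illustrative usage sketch (hypothetical file name; this helper is defined
+ # only to show the output_format variants and is never called):
+ def _example_search_usage():
+     # Aggregate view: per-level/per-file counts, no raw log content.
+     counts = search(files=["app.log"], query="timeout", output_format="count")
+     # Compact view: one small dict per match.
+     compact = search(files=["app.log"], level="ERROR", output_format="compact")
+     return counts["total_matches"], compact["total"]
+
+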
+ def follow_thread(
+     files: List[str],
+     thread_id: Optional[str] = None,
+     correlation_id: Optional[str] = None,
+     trace_id: Optional[str] = None,
+     parser_format: Optional[str] = None,
+     custom_regex: Optional[str] = None,
+ ) -> Dict[str, Any]:
+     """
+     Follow a thread/correlation/trace through log files.
+
+     Args:
+         files: List of log file paths
+         thread_id: Thread ID to follow
+         correlation_id: Correlation ID to follow
+         trace_id: Trace ID to follow
+         parser_format: Optional log format hint
+         custom_regex: Optional custom parsing regex
+
+     Returns:
+         Dictionary with timeline:
+         {
+             "entries": [...],
+             "total_entries": 42,
+             "duration_ms": 1523,
+             "unique_spans": [...]
+         }
+     """
+     if not RUST_AVAILABLE:
+         raise RuntimeError("Rust backend not available")
+
+     # Use Investigator when custom parsing is requested so parsing honors the config.
+     if parser_format or custom_regex:
+         inv = Investigator()
+         inv.load_files(files, parser_format=parser_format, custom_regex=custom_regex)
+         return inv.follow_thread(
+             thread_id=thread_id, correlation_id=correlation_id, trace_id=trace_id
+         )
+
+     result_json = logler_rs.follow_thread(files, thread_id, correlation_id, trace_id)
+     result = json.loads(result_json)
+     _normalize_entries(result.get("entries", []))
+     return result
+
+
+ def get_context(
+     file: str,
+     line_number: int,
+     lines_before: int = 10,
+     lines_after: int = 10,
+ ) -> Dict[str, Any]:
+     """
+     Get context around a specific log line.
+
+     Args:
+         file: Log file path
+         line_number: Line number to get context for
+         lines_before: Number of lines before
+         lines_after: Number of lines after
+
+     Returns:
+         Dictionary with context:
+         {
+             "target": {...},
+             "context_before": [...],
+             "context_after": [...],
+         }
+     """
+     if not RUST_AVAILABLE:
+         raise RuntimeError("Rust backend not available")
+
+     # Use Investigator class for more complex operations
+     investigator = logler_rs.PyInvestigator()
+     investigator.load_files([file])
+     result_json = investigator.get_context(file, line_number, lines_before, lines_after, False)
+     result = json.loads(result_json)
+     _normalize_context_payload(result)
+     return result
+
+
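+ # Illustrative call (hypothetical path and line number):
+ #
+ #     ctx = get_context("app.log", line_number=1542, lines_before=5, lines_after=5)
+ #     print(ctx["target"], len(ctx["context_before"]), len(ctx["context_after"]))
+
+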
+ def follow_thread_hierarchy(
+     files: List[str],
+     root_identifier: str,
+     max_depth: Optional[int] = None,
+     use_naming_patterns: bool = True,
+     use_temporal_inference: bool = True,
+     min_confidence: float = 0.0,
+     parser_format: Optional[str] = None,
+     custom_regex: Optional[str] = None,
+ ) -> Dict[str, Any]:
+     """
+     Build hierarchical tree of threads/spans showing parent-child relationships.
+
+     This detects sub-threads and nested operations using:
+     - Explicit parent_span_id fields (OpenTelemetry)
+     - Naming patterns (worker-1.task-a, main:subtask-1)
+     - Temporal inference (time-based proximity)
+
+     Args:
+         files: List of log file paths
+         root_identifier: Root thread ID, correlation ID, or span ID
+         max_depth: Maximum depth of hierarchy tree (default: unlimited)
+         use_naming_patterns: Enable naming pattern detection (default: True)
+         use_temporal_inference: Enable time-based inference (default: True)
+         min_confidence: Minimum confidence score (0.0-1.0, default: 0.0)
+         parser_format: Optional log format hint
+         custom_regex: Optional custom parsing regex
+
+     Returns:
+         Dictionary with hierarchical structure:
+         {
+             "roots": [
+                 {
+                     "id": "main-thread",
+                     "node_type": "Thread" | "Span" | "CorrelationGroup",
+                     "name": "Main Request Handler",
+                     "parent_id": null,
+                     "children": [
+                         {
+                             "id": "worker-1.db-query",
+                             "node_type": "Span",
+                             "name": "Database Query",
+                             "parent_id": "main-thread",
+                             "children": [],
+                             "entry_ids": [...],
+                             "start_time": "2024-01-15T10:00:00Z",
+                             "end_time": "2024-01-15T10:00:02Z",
+                             "duration_ms": 2000,
+                             "entry_count": 15,
+                             "error_count": 0,
+                             "level_counts": {"INFO": 12, "DEBUG": 3},
+                             "depth": 1,
+                             "confidence": 0.8,
+                             "relationship_evidence": ["Naming pattern: worker-1.db-query"]
+                         }
+                     ],
+                     "entry_ids": [...],
+                     "start_time": "2024-01-15T10:00:00Z",
+                     "end_time": "2024-01-15T10:00:05Z",
+                     "duration_ms": 5000,
+                     "entry_count": 42,
+                     "error_count": 2,
+                     "level_counts": {"INFO": 35, "ERROR": 2, "DEBUG": 5},
+                     "depth": 0,
+                     "confidence": 1.0,
+                     "relationship_evidence": []
+                 }
+             ],
+             "total_nodes": 8,
+             "max_depth": 3,
+             "total_duration_ms": 5000,
+             "concurrent_count": 2,
+             "bottleneck": {
+                 "node_id": "worker-1.db-query",
+                 "duration_ms": 2000,
+                 "percentage": 40.0,
+                 "depth": 1
+             },
+             "error_nodes": ["worker-2.api-call"],
+             "detection_method": "ExplicitParentId" | "NamingPattern" | "TemporalInference" | "Mixed"
+         }
+
+     Example:
+         # Detect OpenTelemetry trace hierarchy
+         hierarchy = follow_thread_hierarchy(
+             files=["app.log"],
+             root_identifier="trace-abc123",
+             min_confidence=0.8
+         )
+
+         # Print tree structure (see the print_tree sketch below)
+         for root in hierarchy['roots']:
+             print_tree(root, indent=0)
+
+         # Find bottleneck
+         if hierarchy['bottleneck']:
+             print(f"Bottleneck: {hierarchy['bottleneck']['node_id']} ({hierarchy['bottleneck']['duration_ms']}ms)")
+     """
+     if not RUST_AVAILABLE:
+         raise RuntimeError("Rust backend not available")
+
+     # Use Investigator when custom parsing is requested
+     if parser_format or custom_regex:
+         inv = Investigator()
+         inv.load_files(files, parser_format=parser_format, custom_regex=custom_regex)
+         return inv.build_hierarchy(
+             root_identifier=root_identifier,
+             max_depth=max_depth,
+             use_naming_patterns=use_naming_patterns,
+             use_temporal_inference=use_temporal_inference,
+             min_confidence=min_confidence,
+         )
+
+     # Call Rust directly for better performance
+     result_json = logler_rs.build_hierarchy(
+         files,
+         root_identifier,
+         max_depth,
+         use_naming_patterns,
+         use_temporal_inference,
+         min_confidence,
+     )
+     return json.loads(result_json)
+
+
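+ # The docstring example above calls a print_tree helper that this module does
+ # not define; a minimal sketch, assuming only the documented node keys
+ # (id, duration_ms, children), could look like this:
+ def print_tree(node: Dict[str, Any], indent: int = 0) -> None:
+     """Recursively print a hierarchy node and its children (illustrative)."""
+     duration = node.get("duration_ms")
+     suffix = f" ({duration}ms)" if duration else ""
+     print("  " * indent + f"{node.get('id')}{suffix}")
+     for child in node.get("children", []):
+         print_tree(child, indent + 1)
+
+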
+ def get_hierarchy_summary(hierarchy: Dict[str, Any]) -> str:
+     """
+     Generate a human-readable summary of a thread hierarchy.
+
+     Args:
+         hierarchy: Hierarchy dictionary from follow_thread_hierarchy()
+
+     Returns:
+         Formatted text summary
+
+     Example:
+         hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
+         summary = get_hierarchy_summary(hierarchy)
+         print(summary)
+     """
+     lines = []
+
+     # Overview
+     lines.append("=== Thread Hierarchy Summary ===")
+     lines.append(f"Total nodes: {hierarchy.get('total_nodes', 0)}")
+     lines.append(f"Max depth: {hierarchy.get('max_depth', 0)}")
+     lines.append(f"Detection method: {hierarchy.get('detection_method', 'Unknown')}")
+
+     # Duration
+     total_duration = hierarchy.get("total_duration_ms")
+     if total_duration:
+         lines.append(f"Total duration: {total_duration}ms ({total_duration/1000:.2f}s)")
+
+     # Concurrent operations
+     concurrent = hierarchy.get("concurrent_count", 0)
+     if concurrent > 1:
+         lines.append(f"Concurrent operations: {concurrent}")
+
+     # Bottleneck
+     bottleneck = hierarchy.get("bottleneck")
+     if bottleneck:
+         lines.append("")
+         lines.append("⚠️ BOTTLENECK DETECTED:")
+         lines.append(f" Node: {bottleneck.get('node_id')}")
+         lines.append(
+             f" Duration: {bottleneck.get('duration_ms')}ms ({bottleneck.get('percentage', 0):.1f}% of total)"
+         )
+         lines.append(f" Depth: {bottleneck.get('depth')}")
+
+     # Errors
+     error_nodes = hierarchy.get("error_nodes", [])
+     if error_nodes:
+         lines.append("")
+         lines.append(f"❌ Errors in {len(error_nodes)} node(s):")
+         for node_id in error_nodes[:5]:  # Show first 5
+             lines.append(f" - {node_id}")
+         if len(error_nodes) > 5:
+             lines.append(f" ... and {len(error_nodes) - 5} more")
+
+     # Tree structure preview
+     roots = hierarchy.get("roots", [])
+     if roots:
+         lines.append("")
+         lines.append("Tree Structure:")
+         for root in roots[:3]:  # Show first 3 roots
+             lines.append(
+                 f" 📁 {root.get('id')} ({root.get('entry_count', 0)} entries, {len(root.get('children', []))} children)"
+             )
+             _append_tree_preview(root, lines, depth=1, max_depth=2)
+         if len(roots) > 3:
+             lines.append(f" ... and {len(roots) - 3} more root(s)")
+
+     return "\n".join(lines)
+
+
+ def _append_tree_preview(node: Dict[str, Any], lines: List[str], depth: int, max_depth: int):
+     """Helper to append tree preview to lines"""
+     if depth >= max_depth:
+         return
+
+     children = node.get("children", [])
+     for i, child in enumerate(children[:3]):  # Show first 3 children
+         is_last = i == len(children) - 1
+         prefix = " " * depth + ("└─ " if is_last else "├─ ")
+
+         error_marker = "❌ " if child.get("error_count", 0) > 0 else ""
+         duration = child.get("duration_ms", 0)
+         duration_str = f" ({duration}ms)" if duration > 0 else ""
+
+         lines.append(
+             f"{prefix}{error_marker}{child.get('id')} ({child.get('entry_count', 0)} entries){duration_str}"
+         )
+         _append_tree_preview(child, lines, depth + 1, max_depth)
+
+     if len(children) > 3:
+         prefix = " " * depth + "└─ "
+         lines.append(f"{prefix}... and {len(children) - 3} more")
+
+
+ def analyze_error_flow(
+     hierarchy: Dict[str, Any],
+     include_context: bool = True,
+ ) -> Dict[str, Any]:
+     """
+     Analyze error propagation through a hierarchy to identify root causes and cascading failures.
+
+     This function traces errors through parent-child relationships to find:
+     - Root cause: The first/originating error in the chain
+     - Propagation chain: How errors cascaded through the system
+     - Affected nodes: All nodes impacted by the error
+     - Impact assessment: Severity and scope of the failure
+
+     Args:
+         hierarchy: Hierarchy dictionary from follow_thread_hierarchy()
+         include_context: Include sample error messages (default: True; currently unused)
+
+     Returns:
+         Dictionary with error flow analysis:
+         {
+             "has_errors": bool,
+             "total_error_nodes": int,
+             "root_causes": [
+                 {
+                     "node_id": "redis-write",
+                     "node_type": "Span",
+                     "error_count": 1,
+                     "depth": 3,
+                     "timestamp": "2024-01-15T10:00:01.020Z",
+                     "path": ["api-gateway", "product-service", "cache-update", "redis-write"],
+                     "is_leaf": True,
+                     "confidence": 0.95
+                 }
+             ],
+             "propagation_chains": [
+                 {
+                     "root_cause": "redis-write",
+                     "chain": [
+                         {"node_id": "redis-write", "error_count": 1, "depth": 3},
+                         {"node_id": "cache-update", "error_count": 1, "depth": 2},
+                         {"node_id": "product-service", "error_count": 1, "depth": 1}
+                     ],
+                     "total_affected": 3,
+                     "propagation_type": "upward"  # errors bubbled up to parent
+                 }
+             ],
+             "impact_summary": {
+                 "total_affected_nodes": 5,
+                 "affected_percentage": 35.7,
+                 "max_propagation_depth": 3,
+                 "concurrent_failures": 2
+             },
+             "recommendations": [
+                 "Investigate redis-write first - it appears to be the root cause",
+                 "Consider adding retry logic for cache operations",
+                 "3 nodes show cascading failures from a single source"
+             ]
+         }
+
+     Example:
+         hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
+         error_analysis = analyze_error_flow(hierarchy)
+
+         if error_analysis['has_errors']:
+             print(f"Root cause: {error_analysis['root_causes'][0]['node_id']}")
+             for rec in error_analysis['recommendations']:
+                 print(f" - {rec}")
+     """
+     result = {
+         "has_errors": False,
+         "total_error_nodes": 0,
+         "root_causes": [],
+         "propagation_chains": [],
+         "impact_summary": {
+             "total_affected_nodes": 0,
+             "affected_percentage": 0.0,
+             "max_propagation_depth": 0,
+             "concurrent_failures": 0,
+         },
+         "recommendations": [],
+     }
+
+     error_nodes = hierarchy.get("error_nodes", [])
+     if not error_nodes:
+         return result
+
+     result["has_errors"] = True
+     result["total_error_nodes"] = len(error_nodes)
+
+     # Build node lookup and parent mapping
+     all_nodes = {}
+     parent_map = {}  # child_id -> parent_id
+
+     def collect_nodes(node: Dict[str, Any], parent_id: Optional[str] = None):
+         node_id = node.get("id")
+         if node_id:
+             all_nodes[node_id] = node
+             if parent_id:
+                 parent_map[node_id] = parent_id
+         for child in node.get("children", []):
+             collect_nodes(child, node_id)
+
+     for root in hierarchy.get("roots", []):
+         collect_nodes(root)
+
+     # Find root causes (errors at leaf nodes or deepest error in each chain)
+     error_node_data = []
+     for node_id in error_nodes:
+         node = all_nodes.get(node_id)
+         if node:
+             error_node_data.append(
+                 {
+                     "node_id": node_id,
+                     "node_type": node.get("node_type", "Unknown"),
+                     "error_count": node.get("error_count", 0),
+                     "depth": node.get("depth", 0),
+                     "timestamp": node.get("start_time"),
+                     "is_leaf": len(node.get("children", [])) == 0,
+                     "children_with_errors": sum(
+                         1 for c in node.get("children", []) if c.get("error_count", 0) > 0
+                     ),
+                 }
+             )
+
+     # Sort by depth (deepest first) and timestamp (earliest first)
+     error_node_data.sort(key=lambda x: (-x["depth"], x["timestamp"] or ""))
+
+     # Identify root causes - errors that didn't come from children
+     root_causes = []
+
+     for error_node in error_node_data:
+         node_id = error_node["node_id"]
+
+         # Build path from root to this node
+         path = []
+         current = node_id
+         while current:
+             path.insert(0, current)
+             current = parent_map.get(current)
+
+         # Check if this is a root cause (no child errors, or leaf node)
+         if error_node["children_with_errors"] == 0:
+             # Calculate confidence based on evidence
+             confidence = 1.0 if error_node["is_leaf"] else 0.85
+
+             root_causes.append(
+                 {
+                     "node_id": node_id,
+                     "node_type": error_node["node_type"],
+                     "error_count": error_node["error_count"],
+                     "depth": error_node["depth"],
+                     "timestamp": error_node["timestamp"],
+                     "path": path,
+                     "is_leaf": error_node["is_leaf"],
+                     "confidence": confidence,
+                 }
+             )
+
+     result["root_causes"] = root_causes
+
+     # Build propagation chains (trace errors upward from root causes)
+     propagation_chains = []
+
+     for root_cause in root_causes:
+         chain = []
+         current_id = root_cause["node_id"]
+
+         # Walk up the tree
+         while current_id:
+             node = all_nodes.get(current_id)
+             if node:
+                 chain.append(
+                     {
+                         "node_id": current_id,
+                         "error_count": node.get("error_count", 0),
+                         "depth": node.get("depth", 0),
+                     }
+                 )
+             current_id = parent_map.get(current_id)
+
+         # Only include chains where errors actually propagated
+         if len(chain) > 1:
+             # Check if parent nodes also have errors
+             propagated_chain = [c for c in chain if c["error_count"] > 0]
+             if len(propagated_chain) > 1:
+                 propagation_chains.append(
+                     {
+                         "root_cause": root_cause["node_id"],
+                         "chain": propagated_chain,
+                         "total_affected": len(propagated_chain),
+                         "propagation_type": "upward",
+                     }
+                 )
+
+     result["propagation_chains"] = propagation_chains
+
+     # Calculate impact summary
+     total_nodes = hierarchy.get("total_nodes", 1)
+     affected_nodes = len(set(error_nodes))
+     max_depth = max((rc["depth"] for rc in root_causes), default=0)
+
+     # Count concurrent failures (root causes at same depth)
+     depth_counts = defaultdict(int)
+     for rc in root_causes:
+         depth_counts[rc["depth"]] += 1
+     concurrent = max(depth_counts.values(), default=0)
+
+     result["impact_summary"] = {
+         "total_affected_nodes": affected_nodes,
+         "affected_percentage": (affected_nodes / total_nodes * 100) if total_nodes > 0 else 0,
+         "max_propagation_depth": max_depth,
+         "concurrent_failures": concurrent if concurrent > 1 else 0,
+     }
+
+     # Generate recommendations
+     recommendations = []
+
+     if root_causes:
+         primary_cause = root_causes[0]
+         recommendations.append(
+             f"Investigate {primary_cause['node_id']} first - it appears to be the root cause"
+         )
+
+         if primary_cause["is_leaf"]:
+             recommendations.append(
+                 f"Error originated at leaf node (depth {primary_cause['depth']}) - check external dependencies"
+             )
+
+     if len(propagation_chains) > 0:
+         total_propagated = sum(c["total_affected"] for c in propagation_chains)
+         recommendations.append(
+             f"{total_propagated} nodes show cascading failures - consider adding circuit breakers"
+         )
+
+     if concurrent > 1:
+         recommendations.append(
+             f"{concurrent} concurrent failures detected - possible systemic issue"
+         )
+
+     if result["impact_summary"]["affected_percentage"] > 50:
+         recommendations.append(
+             "High impact failure (>50% of nodes affected) - prioritize investigation"
+         )
+
+     result["recommendations"] = recommendations
+
+     return result
+
+
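+ # Self-contained illustration of the root-cause logic above (hypothetical
+ # data): the leaf "db-write" holds the originating error, and the chain walks
+ # upward to "api". Defined only as an example; never called.
+ def _example_error_flow():
+     hierarchy = {
+         "total_nodes": 2,
+         "error_nodes": ["api", "db-write"],
+         "roots": [
+             {
+                 "id": "api",
+                 "error_count": 1,
+                 "depth": 0,
+                 "children": [
+                     {"id": "db-write", "error_count": 1, "depth": 1, "children": []}
+                 ],
+             }
+         ],
+     }
+     analysis = analyze_error_flow(hierarchy)
+     # Expect root cause "db-write" and one upward chain covering both nodes.
+     return analysis["root_causes"][0]["node_id"], analysis["propagation_chains"]
+
+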
+ def format_error_flow(
+     error_analysis: Dict[str, Any],
+     show_chains: bool = True,
+     show_recommendations: bool = True,
+ ) -> str:
+     """
+     Format error flow analysis as human-readable text.
+
+     Args:
+         error_analysis: Error analysis from analyze_error_flow()
+         show_chains: Show propagation chains (default: True)
+         show_recommendations: Show recommendations (default: True)
+
+     Returns:
+         Formatted error flow string
+
+     Example:
+         hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
+         error_analysis = analyze_error_flow(hierarchy)
+         print(format_error_flow(error_analysis))
+     """
+     lines = []
+
+     if not error_analysis.get("has_errors"):
+         return "✅ No errors detected in hierarchy"
+
+     # Header
+     lines.append("=" * 70)
+     lines.append("🔍 ERROR FLOW ANALYSIS")
+     lines.append("=" * 70)
+     lines.append("")
+
+     # Summary
+     total = error_analysis.get("total_error_nodes", 0)
+     impact = error_analysis.get("impact_summary", {})
+     lines.append(f"Total error nodes: {total}")
+     lines.append(f"Affected: {impact.get('affected_percentage', 0):.1f}% of hierarchy")
+
+     if impact.get("concurrent_failures", 0) > 1:
+         lines.append(f"Concurrent failures: {impact['concurrent_failures']}")
+
+     lines.append("")
+
+     # Root Causes
+     root_causes = error_analysis.get("root_causes", [])
+     if root_causes:
+         lines.append("-" * 70)
+         lines.append("🔴 ROOT CAUSE(S)")
+         lines.append("-" * 70)
+
+         for i, cause in enumerate(root_causes, 1):
+             confidence_pct = int(cause.get("confidence", 0) * 100)
+             leaf_marker = " (leaf node)" if cause.get("is_leaf") else ""
+
+             lines.append(f"\n {i}. {cause['node_id']}{leaf_marker}")
+             lines.append(f" Type: {cause.get('node_type', 'Unknown')}")
+             lines.append(f" Errors: {cause.get('error_count', 0)}")
+             lines.append(f" Depth: {cause.get('depth', 0)}")
+             lines.append(f" Confidence: {confidence_pct}%")
+
+             if cause.get("timestamp"):
+                 lines.append(f" Time: {cause['timestamp']}")
+
+             if cause.get("path"):
+                 path_str = " → ".join(cause["path"])
+                 lines.append(f" Path: {path_str}")
+
+     # Propagation Chains
+     if show_chains:
+         chains = error_analysis.get("propagation_chains", [])
+         if chains:
+             lines.append("")
+             lines.append("-" * 70)
+             lines.append("📈 ERROR PROPAGATION")
+             lines.append("-" * 70)
+
+             for chain_data in chains:
+                 lines.append(f"\n From: {chain_data['root_cause']}")
+                 lines.append(f" Affected nodes: {chain_data['total_affected']}")
+                 lines.append(" Chain:")
+
+                 chain = chain_data.get("chain", [])
+                 for j, node in enumerate(chain):
+                     is_last = j == len(chain) - 1
+                     prefix = " └─" if is_last else " ├─"
+                     arrow = " ← ROOT CAUSE" if j == 0 else ""
+                     lines.append(
+                         f"{prefix} {node['node_id']} ({node['error_count']} errors){arrow}"
+                     )
+
+     # Recommendations
+     if show_recommendations:
+         recommendations = error_analysis.get("recommendations", [])
+         if recommendations:
+             lines.append("")
+             lines.append("-" * 70)
+             lines.append("💡 RECOMMENDATIONS")
+             lines.append("-" * 70)
+
+             for rec in recommendations:
+                 lines.append(f" • {rec}")
+
+     lines.append("")
+     lines.append("=" * 70)
+
+     return "\n".join(lines)
+
+
+ def detect_correlation_chains(
+     files: List[str],
+     root_correlation_id: Optional[str] = None,
+     chain_patterns: Optional[List[str]] = None,
+     parser_format: Optional[str] = None,
+ ) -> Dict[str, Any]:
+     """
+     Detect correlation ID chaining where one request spawns sub-requests.
+
+     This function identifies parent-child relationships between correlation IDs
+     by analyzing log messages for patterns like:
+     - "spawning request {child_id}"
+     - "child_correlation_id": "xxx"
+     - "parent_request_id": "xxx"
+
+     Args:
+         files: List of log file paths to analyze
+         root_correlation_id: Optional root correlation ID to start from
+         chain_patterns: Optional custom regex patterns for detecting chains
+         parser_format: Optional log format hint
+
+     Returns:
+         Dictionary with correlation chain information:
+         {
+             "chains": [
+                 {
+                     "parent_correlation_id": "req-123",
+                     "child_correlation_id": "subreq-456",
+                     "evidence": "Spawning sub-request subreq-456",
+                     "timestamp": "2024-01-15T10:00:00Z",
+                     "confidence": 0.9
+                 }
+             ],
+             "root_ids": ["req-123"],
+             "hierarchy": {
+                 "req-123": ["subreq-456", "subreq-789"],
+                 "subreq-456": ["subreq-456-a"]
+             },
+             "total_chains": 3
+         }
+
+     Example:
+         chains = detect_correlation_chains(
+             files=["app.log", "service.log"],
+             root_correlation_id="req-main-001"
+         )
+         for chain in chains['chains']:
+             print(f"{chain['parent_correlation_id']} -> {chain['child_correlation_id']}")
+     """
+     # Default patterns to detect correlation chaining
+     default_patterns = [
+         # Explicit field patterns
+         r'child_correlation_id["\s:=]+([a-zA-Z0-9_-]+)',
+         r'parent_correlation_id["\s:=]+([a-zA-Z0-9_-]+)',
+         r'parent_request_id["\s:=]+([a-zA-Z0-9_-]+)',
+         r'spawned_request["\s:=]+([a-zA-Z0-9_-]+)',
+         # Message patterns
+         r"[Ss]pawning (?:sub-?)?request[:\s]+([a-zA-Z0-9_-]+)",
+         r"[Cc]reating child request[:\s]+([a-zA-Z0-9_-]+)",
+         r"[Ff]orked to[:\s]+([a-zA-Z0-9_-]+)",
+         r"[Dd]elegating to[:\s]+([a-zA-Z0-9_-]+)",
+         r"[Ss]ub-?request[:\s]+([a-zA-Z0-9_-]+)",
+     ]
+
+     patterns = chain_patterns or default_patterns
+     compiled_patterns = [re.compile(p) for p in patterns]
+
+     # Read and parse logs
+     entries = []
+     if RUST_AVAILABLE:
+         for file_path in files:
+             result_json = logler_rs.search(
+                 [file_path],
+                 "",  # No query filter
+                 None,  # level
+                 None,  # thread_id
+                 None,  # correlation_id
+                 None,  # trace_id
+                 None,  # start_time
+                 None,  # end_time
+                 10000,  # limit - get many entries
+                 0,  # offset
+             )
+             result = json.loads(result_json)
+             entries.extend(result.get("entries", []))
+     else:
+         # Fallback to Python parsing
+         from .parser import LogParser
+
+         parser = LogParser()
+         for file_path in files:
+             with open(file_path, "r") as f:
+                 for line in f:
+                     entry = parser.parse_line(line)
+                     if entry:
+                         entries.append(entry.__dict__ if hasattr(entry, "__dict__") else entry)
+
+     # Detect chains
+     chains = []
+     hierarchy = defaultdict(list)
+     all_correlation_ids = set()
+
+     for entry in entries:
+         correlation_id = entry.get("correlation_id")
+         message = entry.get("message", "")
+         timestamp = entry.get("timestamp")
+         fields = entry.get("fields", {})
+
+         if correlation_id:
+             all_correlation_ids.add(correlation_id)
+
+         # Check explicit fields first
+         child_id = fields.get("child_correlation_id") or fields.get("spawned_request")
+         parent_id = fields.get("parent_correlation_id") or fields.get("parent_request_id")
+
+         if child_id and correlation_id:
+             chains.append(
+                 {
+                     "parent_correlation_id": correlation_id,
+                     "child_correlation_id": child_id,
+                     "evidence": f"Explicit field: child_correlation_id={child_id}",
+                     "timestamp": timestamp,
+                     "confidence": 1.0,
+                 }
+             )
+             hierarchy[correlation_id].append(child_id)
+             all_correlation_ids.add(child_id)
+
+         if parent_id and correlation_id:
+             chains.append(
+                 {
+                     "parent_correlation_id": parent_id,
+                     "child_correlation_id": correlation_id,
+                     "evidence": f"Explicit field: parent_correlation_id={parent_id}",
+                     "timestamp": timestamp,
+                     "confidence": 1.0,
+                 }
+             )
+             hierarchy[parent_id].append(correlation_id)
+             all_correlation_ids.add(parent_id)
+
+         # Check message patterns
+         for pattern in compiled_patterns:
+             match = pattern.search(message)
+             if match and correlation_id:
+                 detected_id = match.group(1)
+                 if detected_id != correlation_id:
+                     # Determine if it's a parent or child reference
+                     if "parent" in pattern.pattern.lower():
+                         chains.append(
+                             {
+                                 "parent_correlation_id": detected_id,
+                                 "child_correlation_id": correlation_id,
+                                 "evidence": f"Pattern match in message: {match.group(0)}",
+                                 "timestamp": timestamp,
+                                 "confidence": 0.85,
+                             }
+                         )
+                         hierarchy[detected_id].append(correlation_id)
+                     else:
+                         chains.append(
+                             {
+                                 "parent_correlation_id": correlation_id,
+                                 "child_correlation_id": detected_id,
+                                 "evidence": f"Pattern match in message: {match.group(0)}",
+                                 "timestamp": timestamp,
+                                 "confidence": 0.85,
+                             }
+                         )
+                         hierarchy[correlation_id].append(detected_id)
+                     all_correlation_ids.add(detected_id)
+
+     # Deduplicate chains
+     seen = set()
+     unique_chains = []
+     for chain in chains:
+         key = (chain["parent_correlation_id"], chain["child_correlation_id"])
+         if key not in seen:
+             seen.add(key)
+             unique_chains.append(chain)
+
+     # Find root IDs (correlation IDs that are never a child)
+     all_children = set()
+     for children in hierarchy.values():
+         all_children.update(children)
+
+     root_ids = [cid for cid in all_correlation_ids if cid not in all_children]
+
+     # Filter by root_correlation_id if specified
+     if root_correlation_id:
+         # Build the tree from root
+         def get_descendants(cid: str, seen: set) -> set:
+             if cid in seen:
+                 return set()
+             seen.add(cid)
+             result = {cid}
+             for child in hierarchy.get(cid, []):
+                 result.update(get_descendants(child, seen))
+             return result
+
+         relevant_ids = get_descendants(root_correlation_id, set())
+         unique_chains = [
+             c
+             for c in unique_chains
+             if c["parent_correlation_id"] in relevant_ids
+             or c["child_correlation_id"] in relevant_ids
+         ]
+         root_ids = [root_correlation_id] if root_correlation_id in root_ids else []
+
+     # Convert hierarchy to regular dict
+     hierarchy_dict = {k: list(set(v)) for k, v in hierarchy.items()}
+
+     return {
+         "chains": unique_chains,
+         "root_ids": sorted(root_ids),
+         "hierarchy": hierarchy_dict,
+         "total_chains": len(unique_chains),
+         "total_correlation_ids": len(all_correlation_ids),
+     }
+
+
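+ # Quick illustration of the message-pattern branch above (hypothetical line):
+ # re.search(r"[Ss]pawning (?:sub-?)?request[:\s]+([a-zA-Z0-9_-]+)",
+ #           "Spawning sub-request: subreq-456") captures "subreq-456", which is
+ # then recorded as a child of the entry's own correlation_id at 0.85 confidence.
+
+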
+ def build_hierarchy_with_correlation_chains(
+     files: List[str],
+     root_identifier: str,
+     include_correlation_chains: bool = True,
+     max_depth: Optional[int] = None,
+     use_naming_patterns: bool = True,
+     use_temporal_inference: bool = True,
+     min_confidence: float = 0.0,
+ ) -> Dict[str, Any]:
+     """
+     Build hierarchy that includes correlation ID chaining relationships.
+
+     This extends follow_thread_hierarchy by also detecting when one correlation ID
+     spawns sub-requests with different correlation IDs.
+
+     Args:
+         files: List of log file paths
+         root_identifier: Root correlation ID, thread ID, or span ID
+         include_correlation_chains: Whether to detect correlation chaining (default: True)
+         max_depth: Maximum hierarchy depth
+         use_naming_patterns: Enable naming pattern detection
+         use_temporal_inference: Enable temporal inference
+         min_confidence: Minimum confidence score
+
+     Returns:
+         Hierarchy dictionary with additional correlation chain information
+
+     Example:
+         hierarchy = build_hierarchy_with_correlation_chains(
+             files=["api.log", "worker.log"],
+             root_identifier="req-main-001",
+             include_correlation_chains=True
+         )
+         # hierarchy now includes sub-requests spawned by req-main-001
+     """
+     # First build the regular hierarchy
+     hierarchy = follow_thread_hierarchy(
+         files=files,
+         root_identifier=root_identifier,
+         max_depth=max_depth,
+         use_naming_patterns=use_naming_patterns,
+         use_temporal_inference=use_temporal_inference,
+         min_confidence=min_confidence,
+     )
+
+     if not include_correlation_chains:
+         return hierarchy
+
+     # Detect correlation chains
+     chains = detect_correlation_chains(files=files, root_correlation_id=root_identifier)
+
+     # Add chain information to hierarchy
+     hierarchy["correlation_chains"] = chains["chains"]
+     hierarchy["chained_correlation_ids"] = list(chains["hierarchy"].keys())
+
+     # If there are chained correlation IDs, we could optionally merge their hierarchies
+     # For now, just add metadata about them
+     if chains["total_chains"] > 0:
+         hierarchy["has_correlation_chains"] = True
+         hierarchy["correlation_chain_count"] = chains["total_chains"]
+
+         # Add note about additional correlation IDs that could be explored
+         child_ids = set()
+         for chain in chains["chains"]:
+             child_ids.add(chain["child_correlation_id"])
+
+         hierarchy["related_correlation_ids"] = sorted(child_ids)
+
+     return hierarchy
+
+
+ def analyze_bottlenecks(
+     hierarchy: Dict[str, Any],
+     threshold_percentage: float = 20.0,
+ ) -> Dict[str, Any]:
+     """
+     Heuristic bottleneck detection with optimization suggestions.
+
+     Analyzes hierarchy to identify:
+     - Primary bottleneck (longest duration)
+     - Secondary bottlenecks
+     - Potential parallelization opportunities
+     - Caching opportunities
+     - Circuit breaker recommendations
+
+     Args:
+         hierarchy: Hierarchy from follow_thread_hierarchy()
+         threshold_percentage: Minimum % of total time to be considered significant
+
+     Returns:
+         Dictionary with bottleneck analysis:
+         {
+             "primary_bottleneck": {...},
+             "secondary_bottlenecks": [...],
+             "optimization_suggestions": [...],
+             "parallelization_opportunities": [...],
+             "estimated_improvement_ms": float
+         }
+
+     Example:
+         hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
+         analysis = analyze_bottlenecks(hierarchy)
+         for suggestion in analysis['optimization_suggestions']:
+             print(f" - {suggestion}")
+     """
+     result = {
+         "primary_bottleneck": None,
+         "secondary_bottlenecks": [],
+         "optimization_suggestions": [],
+         "parallelization_opportunities": [],
+         "caching_opportunities": [],
+         "estimated_improvement_ms": 0,
+     }
+
+     total_duration = hierarchy.get("total_duration_ms", 0)
+     if total_duration <= 0:
+         return result
+
+     bottleneck = hierarchy.get("bottleneck")
+     if bottleneck:
+         result["primary_bottleneck"] = bottleneck
+
+     # Collect all nodes with timing
+     all_nodes = []
+
+     def collect_nodes(node: Dict[str, Any]):
+         duration = node.get("duration_ms", 0)
+         if duration and duration > 0:
+             percentage = (duration / total_duration) * 100
+             all_nodes.append(
+                 {
+                     "id": node.get("id"),
+                     "duration_ms": duration,
+                     "percentage": percentage,
+                     "depth": node.get("depth", 0),
+                     "children_count": len(node.get("children", [])),
+                     "is_leaf": len(node.get("children", [])) == 0,
+                     "error_count": node.get("error_count", 0),
+                 }
+             )
+         for child in node.get("children", []):
+             collect_nodes(child)
+
+     for root in hierarchy.get("roots", []):
+         collect_nodes(root)
+
+     # Sort by duration
+     all_nodes.sort(key=lambda x: -x["duration_ms"])
+
+     # Find secondary bottlenecks
+     for node in all_nodes[1:5]:  # Top 5 excluding primary
+         if node["percentage"] >= threshold_percentage:
+             result["secondary_bottlenecks"].append(node)
+
+     # Generate optimization suggestions
+     suggestions = []
+
+     # Check for parallelization opportunities
+     # Group nodes by depth as an approximation of independent sibling operations
+     depth_groups = defaultdict(list)
+     for node in all_nodes:
+         depth_groups[node["depth"]].append(node)
+
+     for depth, nodes in depth_groups.items():
+         if len(nodes) >= 2:
+             total_sibling_time = sum(n["duration_ms"] for n in nodes)
+             max_sibling_time = max(n["duration_ms"] for n in nodes)
+             savings = total_sibling_time - max_sibling_time
+
+             if savings > total_duration * 0.1:  # >10% potential savings
+                 sibling_names = [n["id"] for n in nodes[:3]]
+                 result["parallelization_opportunities"].append(
+                     {
+                         "depth": depth,
+                         "nodes": sibling_names,
+                         "potential_savings_ms": savings,
+                     }
+                 )
+                 suggestions.append(
+                     f"Parallelize operations at depth {depth} ({', '.join(sibling_names[:2])}) - "
+                     f"potential savings: {savings:.0f}ms"
+                 )
+
+     # Check for caching opportunities (repeated patterns)
+     leaf_nodes = [n for n in all_nodes if n["is_leaf"]]
+     if len(leaf_nodes) > 3:
+         avg_leaf_time = sum(n["duration_ms"] for n in leaf_nodes) / len(leaf_nodes)
+         slow_leaves = [n for n in leaf_nodes if n["duration_ms"] > avg_leaf_time * 2]
+         if slow_leaves:
+             suggestions.append(
+                 f"Consider caching for slow leaf operations: {', '.join(n['id'] for n in slow_leaves[:3])}"
+             )
+             result["caching_opportunities"] = [n["id"] for n in slow_leaves[:3]]
+
+     # Primary bottleneck specific suggestions
+     if bottleneck:
+         percentage = bottleneck.get("percentage", 0)
+         if percentage > 50:
+             suggestions.append(
+                 f"CRITICAL: {bottleneck['node_id']} takes {percentage:.0f}% of total time - prioritize optimization"
+             )
+         elif percentage > 30:
+             suggestions.append(
+                 f"IMPORTANT: Consider optimizing {bottleneck['node_id']} ({percentage:.0f}% of time)"
+             )
+
+         if bottleneck.get("depth", 0) > 2:
+             suggestions.append(
+                 f"Bottleneck is deep in call stack (depth {bottleneck['depth']}) - consider moving to async"
+             )
+
+     # Check for error-prone bottlenecks
+     error_nodes = [n for n in all_nodes if n["error_count"] > 0 and n["percentage"] > 10]
+     for node in error_nodes:
+         suggestions.append(
+             f"Add circuit breaker for {node['id']} - errors detected and {node['percentage']:.0f}% of time"
+         )
+
+     result["optimization_suggestions"] = suggestions
+
+     # Estimate potential improvement
+     if result["parallelization_opportunities"]:
+         result["estimated_improvement_ms"] = sum(
+             p["potential_savings_ms"] for p in result["parallelization_opportunities"]
+         )
+
+     return result
+
+
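+ # Worked example of the parallelization estimate above: three same-depth
+ # operations of 300ms, 200ms, and 100ms take 600ms serially but max 300ms if
+ # run concurrently, so potential_savings_ms = 300 (reported only when that
+ # exceeds 10% of the total duration).
+
+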
+ def diff_hierarchies(
+     hierarchy_a: Dict[str, Any],
+     hierarchy_b: Dict[str, Any],
+     label_a: str = "Before",
+     label_b: str = "After",
+ ) -> Dict[str, Any]:
+     """
+     Compare two hierarchies to identify performance changes.
+
+     Useful for before/after deployment comparisons, A/B testing,
+     or debugging performance regressions.
+
+     Args:
+         hierarchy_a: First hierarchy (baseline)
+         hierarchy_b: Second hierarchy (comparison)
+         label_a: Label for first hierarchy
+         label_b: Label for second hierarchy
+
+     Returns:
+         Dictionary with comparison results:
+         {
+             "summary": {
+                 "total_duration_change_ms": float,
+                 "total_duration_change_pct": float,
+                 "node_count_change": int,
+                 "new_errors": int,
+                 "resolved_errors": int
+             },
+             "improved_nodes": [...],
+             "degraded_nodes": [...],
+             "new_nodes": [...],
+             "removed_nodes": [...],
+             "error_changes": {...}
+         }
+
+     Example:
+         before = follow_thread_hierarchy(files=["before.log"], root_identifier="req-123")
+         after = follow_thread_hierarchy(files=["after.log"], root_identifier="req-123")
+         diff = diff_hierarchies(before, after)
+         print(f"Performance change: {diff['summary']['total_duration_change_pct']:.1f}%")
+     """
+     result = {
+         "label_a": label_a,
+         "label_b": label_b,
+         "summary": {
+             "total_duration_change_ms": 0,
+             "total_duration_change_pct": 0,
+             "node_count_change": 0,
+             "new_errors": 0,
+             "resolved_errors": 0,
+         },
+         "improved_nodes": [],
+         "degraded_nodes": [],
+         "new_nodes": [],
+         "removed_nodes": [],
+         "error_changes": {
+             "new_errors": [],
+             "resolved_errors": [],
+         },
+     }
+
+     # Collect nodes from both hierarchies
+     def collect_nodes(hierarchy: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
+         nodes = {}
+
+         def walk(node: Dict[str, Any]):
+             node_id = node.get("id")
+             if node_id:
+                 nodes[node_id] = {
+                     "duration_ms": node.get("duration_ms", 0),
+                     "error_count": node.get("error_count", 0),
+                     "entry_count": node.get("entry_count", 0),
+                 }
+             for child in node.get("children", []):
+                 walk(child)
+
+         for root in hierarchy.get("roots", []):
+             walk(root)
+
+         return nodes
+
+     nodes_a = collect_nodes(hierarchy_a)
+     nodes_b = collect_nodes(hierarchy_b)
+
+     # Duration changes
+     duration_a = hierarchy_a.get("total_duration_ms", 0)
+     duration_b = hierarchy_b.get("total_duration_ms", 0)
+
+     result["summary"]["total_duration_change_ms"] = duration_b - duration_a
+     if duration_a > 0:
+         result["summary"]["total_duration_change_pct"] = (
+             (duration_b - duration_a) / duration_a * 100
+         )
+
+     # Node count changes
+     result["summary"]["node_count_change"] = len(nodes_b) - len(nodes_a)
+
+     # Compare individual nodes
+     all_node_ids = set(nodes_a.keys()) | set(nodes_b.keys())
+
+     for node_id in all_node_ids:
+         in_a = node_id in nodes_a
+         in_b = node_id in nodes_b
+
+         if in_a and not in_b:
+             result["removed_nodes"].append(
+                 {
+                     "id": node_id,
+                     "duration_ms": nodes_a[node_id]["duration_ms"],
+                 }
+             )
+         elif in_b and not in_a:
+             result["new_nodes"].append(
+                 {
+                     "id": node_id,
+                     "duration_ms": nodes_b[node_id]["duration_ms"],
+                 }
+             )
+         else:
+             # Both exist - compare
+             dur_a = nodes_a[node_id]["duration_ms"]
+             dur_b = nodes_b[node_id]["duration_ms"]
+             change_ms = dur_b - dur_a
+             change_pct = ((dur_b - dur_a) / dur_a * 100) if dur_a > 0 else 0
+
+             if change_ms < -10:  # >10ms improvement
+                 result["improved_nodes"].append(
+                     {
+                         "id": node_id,
+                         "before_ms": dur_a,
+                         "after_ms": dur_b,
+                         "change_ms": change_ms,
+                         "change_pct": change_pct,
+                     }
+                 )
+             elif change_ms > 10:  # >10ms degradation
+                 result["degraded_nodes"].append(
+                     {
+                         "id": node_id,
+                         "before_ms": dur_a,
+                         "after_ms": dur_b,
+                         "change_ms": change_ms,
+                         "change_pct": change_pct,
+                     }
+                 )
+
+             # Error changes
+             err_a = nodes_a[node_id]["error_count"]
+             err_b = nodes_b[node_id]["error_count"]
+
+             if err_a == 0 and err_b > 0:
+                 result["error_changes"]["new_errors"].append(node_id)
+                 result["summary"]["new_errors"] += 1
+             elif err_a > 0 and err_b == 0:
+                 result["error_changes"]["resolved_errors"].append(node_id)
+                 result["summary"]["resolved_errors"] += 1
+
+     # Sort by impact
+     result["improved_nodes"].sort(key=lambda x: x["change_ms"])
+     result["degraded_nodes"].sort(key=lambda x: -x["change_ms"])
+
+     return result
+
+
+ def format_hierarchy_diff(diff: Dict[str, Any]) -> str:
+     """
+     Format hierarchy diff as human-readable text.
+
+     Args:
+         diff: Diff from diff_hierarchies()
+
+     Returns:
+         Formatted diff string
+     """
+     lines = []
+
+     lines.append("=" * 70)
+     lines.append("📊 HIERARCHY COMPARISON")
+     lines.append(f" {diff['label_a']} vs {diff['label_b']}")
+     lines.append("=" * 70)
+
+     summary = diff["summary"]
+     change_ms = summary["total_duration_change_ms"]
+     change_pct = summary["total_duration_change_pct"]
+
+     direction = "⬇️ IMPROVED" if change_ms < 0 else "⬆️ DEGRADED" if change_ms > 0 else "➡️ UNCHANGED"
+     lines.append(f"\n{direction}: {abs(change_ms):.0f}ms ({abs(change_pct):.1f}%)")
+
+     if summary["new_errors"] > 0:
+         lines.append(f"❌ New errors: {summary['new_errors']}")
+     if summary["resolved_errors"] > 0:
+         lines.append(f"✅ Resolved errors: {summary['resolved_errors']}")
+
+     if diff["improved_nodes"]:
+         lines.append("\n" + "-" * 70)
+         lines.append("✅ IMPROVED NODES")
+         for node in diff["improved_nodes"][:5]:
+             lines.append(
+                 f" • {node['id']}: {node['before_ms']:.0f}ms → {node['after_ms']:.0f}ms "
+                 f"({node['change_pct']:.1f}%)"
+             )
+
+     if diff["degraded_nodes"]:
+         lines.append("\n" + "-" * 70)
+         lines.append("⚠️ DEGRADED NODES")
+         for node in diff["degraded_nodes"][:5]:
+             lines.append(
+                 f" • {node['id']}: {node['before_ms']:.0f}ms → {node['after_ms']:.0f}ms "
+                 f"(+{node['change_pct']:.1f}%)"
+             )
+
+     if diff["new_nodes"]:
+         lines.append("\n" + "-" * 70)
+         lines.append("🆕 NEW NODES")
+         for node in diff["new_nodes"][:5]:
+             lines.append(f" • {node['id']}: {node['duration_ms']:.0f}ms")
+
+     if diff["removed_nodes"]:
+         lines.append("\n" + "-" * 70)
+         lines.append("🗑️ REMOVED NODES")
+         for node in diff["removed_nodes"][:5]:
+             lines.append(f" • {node['id']}: was {node['duration_ms']:.0f}ms")
+
+     lines.append("\n" + "=" * 70)
+
+     return "\n".join(lines)
+
+
+ def export_to_jaeger(
+     hierarchy: Dict[str, Any],
+     service_name: str = "logler-export",
+ ) -> Dict[str, Any]:
+     """
+     Export hierarchy to Jaeger-compatible format.
+
+     The output follows the Jaeger JSON format and can be imported
+     into Jaeger UI for visualization.
+
+     Args:
+         hierarchy: Hierarchy from follow_thread_hierarchy()
+         service_name: Name of the service for Jaeger
+
+     Returns:
+         Dictionary in Jaeger trace format
+
+     Example:
+         hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
+         jaeger_trace = export_to_jaeger(hierarchy, service_name="my-service")
+
+         with open("trace.json", "w") as f:
+             json.dump(jaeger_trace, f)
+
+         # Import with: jaeger-query --grpc.host-port=localhost:16685
+     """
+     import uuid
+     from datetime import datetime
+
+     trace_id = uuid.uuid4().hex[:32]
+     spans = []
+
+     def convert_node(node: Dict[str, Any], parent_span_id: Optional[str] = None):
+         span_id = uuid.uuid4().hex[:16]
+
+         # Parse timestamps
+         start_time = node.get("start_time")
+         if start_time:
+             if isinstance(start_time, str):
+                 try:
+                     dt = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
+                     start_us = int(dt.timestamp() * 1_000_000)
+                 except Exception:
+                     start_us = 0
+             else:
+                 start_us = 0
+         else:
+             start_us = 0
+
+         duration_us = int((node.get("duration_ms", 0) or 0) * 1000)
+
+         span = {
+             "traceID": trace_id,
+             "spanID": span_id,
+             "operationName": node.get("id", "unknown"),
+             "references": [],
+             "startTime": start_us,
+             "duration": duration_us,
+             "tags": [
+                 {"key": "node_type", "type": "string", "value": node.get("node_type", "unknown")},
+                 {"key": "entry_count", "type": "int64", "value": node.get("entry_count", 0)},
+                 {"key": "error_count", "type": "int64", "value": node.get("error_count", 0)},
+             ],
+             "logs": [],
+             "processID": "p1",
+             "warnings": [],
+         }
+
+         if parent_span_id:
+             span["references"].append(
+                 {
+                     "refType": "CHILD_OF",
+                     "traceID": trace_id,
+                     "spanID": parent_span_id,
+                 }
+             )
+
+         if node.get("error_count", 0) > 0:
+             span["tags"].append({"key": "error", "type": "bool", "value": True})
+
+         spans.append(span)
+
+         # Process children
+         for child in node.get("children", []):
+             convert_node(child, span_id)
+
+     # Convert all roots
+     for root in hierarchy.get("roots", []):
+         convert_node(root)
+
+     return {
+         "data": [
+             {
+                 "traceID": trace_id,
+                 "spans": spans,
+                 "processes": {
+                     "p1": {
+                         "serviceName": service_name,
+                         "tags": [
+                             {"key": "exported_by", "type": "string", "value": "logler"},
+                         ],
+                     }
+                 },
+                 "warnings": [],
+             }
+         ]
+     }
+
+
1751
+ def export_to_zipkin(
1752
+ hierarchy: Dict[str, Any],
1753
+ service_name: str = "logler-export",
1754
+ ) -> List[Dict[str, Any]]:
1755
+ """
1756
+ Export hierarchy to Zipkin-compatible format.
1757
+
1758
+ Args:
1759
+ hierarchy: Hierarchy from follow_thread_hierarchy()
1760
+ service_name: Name of the service
1761
+
1762
+ Returns:
1763
+ List of spans in Zipkin V2 format
1764
+
1765
+ Example:
1766
+ hierarchy = follow_thread_hierarchy(files=["app.log"], root_identifier="req-123")
1767
+ zipkin_spans = export_to_zipkin(hierarchy)
1768
+
1769
+ # POST to Zipkin: curl -X POST http://localhost:9411/api/v2/spans -H 'Content-Type: application/json' --data-binary @spans.json
1770
+ """
1771
+ import uuid
1772
+
1774
+ trace_id = uuid.uuid4().hex[:32]
1775
+ spans = []
1776
+
1777
+ def convert_node(node: Dict[str, Any], parent_id: Optional[str] = None):
1778
+ span_id = uuid.uuid4().hex[:16]
1779
+
1780
+ # Parse timestamp
1781
+ start_time = node.get("start_time")
1782
+ timestamp_us = 0
1783
+ if start_time:
1784
+ if isinstance(start_time, str):
1785
+ try:
1786
+ dt = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
1787
+ timestamp_us = int(dt.timestamp() * 1_000_000)
1788
+ except Exception:
1789
+ pass
1790
+
1791
+ duration_us = int((node.get("duration_ms", 0) or 0) * 1000)
1792
+
1793
+ span = {
1794
+ "traceId": trace_id,
1795
+ "id": span_id,
1796
+ "name": node.get("id", "unknown"),
1797
+ "timestamp": timestamp_us,
1798
+ "duration": duration_us,
1799
+ "localEndpoint": {
1800
+ "serviceName": service_name,
1801
+ },
1802
+ "tags": {
1803
+ "node_type": node.get("node_type", "unknown"),
1804
+ "entry_count": str(node.get("entry_count", 0)),
1805
+ },
1806
+ }
1807
+
1808
+ if parent_id:
1809
+ span["parentId"] = parent_id
1810
+
1811
+ if node.get("error_count", 0) > 0:
1812
+ span["tags"]["error"] = "true"
1813
+
1814
+ spans.append(span)
1815
+
1816
+ for child in node.get("children", []):
1817
+ convert_node(child, span_id)
1818
+
1819
+ for root in hierarchy.get("roots", []):
1820
+ convert_node(root)
1821
+
1822
+ return spans
1823
+
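+ # A minimal sketch of shipping exported spans to a Zipkin collector over HTTP.
+ # The endpoint is Zipkin's standard v2 ingestion API; the helper itself is
+ # illustrative and not part of the module's public API.
+ def _example_post_spans_to_zipkin(spans, zipkin_url="http://localhost:9411/api/v2/spans"):
+     import json
+     import urllib.request
+
+     # Zipkin expects a JSON array of spans with Content-Type application/json
+     request = urllib.request.Request(
+         zipkin_url,
+         data=json.dumps(spans).encode("utf-8"),
+         headers={"Content-Type": "application/json"},
+         method="POST",
+     )
+     with urllib.request.urlopen(request) as response:
+         return response.status  # Zipkin answers 202 Accepted on success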
1824
+
1825
+ def find_patterns(
1826
+ files: List[str],
1827
+ min_occurrences: int = 3,
1828
+ parser_format: Optional[str] = None,
1829
+ custom_regex: Optional[str] = None,
1830
+ ) -> Dict[str, Any]:
1831
+ """
1832
+ Find repeated patterns and anomalies in logs.
1833
+
1834
+ Args:
1835
+ files: List of log file paths
1836
+ min_occurrences: Minimum number of occurrences to consider a pattern
+ parser_format: Optional parser format override used when loading the files
+ custom_regex: Optional custom regex used to extract fields from plain-text lines
1837
+
1838
+ Returns:
1839
+ Dictionary with patterns:
1840
+ {
1841
+ "patterns": [
1842
+ {
1843
+ "pattern": "...",
1844
+ "occurrences": 15,
1845
+ "first_seen": "...",
1846
+ "last_seen": "...",
1847
+ "affected_threads": [...],
1848
+ "examples": [...]
1849
+ }
1850
+ ]
1851
+ }
1852
+ """
1853
+ if not RUST_AVAILABLE:
1854
+ raise RuntimeError("Rust backend not available")
1855
+
1856
+ if parser_format or custom_regex:
1857
+ inv = Investigator()
1858
+ inv.load_files(files, parser_format=parser_format, custom_regex=custom_regex)
1859
+ return inv.find_patterns(min_occurrences=min_occurrences)
1860
+
1861
+ result_json = logler_rs.find_patterns(files, min_occurrences)
1862
+ result = json.loads(result_json)
1863
+ _normalize_pattern_examples(result)
1864
+ _apply_custom_regex_to_results(result, custom_regex)
1865
+ return result
1866
+
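+ # Illustrative usage sketch for find_patterns (the file name is hypothetical):
+ # list the most frequent repeated patterns first.
+ def _example_summarize_patterns():
+     result = find_patterns(files=["app.log"], min_occurrences=5)
+     ranked = sorted(result["patterns"], key=lambda p: p["occurrences"], reverse=True)
+     for pattern in ranked:
+         print(f"{pattern['occurrences']:5d}x  {pattern['pattern'][:80]}")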
1867
+
1868
+ def get_metadata(files: List[str]) -> List[Dict[str, Any]]:
1869
+ """
1870
+ Get metadata about log files.
1871
+
1872
+ Args:
1873
+ files: List of log file paths
1874
+
1875
+ Returns:
1876
+ List of file metadata:
1877
+ [
1878
+ {
1879
+ "path": "...",
1880
+ "size_bytes": 12345,
1881
+ "lines": 5000,
1882
+ "format": "json",
1883
+ "time_range": {...},
1884
+ "available_fields": [...],
1885
+ "unique_threads": 8,
1886
+ "unique_correlation_ids": 123,
1887
+ "log_levels": {...}
1888
+ }
1889
+ ]
1890
+ """
1891
+ if not RUST_AVAILABLE:
1892
+ raise RuntimeError("Rust backend not available")
1893
+
1894
+ result_json = logler_rs.get_metadata(files)
1895
+ return json.loads(result_json)
1896
+
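+ # Illustrative sketch: a quick per-file overview before a deeper investigation.
+ # Field names follow the structure documented above; file names are hypothetical.
+ def _example_print_metadata():
+     for meta in get_metadata(["app.log", "api.log"]):
+         print(f"{meta['path']}: {meta['lines']} lines, format={meta['format']}")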
1897
+
1898
+ # Advanced API using Investigator class
1899
+ class Investigator:
1900
+ """
1901
+ Advanced investigation API with persistent index.
1902
+
1903
+ Use this when you need to perform multiple operations on the same files
1904
+ for better performance.
1905
+
1906
+ Example:
1907
+ investigator = Investigator()
1908
+ investigator.load_files(["app.log", "api.log"])
1909
+
1910
+ results = investigator.search(query="error", limit=10)
1911
+ patterns = investigator.find_patterns(min_occurrences=5)
1912
+ metadata = investigator.get_metadata()
1913
+ """
1914
+
1915
+ def __init__(self):
1916
+ if not RUST_AVAILABLE:
1917
+ raise RuntimeError("Rust backend not available")
1918
+ self._investigator = logler_rs.PyInvestigator()
1919
+ self._files = []
1920
+ self._custom_regex = None
1921
+
1922
+ def load_files(
1923
+ self,
1924
+ files: List[str],
1925
+ parser_format: Optional[str] = None,
1926
+ custom_regex: Optional[str] = None,
1927
+ ):
1928
+ """Load log files and build index."""
1929
+ _load_files_with_config(self._investigator, files, parser_format, custom_regex)
1930
+ self._files = files
1931
+ self._custom_regex = custom_regex
1932
+
1933
+ def search(
1934
+ self,
1935
+ query: Optional[str] = None,
1936
+ level: Optional[str] = None,
1937
+ thread_id: Optional[str] = None,
1938
+ correlation_id: Optional[str] = None,
1939
+ limit: Optional[int] = None,
1940
+ context_lines: int = 3,
1941
+ ) -> Dict[str, Any]:
1942
+ """Search loaded files."""
1943
+ filters = {"levels": []}
1944
+ if level:
1945
+ filters["levels"] = [level.upper()]
1946
+ if thread_id:
1947
+ filters["thread_id"] = thread_id
1948
+ if correlation_id:
1949
+ filters["correlation_id"] = correlation_id
1950
+
1951
+ query_dict = {
1952
+ "files": self._files,
1953
+ "query": query,
1954
+ "filters": filters,
1955
+ "limit": limit,
1956
+ "context_lines": context_lines,
1957
+ }
1958
+
1959
+ result_json = self._investigator.search(json.dumps(query_dict))
1960
+ result = json.loads(result_json)
1961
+ _normalize_search_result_levels(result)
1962
+ _apply_custom_regex_to_results(result, self._custom_regex)
1963
+ return result
1964
+
1965
+ def follow_thread(
1966
+ self,
1967
+ thread_id: Optional[str] = None,
1968
+ correlation_id: Optional[str] = None,
1969
+ trace_id: Optional[str] = None,
1970
+ ) -> Dict[str, Any]:
1971
+ """Follow thread in loaded files."""
1972
+ result_json = self._investigator.follow_thread(
1973
+ self._files, thread_id, correlation_id, trace_id
1974
+ )
1975
+ result = json.loads(result_json)
1976
+ _normalize_entries(result.get("entries", []))
1977
+ return result
1978
+
1979
+ def find_patterns(self, min_occurrences: int = 3) -> Dict[str, Any]:
1980
+ """Find patterns in loaded files."""
1981
+ result_json = self._investigator.find_patterns(self._files, min_occurrences)
1982
+ result = json.loads(result_json)
1983
+ _normalize_pattern_examples(result)
1984
+ return result
1985
+
1986
+ def get_metadata(self) -> List[Dict[str, Any]]:
1987
+ """Get metadata for loaded files."""
1988
+ result_json = self._investigator.get_metadata(self._files)
1989
+ return json.loads(result_json)
1990
+
1991
+ def get_context(
1992
+ self,
1993
+ file: str,
1994
+ line_number: int,
1995
+ lines_before: int = 10,
1996
+ lines_after: int = 10,
1997
+ ) -> Dict[str, Any]:
1998
+ """Get context around a line."""
1999
+ result_json = self._investigator.get_context(
2000
+ file, line_number, lines_before, lines_after, False
2001
+ )
2002
+ result = json.loads(result_json)
2003
+ _normalize_context_payload(result)
2004
+ return result
2005
+
2006
+ def sql_query(self, query: str) -> List[Dict[str, Any]]:
2007
+ """
2008
+ Execute SQL query on loaded logs (requires 'sql' feature).
2009
+
2010
+ Args:
2011
+ query: SQL query string
2012
+
2013
+ Returns:
2014
+ List of result rows as dictionaries
2015
+
2016
+ Example:
2017
+ results = investigator.sql_query(\"\"\"
2018
+ SELECT level, COUNT(*) as count
2019
+ FROM logs
2020
+ GROUP BY level
2021
+ ORDER BY count DESC
2022
+ \"\"\")
2023
+ """
2024
+ if not hasattr(self._investigator, "sql_query"):
2025
+ raise RuntimeError("SQL feature not available. Build with --features sql")
2026
+ result_json = self._investigator.sql_query(query)
2027
+ return json.loads(result_json)
2028
+
2029
+ def sql_tables(self) -> List[str]:
2030
+ """Get list of available SQL tables (requires 'sql' feature)."""
2031
+ if not hasattr(self._investigator, "sql_tables"):
2032
+ raise RuntimeError("SQL feature not available. Build with --features sql")
2033
+ return self._investigator.sql_tables()
2034
+
2035
+ def sql_schema(self, table: str) -> List[Dict[str, Any]]:
2036
+ """Get schema for a SQL table (requires 'sql' feature)."""
2037
+ if not hasattr(self._investigator, "sql_schema"):
2038
+ raise RuntimeError("SQL feature not available. Build with --features sql")
2039
+ result_json = self._investigator.sql_schema(table)
2040
+ return json.loads(result_json)
2041
+
2042
+ def build_hierarchy(
2043
+ self,
2044
+ root_identifier: str,
2045
+ max_depth: Optional[int] = None,
2046
+ use_naming_patterns: bool = True,
2047
+ use_temporal_inference: bool = True,
2048
+ min_confidence: float = 0.0,
2049
+ ) -> Dict[str, Any]:
2050
+ """
2051
+ Build hierarchical tree of threads/spans from loaded files.
2052
+
2053
+ Args:
2054
+ root_identifier: Root thread ID, correlation ID, or span ID
2055
+ max_depth: Maximum depth of hierarchy tree
2056
+ use_naming_patterns: Enable naming pattern detection
2057
+ use_temporal_inference: Enable time-based inference
2058
+ min_confidence: Minimum confidence score (0.0-1.0)
2059
+
2060
+ Returns:
2061
+ Hierarchy dictionary (see follow_thread_hierarchy for structure)
2062
+
2063
+ Example:
2064
+ inv = Investigator()
2065
+ inv.load_files(["app.log"])
2066
+ hierarchy = inv.build_hierarchy(root_identifier="req-123")
2067
+ summary = get_hierarchy_summary(hierarchy)
2068
+ print(summary)
2069
+ """
2070
+ result_json = self._investigator.build_hierarchy(
2071
+ self._files,
2072
+ root_identifier,
2073
+ max_depth,
2074
+ use_naming_patterns,
2075
+ use_temporal_inference,
2076
+ min_confidence,
2077
+ )
2078
+ return json.loads(result_json)
2079
+
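+ # Illustrative sketch: reuse a single Investigator index for several operations
+ # so the files are parsed only once. The file name and the custom regex (with
+ # named capture groups) are hypothetical.
+ def _example_investigator_session():
+     inv = Investigator()
+     inv.load_files(
+         ["app.log"],
+         custom_regex=r"^(?P<timestamp>\S+) (?P<level>\w+) (?P<message>.*)$",
+     )
+     errors = inv.search(level="ERROR", limit=20)
+     for item in errors.get("results", []):
+         thread = item["entry"].get("thread_id")
+         if thread:
+             timeline = inv.follow_thread(thread_id=thread)
+             print(thread, len(timeline.get("entries", [])))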
2080
+
2081
+ # Advanced LLM-optimized features
2082
+
2083
+
2084
+ def cross_service_timeline(
2085
+ files: Dict[str, List[str]],
2086
+ time_window: Optional[Tuple[str, str]] = None,
2087
+ correlation_id: Optional[str] = None,
2088
+ trace_id: Optional[str] = None,
2089
+ limit: Optional[int] = None,
2090
+ parser_format: Optional[str] = None,
2091
+ custom_regex: Optional[str] = None,
2092
+ ) -> Dict[str, Any]:
2093
+ """
2094
+ Create a unified timeline across multiple services/log files.
2095
+
2096
+ This is perfect for investigating distributed systems where a single
2097
+ request flows through multiple services (API Gateway → Auth → Database → Cache).
2098
+
2099
+ Args:
2100
+ files: Dictionary mapping service names to log file lists
2101
+ e.g., {"api": ["api.log"], "database": ["db.log"], "cache": ["cache.log"]}
2102
+ time_window: Optional tuple of (start_time, end_time) in ISO format
2103
+ correlation_id: Filter to specific correlation ID
2104
+ trace_id: Filter to specific trace ID
2105
+ limit: Maximum number of entries to return
+ parser_format: Optional parser format override used when loading the files
+ custom_regex: Optional custom regex used to extract fields from plain-text lines
2106
+
2107
+ Returns:
2108
+ Dictionary with unified timeline:
2109
+ {
2110
+ "timeline": [
2111
+ {
2112
+ "service": "api",
2113
+ "timestamp": "2024-01-01T10:30:15.123Z",
2114
+ "entry": {...},
2115
+ "relative_time_ms": 0
2116
+ },
2117
+ {
2118
+ "service": "database",
2119
+ "timestamp": "2024-01-01T10:30:15.456Z",
2120
+ "entry": {...},
2121
+ "relative_time_ms": 333
2122
+ },
2123
+ ...
2124
+ ],
2125
+ "services": ["api", "database", "cache"],
2126
+ "total_entries": 42,
2127
+ "duration_ms": 1523,
2128
+ "service_breakdown": {
2129
+ "api": 15,
2130
+ "database": 20,
2131
+ "cache": 7
2132
+ }
2133
+ }
2134
+
2135
+ Example:
2136
+ # Investigate a failed request across services
2137
+ timeline = cross_service_timeline(
2138
+ files={
2139
+ "api": ["logs/api.log"],
2140
+ "auth": ["logs/auth.log"],
2141
+ "db": ["logs/db.log"]
2142
+ },
2143
+ correlation_id="req-12345"
2144
+ )
2145
+
2146
+ # See the flow
2147
+ for entry in timeline['timeline']:
2148
+ print(f"[{entry['service']:10s}] +{entry['relative_time_ms']:4d}ms: {entry['entry']['message']}")
2149
+ """
2150
+ if not RUST_AVAILABLE:
2151
+ raise RuntimeError("Rust backend not available")
2152
+
2153
+ # Collect entries from all services
2154
+ all_entries = []
2155
+ service_counts = defaultdict(int)
2156
+
2157
+ for service_name, service_files in files.items():
2158
+ if correlation_id:
2159
+ result = follow_thread(service_files, correlation_id=correlation_id, trace_id=trace_id)
2160
+ entries = result.get("entries", [])
2161
+ elif trace_id:
2162
+ result = follow_thread(service_files, trace_id=trace_id)
2163
+ entries = result.get("entries", [])
2164
+ else:
2165
+ # Get all entries
2166
+ result = search(
2167
+ service_files, limit=None, parser_format=parser_format, custom_regex=custom_regex
2168
+ )
2169
+ entries = [r["entry"] for r in result.get("results", [])]
2170
+
2171
+ # Add service label to each entry
2172
+ for entry in entries:
2173
+ # Parse timestamp if present
2174
+ timestamp_str = entry.get("timestamp")
2175
+ if timestamp_str:
2176
+ try:
2177
+ timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
2178
+ except (ValueError, TypeError):
2179
+ timestamp = None
2180
+ else:
2181
+ timestamp = None
2182
+
2183
+ all_entries.append(
2184
+ {
2185
+ "service": service_name,
2186
+ "timestamp": timestamp,
2187
+ "timestamp_str": timestamp_str,
2188
+ "entry": entry,
2189
+ }
2190
+ )
2191
+ service_counts[service_name] += 1
2192
+
2193
+ # Filter by time window if specified
2194
+ if time_window:
2195
+ start_time, end_time = time_window
2196
+ try:
2197
+ start_dt = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
2198
+ end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00"))
2199
+ all_entries = [
2200
+ e for e in all_entries if e["timestamp"] and start_dt <= e["timestamp"] <= end_dt
2201
+ ]
2202
+ except Exception as e:
2203
+ warnings.warn(f"Could not parse time window: {e}", stacklevel=2)
2204
+
2205
+ # Sort by timestamp; entries without timestamps sort last. Comparing naive
+ # datetime.min against timezone-aware timestamps would raise TypeError, so
+ # the key compares a missing-timestamp flag first.
2206
+ all_entries.sort(key=lambda e: (e["timestamp"] is None, e["timestamp"] or datetime.min))
2207
+
2208
+ # Calculate relative times
2209
+ if all_entries and all_entries[0]["timestamp"]:
2210
+ start_time = all_entries[0]["timestamp"]
2211
+ for entry in all_entries:
2212
+ if entry["timestamp"]:
2213
+ delta = entry["timestamp"] - start_time
2214
+ entry["relative_time_ms"] = int(delta.total_seconds() * 1000)
2215
+ else:
2216
+ entry["relative_time_ms"] = None
2217
+ else:
2218
+ for entry in all_entries:
2219
+ entry["relative_time_ms"] = None
2220
+
2221
+ # Apply limit if specified
2222
+ if limit:
2223
+ all_entries = all_entries[:limit]
2224
+
2225
+ # Calculate duration
2226
+ duration_ms = None
2227
+ if len(all_entries) >= 2 and all_entries[0]["timestamp"] and all_entries[-1]["timestamp"]:
2228
+ duration = all_entries[-1]["timestamp"] - all_entries[0]["timestamp"]
2229
+ duration_ms = int(duration.total_seconds() * 1000)
2230
+
2231
+ # Clean up entries for output (remove internal timestamp objects)
2232
+ timeline = []
2233
+ for e in all_entries:
2234
+ timeline.append(
2235
+ {
2236
+ "service": e["service"],
2237
+ "timestamp": e["timestamp_str"],
2238
+ "entry": e["entry"],
2239
+ "relative_time_ms": e["relative_time_ms"],
2240
+ }
2241
+ )
2242
+
2243
+ return {
2244
+ "timeline": timeline,
2245
+ "services": list(files.keys()),
2246
+ "total_entries": len(timeline),
2247
+ "duration_ms": duration_ms,
2248
+ "service_breakdown": dict(service_counts),
2249
+ }
2250
+
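+ # Illustrative sketch: narrow a cross-service timeline to one incident window
+ # instead of filtering by correlation ID. Paths and times are hypothetical.
+ def _example_incident_window():
+     timeline = cross_service_timeline(
+         files={"api": ["logs/api.log"], "db": ["logs/db.log"]},
+         time_window=("2024-01-01T10:30:00Z", "2024-01-01T10:35:00Z"),
+     )
+     for item in timeline["timeline"]:
+         print(f"[{item['service']}] {item['timestamp']} {item['entry'].get('message', '')}")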
2251
+
2252
+ def compare_threads(
2253
+ files: List[str],
2254
+ thread_a: Optional[str] = None,
2255
+ thread_b: Optional[str] = None,
2256
+ correlation_a: Optional[str] = None,
2257
+ correlation_b: Optional[str] = None,
2258
+ trace_a: Optional[str] = None,
2259
+ trace_b: Optional[str] = None,
2260
+ ) -> Dict[str, Any]:
2261
+ """
2262
+ Compare two threads/requests to find differences.
2263
+
2264
+ Perfect for root cause analysis: "What's different between the successful
2265
+ request and the failed one?"
2266
+
2267
+ Args:
2268
+ files: List of log file paths
2269
+ thread_a: First thread ID to compare
2270
+ thread_b: Second thread ID to compare
2271
+ correlation_a: First correlation ID to compare
2272
+ correlation_b: Second correlation ID to compare
2273
+ trace_a: First trace ID to compare
2274
+ trace_b: Second trace ID to compare
2275
+
2276
+ Returns:
2277
+ Dictionary with comparison:
2278
+ {
2279
+ "thread_a": {
2280
+ "id": "...",
2281
+ "entries": [...],
2282
+ "duration_ms": 1523,
2283
+ "error_count": 0,
2284
+ "log_levels": {"INFO": 15, "ERROR": 0},
2285
+ "unique_messages": 15,
2286
+ "services": [...]
2287
+ },
2288
+ "thread_b": {...},
2289
+ "differences": {
2290
+ "duration_diff_ms": 2341, # B took 2341ms longer
2291
+ "error_diff": 5, # B had 5 more errors
2292
+ "only_in_a": ["cache hit", ...], # Messages only in A
2293
+ "only_in_b": ["cache miss", "timeout", ...], # Messages only in B
2294
+ "level_changes": {"ERROR": +5, "WARN": +2}
2295
+ },
2296
+ "summary": "Thread B took 2.3s longer and had 5 errors (cache miss, timeout)"
2297
+ }
2298
+
2299
+ Example:
2300
+ # Compare successful vs failed request
2301
+ diff = compare_threads(
2302
+ files=["app.log"],
2303
+ correlation_a="req-success-123",
2304
+ correlation_b="req-failed-456"
2305
+ )
2306
+ print(diff['summary'])
2307
+ """
2308
+ if not RUST_AVAILABLE:
2309
+ raise RuntimeError("Rust backend not available")
2310
+
2311
+ # Get both threads
2312
+ timeline_a = follow_thread(
2313
+ files, thread_id=thread_a, correlation_id=correlation_a, trace_id=trace_a
2314
+ )
2315
+ timeline_b = follow_thread(
2316
+ files, thread_id=thread_b, correlation_id=correlation_b, trace_id=trace_b
2317
+ )
2318
+
2319
+ # Analyze thread A
2320
+ entries_a = timeline_a.get("entries", [])
2321
+ analysis_a = _analyze_thread(entries_a, thread_a or correlation_a or trace_a or "Thread A")
2322
+
2323
+ # Analyze thread B
2324
+ entries_b = timeline_b.get("entries", [])
2325
+ analysis_b = _analyze_thread(entries_b, thread_b or correlation_b or trace_b or "Thread B")
2326
+
2327
+ # Compare
2328
+ differences = _compute_differences(analysis_a, analysis_b)
2329
+
2330
+ # Generate summary
2331
+ summary = _generate_comparison_summary(analysis_a, analysis_b, differences)
2332
+
2333
+ return {
2334
+ "thread_a": analysis_a,
2335
+ "thread_b": analysis_b,
2336
+ "differences": differences,
2337
+ "summary": summary,
2338
+ }
2339
+
2340
+
2341
+ def compare_time_periods(
2342
+ files: List[str],
2343
+ period_a_start: str,
2344
+ period_a_end: str,
2345
+ period_b_start: str,
2346
+ period_b_end: str,
2347
+ ) -> Dict[str, Any]:
2348
+ """
2349
+ Compare two time periods to find what changed.
2350
+
2351
+ Perfect for questions like: "What changed after the deployment?"
2352
+ or "Why did error rates spike at 3pm?"
2353
+
2354
+ Args:
2355
+ files: List of log file paths
2356
+ period_a_start: Start time for period A (ISO format)
2357
+ period_a_end: End time for period A (ISO format)
2358
+ period_b_start: Start time for period B (ISO format)
2359
+ period_b_end: End time for period B (ISO format)
2360
+
2361
+ Returns:
2362
+ Dictionary with comparison:
2363
+ {
2364
+ "period_a": {
2365
+ "start": "...",
2366
+ "end": "...",
2367
+ "total_logs": 1523,
2368
+ "error_rate": 0.02,
2369
+ "log_levels": {...},
2370
+ "top_errors": [...],
2371
+ "unique_threads": 45
2372
+ },
2373
+ "period_b": {...},
2374
+ "changes": {
2375
+ "log_volume_change_pct": 150, # 150% increase
2376
+ "error_rate_change": 10.5, # 10.5x more errors
2377
+ "new_errors": ["OutOfMemoryError", ...],
2378
+ "resolved_errors": [],
2379
+ "new_threads": 23
2380
+ },
2381
+ "summary": "Period B had 150% more logs and 10.5x error rate. New errors: OutOfMemoryError"
2382
+ }
2383
+
2384
+ Example:
2385
+ # Compare before/after deployment
2386
+ diff = compare_time_periods(
2387
+ files=["app.log"],
2388
+ period_a_start="2024-01-01T14:00:00Z",
2389
+ period_a_end="2024-01-01T15:00:00Z",
2390
+ period_b_start="2024-01-01T15:00:00Z",
2391
+ period_b_end="2024-01-01T16:00:00Z"
2392
+ )
2393
+ print(diff['summary'])
2394
+ """
2395
+ if not RUST_AVAILABLE:
2396
+ raise RuntimeError("Rust backend not available")
2397
+
2398
+ # Search each period
2399
+ # Period A
2400
+ inv = Investigator()
2401
+ inv.load_files(files)
2402
+
2403
+ results_a = search(files, limit=None)
2404
+ results_b = search(files, limit=None)
2405
+
2406
+ # Filter by time
2407
+ entries_a = [
2408
+ r["entry"]
2409
+ for r in results_a.get("results", [])
2410
+ if _in_time_range(r["entry"], period_a_start, period_a_end)
2411
+ ]
2412
+ entries_b = [
2413
+ r["entry"]
2414
+ for r in results_b.get("results", [])
2415
+ if _in_time_range(r["entry"], period_b_start, period_b_end)
2416
+ ]
2417
+
2418
+ # Analyze periods
2419
+ analysis_a = _analyze_period(entries_a, period_a_start, period_a_end)
2420
+ analysis_b = _analyze_period(entries_b, period_b_start, period_b_end)
2421
+
2422
+ # Compute changes
2423
+ changes = _compute_period_changes(analysis_a, analysis_b)
2424
+
2425
+ # Generate summary
2426
+ summary = _generate_period_summary(analysis_a, analysis_b, changes)
2427
+
2428
+ return {"period_a": analysis_a, "period_b": analysis_b, "changes": changes, "summary": summary}
2429
+
2430
+
2431
+ # Helper functions for comparison
2432
+
2433
+
2434
+ def _analyze_thread(entries: List[Dict], thread_id: str) -> Dict[str, Any]:
2435
+ """Analyze a single thread's entries"""
2436
+ if not entries:
2437
+ return {
2438
+ "id": thread_id,
2439
+ "entries": [],
2440
+ "duration_ms": 0,
2441
+ "error_count": 0,
2442
+ "log_levels": {},
2443
+ "unique_messages": 0,
2444
+ "messages": [],
2445
+ "services": [],
2446
+ }
2447
+
2448
+ # Count log levels
2449
+ level_counts = defaultdict(int)
2450
+ error_count = 0
2451
+ messages = []
2452
+ services = set()
2453
+
2454
+ for entry in entries:
2455
+ level = entry.get("level", "INFO")
2456
+ level_counts[level] += 1
2457
+ if level in ["ERROR", "FATAL"]:
2458
+ error_count += 1
2459
+
2460
+ message = entry.get("message", "")
2461
+ messages.append(message)
2462
+
2463
+ service = entry.get("service") or entry.get("service_name")
2464
+ if service:
2465
+ services.add(service)
2466
+
2467
+ # Calculate duration
2468
+ duration_ms = 0
2469
+ if len(entries) >= 2:
2470
+ try:
2471
+ start = datetime.fromisoformat(entries[0].get("timestamp", "").replace("Z", "+00:00"))
2472
+ end = datetime.fromisoformat(entries[-1].get("timestamp", "").replace("Z", "+00:00"))
2473
+ duration_ms = int((end - start).total_seconds() * 1000)
2474
+ except (ValueError, TypeError, AttributeError):
2475
+ pass # Skip if timestamps are missing or invalid
2476
+
2477
+ return {
2478
+ "id": thread_id,
2479
+ "entries": entries,
2480
+ "entry_count": len(entries),
2481
+ "duration_ms": duration_ms,
2482
+ "error_count": error_count,
2483
+ "log_levels": dict(level_counts),
2484
+ "unique_messages": len(set(messages)),
2485
+ "messages": messages,
2486
+ "services": list(services),
2487
+ }
2488
+
2489
+
2490
+ def _compute_differences(analysis_a: Dict, analysis_b: Dict) -> Dict[str, Any]:
2491
+ """Compute differences between two thread analyses"""
2492
+ # Duration difference
2493
+ duration_diff_ms = analysis_b["duration_ms"] - analysis_a["duration_ms"]
2494
+
2495
+ # Error difference
2496
+ error_diff = analysis_b["error_count"] - analysis_a["error_count"]
2497
+
2498
+ # Message differences
2499
+ messages_a = set(analysis_a["messages"])
2500
+ messages_b = set(analysis_b["messages"])
2501
+ only_in_a = list(messages_a - messages_b)
2502
+ only_in_b = list(messages_b - messages_a)
2503
+
2504
+ # Log level changes
2505
+ level_changes = {}
2506
+ all_levels = set(list(analysis_a["log_levels"].keys()) + list(analysis_b["log_levels"].keys()))
2507
+ for level in all_levels:
2508
+ count_a = analysis_a["log_levels"].get(level, 0)
2509
+ count_b = analysis_b["log_levels"].get(level, 0)
2510
+ if count_a != count_b:
2511
+ level_changes[level] = count_b - count_a
2512
+
2513
+ return {
2514
+ "duration_diff_ms": duration_diff_ms,
2515
+ "error_diff": error_diff,
2516
+ "only_in_a": only_in_a[:10], # Limit to 10
2517
+ "only_in_b": only_in_b[:10],
2518
+ "level_changes": level_changes,
2519
+ "entry_count_diff": analysis_b["entry_count"] - analysis_a["entry_count"],
2520
+ }
2521
+
2522
+
2523
+ def _generate_comparison_summary(analysis_a: Dict, analysis_b: Dict, differences: Dict) -> str:
2524
+ """Generate human-readable summary of comparison"""
2525
+ parts = []
2526
+
2527
+ # Duration
2528
+ duration_diff = differences["duration_diff_ms"]
2529
+ if abs(duration_diff) > 100:
2530
+ if duration_diff > 0:
2531
+ parts.append(f"Thread B took {duration_diff}ms longer")
2532
+ else:
2533
+ parts.append(f"Thread B was {-duration_diff}ms faster")
2534
+
2535
+ # Errors
2536
+ error_diff = differences["error_diff"]
2537
+ if error_diff > 0:
2538
+ parts.append(f"Thread B had {error_diff} more error(s)")
2539
+ if differences["only_in_b"]:
2540
+ examples = differences["only_in_b"][:3]
2541
+ parts.append(f"including: {', '.join(examples)}")
2542
+ elif error_diff < 0:
2543
+ parts.append(f"Thread B had {-error_diff} fewer error(s)")
2544
+
2545
+ # New messages in B
2546
+ if differences["only_in_b"] and error_diff == 0:
2547
+ parts.append(f"Thread B had unique messages: {', '.join(differences['only_in_b'][:3])}")
2548
+
2549
+ if not parts:
2550
+ parts.append("Threads are similar")
2551
+
2552
+ return ". ".join(parts)
2553
+
2554
+
2555
+ def _analyze_period(entries: List[Dict], start: str, end: str) -> Dict[str, Any]:
2556
+ """Analyze a time period's entries"""
2557
+ level_counts = defaultdict(int)
2558
+ error_messages = []
2559
+ threads = set()
2560
+
2561
+ for entry in entries:
2562
+ level = entry.get("level", "INFO")
2563
+ level_counts[level] += 1
2564
+
2565
+ if level in ["ERROR", "FATAL"]:
2566
+ error_messages.append(entry.get("message", ""))
2567
+
2568
+ thread = entry.get("thread_id") or entry.get("correlation_id")
2569
+ if thread:
2570
+ threads.add(thread)
2571
+
2572
+ total = len(entries)
2573
+ error_count = level_counts.get("ERROR", 0) + level_counts.get("FATAL", 0)
2574
+ error_rate = error_count / total if total > 0 else 0
2575
+
2576
+ return {
2577
+ "start": start,
2578
+ "end": end,
2579
+ "total_logs": total,
2580
+ "error_count": error_count,
2581
+ "error_rate": error_rate,
2582
+ "log_levels": dict(level_counts),
2583
+ "top_errors": list(set(error_messages))[:10],
2584
+ "unique_threads": len(threads),
2585
+ }
2586
+
2587
+
2588
+ def _compute_period_changes(analysis_a: Dict, analysis_b: Dict) -> Dict[str, Any]:
2589
+ """Compute changes between two time periods"""
2590
+ # Volume change
2591
+ if analysis_a["total_logs"] > 0:
2592
+ volume_change_pct = (
2593
+ (analysis_b["total_logs"] - analysis_a["total_logs"]) / analysis_a["total_logs"]
2594
+ ) * 100
2595
+ else:
2596
+ # Treat growth from an empty baseline as +100% rather than infinite
+ volume_change_pct = 100 if analysis_b["total_logs"] > 0 else 0
2597
+
2598
+ # Error rate change
2599
+ if analysis_a["error_rate"] > 0:
2600
+ error_rate_multiplier = analysis_b["error_rate"] / analysis_a["error_rate"]
2601
+ else:
2602
+ error_rate_multiplier = float("inf") if analysis_b["error_rate"] > 0 else 1.0
2603
+
2604
+ # New vs resolved errors
2605
+ errors_a = set(analysis_a["top_errors"])
2606
+ errors_b = set(analysis_b["top_errors"])
2607
+ new_errors = list(errors_b - errors_a)
2608
+ resolved_errors = list(errors_a - errors_b)
2609
+
2610
+ return {
2611
+ "log_volume_change_pct": volume_change_pct,
2612
+ "error_rate_multiplier": error_rate_multiplier,
2613
+ "error_count_change": analysis_b["error_count"] - analysis_a["error_count"],
2614
+ "new_errors": new_errors[:10],
2615
+ "resolved_errors": resolved_errors[:10],
2616
+ "thread_count_change": analysis_b["unique_threads"] - analysis_a["unique_threads"],
2617
+ }
2618
+
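+ # Worked example of the formulas above (hypothetical numbers): if period A has
+ # 1,000 logs at a 2% error rate and period B has 2,500 logs at an 8% error
+ # rate, then log_volume_change_pct = (2500 - 1000) / 1000 * 100 = 150.0 and
+ # error_rate_multiplier = 0.08 / 0.02 = 4.0.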
2619
+
2620
+ def _generate_period_summary(analysis_a: Dict, analysis_b: Dict, changes: Dict) -> str:
2621
+ """Generate human-readable summary of period comparison"""
2622
+ parts = []
2623
+
2624
+ # Volume
2625
+ vol_change = changes["log_volume_change_pct"]
2626
+ if abs(vol_change) > 20:
2627
+ parts.append(
2628
+ f"Log volume {'increased' if vol_change > 0 else 'decreased'} by {abs(vol_change):.1f}%"
2629
+ )
2630
+
2631
+ # Error rate
2632
+ err_mult = changes["error_rate_multiplier"]
2633
+ if err_mult > 1.5:
2634
+ parts.append(f"Error rate increased {err_mult:.1f}x")
2635
+ elif 0 < err_mult < 0.7:
2636
+ parts.append(f"Error rate decreased to {err_mult:.1f}x of baseline")
2637
+
2638
+ # New errors
2639
+ if changes["new_errors"]:
2640
+ parts.append(f"New errors: {', '.join(changes['new_errors'][:3])}")
2641
+
2642
+ if not parts:
2643
+ parts.append("Periods are similar")
2644
+
2645
+ return ". ".join(parts)
2646
+
2647
+
2648
+ def _in_time_range(entry: Dict, start: str, end: str) -> bool:
2649
+ """Check if entry timestamp is within range"""
2650
+ timestamp_str = entry.get("timestamp")
2651
+ if not timestamp_str:
2652
+ return False
2653
+
2654
+ try:
2655
+ timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
2656
+ start_dt = datetime.fromisoformat(start.replace("Z", "+00:00"))
2657
+ end_dt = datetime.fromisoformat(end.replace("Z", "+00:00"))
2658
+ return start_dt <= timestamp <= end_dt
2659
+ except (ValueError, TypeError, AttributeError):
2660
+ return False
2661
+
2662
+
2663
+ # Token-efficient output formatters
2664
+
2665
+
2666
+ def _format_as_summary(result: Dict[str, Any]) -> Dict[str, Any]:
2667
+ """
2668
+ Convert full search results to token-efficient summary format.
2669
+
2670
+ Instead of returning all log entries, groups them by message and
2671
+ provides aggregated statistics with a few examples.
2672
+ """
2673
+ results = result.get("results", [])
2674
+ if not results:
2675
+ return {
2676
+ "total_matches": 0,
2677
+ "unique_messages": 0,
2678
+ "log_levels": {},
2679
+ "top_messages": [],
2680
+ "sample_entries": [],
2681
+ }
2682
+
2683
+ # Group by message
2684
+ message_groups = defaultdict(
2685
+ lambda: {
2686
+ "count": 0,
2687
+ "first_seen": None,
2688
+ "last_seen": None,
2689
+ "levels": defaultdict(int),
2690
+ "examples": [],
2691
+ }
2692
+ )
2693
+
2694
+ level_counts = defaultdict(int)
2695
+ file_counts = defaultdict(int)
2696
+
2697
+ for item in results:
2698
+ entry = item.get("entry", {})
2699
+ message = entry.get("message", "").strip()
2700
+ level = entry.get("level", "INFO")
2701
+ timestamp = entry.get("timestamp")
2702
+ file_path = entry.get("file", "")
2703
+
2704
+ # Update level counts
2705
+ level_counts[level] += 1
2706
+ file_counts[file_path] += 1
2707
+
2708
+ # Update message group
2709
+ group = message_groups[message]
2710
+ group["count"] += 1
2711
+ group["levels"][level] += 1
2712
+
2713
+ if group["first_seen"] is None or (timestamp and timestamp < group["first_seen"]):
2714
+ group["first_seen"] = timestamp
2715
+
2716
+ if group["last_seen"] is None or (timestamp and timestamp > group["last_seen"]):
2717
+ group["last_seen"] = timestamp
2718
+
2719
+ # Keep up to 2 examples per message
2720
+ if len(group["examples"]) < 2:
2721
+ group["examples"].append(
2722
+ {
2723
+ "file": file_path,
2724
+ "line": entry.get("line_number"),
2725
+ "timestamp": timestamp,
2726
+ "level": level,
2727
+ }
2728
+ )
2729
+
2730
+ # Convert to sorted list (most frequent first)
2731
+ top_messages = []
2732
+ for message, data in sorted(message_groups.items(), key=lambda x: x[1]["count"], reverse=True)[
2733
+ :20
2734
+ ]:
2735
+ top_messages.append(
2736
+ {
2737
+ "message": message[:200], # Truncate long messages
2738
+ "count": data["count"],
2739
+ "first_seen": data["first_seen"],
2740
+ "last_seen": data["last_seen"],
2741
+ "levels": dict(data["levels"]),
2742
+ "examples": data["examples"],
2743
+ }
2744
+ )
2745
+
2746
+ # Sample entries (diverse selection)
2747
+ sample_entries = _select_diverse_samples(results, max_samples=5)
2748
+
2749
+ return {
2750
+ "total_matches": len(results),
2751
+ "unique_messages": len(message_groups),
2752
+ "log_levels": dict(level_counts),
2753
+ "by_file": dict(file_counts),
2754
+ "top_messages": top_messages,
2755
+ "sample_entries": sample_entries,
2756
+ "full_results_available": True,
2757
+ }
2758
+
2759
+
2760
+ def _format_as_count(result: Dict[str, Any]) -> Dict[str, Any]:
2761
+ """
2762
+ Convert full search results to count-only format (minimal tokens).
2763
+
2764
+ Returns only statistics, no actual log content.
2765
+ """
2766
+ results = result.get("results", [])
2767
+ if not results:
2768
+ return {"total_matches": 0, "by_level": {}, "by_file": {}, "time_range": None}
2769
+
2770
+ level_counts = defaultdict(int)
2771
+ file_counts = defaultdict(int)
2772
+ timestamps = []
2773
+
2774
+ for item in results:
2775
+ entry = item.get("entry", {})
2776
+ level = entry.get("level", "INFO")
2777
+ file_path = entry.get("file", "")
2778
+ timestamp = entry.get("timestamp")
2779
+
2780
+ level_counts[level] += 1
2781
+ file_counts[file_path] += 1
2782
+
2783
+ if timestamp:
2784
+ timestamps.append(timestamp)
2785
+
2786
+ # Time range
2787
+ time_range = None
2788
+ if timestamps:
2789
+ timestamps.sort()
2790
+ time_range = {"start": timestamps[0], "end": timestamps[-1]}
2791
+
2792
+ return {
2793
+ "total_matches": len(results),
2794
+ "by_level": dict(level_counts),
2795
+ "by_file": dict(file_counts),
2796
+ "time_range": time_range,
2797
+ }
2798
+
2799
+
2800
+ def _format_as_compact(result: Dict[str, Any]) -> Dict[str, Any]:
2801
+ """
2802
+ Convert full search results to compact format.
2803
+
2804
+ Returns only essential fields, removing raw logs and extra context.
2805
+ """
2806
+ results = result.get("results", [])
2807
+ if not results:
2808
+ return {"matches": [], "total": 0}
2809
+
2810
+ compact_matches = []
2811
+ for item in results:
2812
+ entry = item.get("entry", {})
2813
+ compact_matches.append(
2814
+ {
2815
+ "time": entry.get("timestamp"),
2816
+ "level": entry.get("level"),
2817
+ "msg": entry.get("message", "")[:150], # Truncate messages
2818
+ "thread": entry.get("thread_id") or entry.get("correlation_id"),
2819
+ "file": entry.get("file", "").split("/")[-1], # Just filename
2820
+ "line": entry.get("line_number"),
2821
+ }
2822
+ )
2823
+
2824
+ return {"matches": compact_matches, "total": len(results)}
2825
+
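+ # Illustrative sketch of picking an output format for token efficiency:
+ # "count" for a cheap overview, "summary" for grouped messages, "compact" for
+ # trimmed per-entry rows. This assumes search() accepts the output_format
+ # keyword used elsewhere in this module; the file name is hypothetical.
+ def _example_output_formats():
+     overview = search(["app.log"], query="timeout", output_format="count")
+     if overview["total_matches"] > 0:
+         details = search(["app.log"], query="timeout", output_format="compact")
+         for match in details["matches"][:10]:
+             print(match["time"], match["level"], match["msg"])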
2826
+
2827
+ def _select_diverse_samples(results: List[Dict], max_samples: int = 5) -> List[Dict]:
2828
+ """
2829
+ Select a diverse set of sample entries.
2830
+
2831
+ Tries to include:
2832
+ - First and last entry
2833
+ - Different log levels
2834
+ - Different files
2835
+ - Errors if present
2836
+ """
2837
+ if not results:
2838
+ return []
2839
+
2840
+ if len(results) <= max_samples:
2841
+ return [r.get("entry", {}) for r in results]
2842
+
2843
+ samples = []
2844
+ indices_used = set()
2845
+
2846
+ # Always include first and last
2847
+ samples.append(results[0].get("entry", {}))
2848
+ indices_used.add(0)
2849
+
2850
+ if len(results) > 1:
2851
+ samples.append(results[-1].get("entry", {}))
2852
+ indices_used.add(len(results) - 1)
2853
+
2854
+ # Find first error
2855
+ for i, item in enumerate(results):
2856
+ if i in indices_used:
2857
+ continue
2858
+ entry = item.get("entry", {})
2859
+ if entry.get("level") in ["ERROR", "FATAL"]:
2860
+ samples.append(entry)
2861
+ indices_used.add(i)
2862
+ break
2863
+
2864
+ # Fill remaining slots with evenly spaced entries
2865
+ remaining = max_samples - len(samples)
2866
+ if remaining > 0 and len(results) > len(indices_used):
2867
+ step = len(results) // (remaining + 1)
2868
+ for i in range(1, remaining + 1):
2869
+ idx = min(i * step, len(results) - 1)
2870
+ if idx not in indices_used:
2871
+ samples.append(results[idx].get("entry", {}))
2872
+ indices_used.add(idx)
2873
+
2874
+ return samples[:max_samples]
2875
+
2876
+
2877
+ # Investigation Session Management
2878
+
2879
+
2880
+ class InvestigationSession:
2881
+ """
2882
+ Track investigation state and history for multi-step analysis.
2883
+
2884
+ This allows LLMs to:
2885
+ - Track what they've already investigated
2886
+ - Undo/redo operations
2887
+ - Save and resume investigations
2888
+ - Generate reports of their investigation process
2889
+
2890
+ Example:
2891
+ session = InvestigationSession(files=["app.log"])
2892
+
2893
+ # Perform investigation
2894
+ session.search(level="ERROR")
2895
+ session.follow_thread(correlation_id="req-123")
2896
+ session.find_patterns()
2897
+
2898
+ # Review history
2899
+ history = session.get_history()
2900
+
2901
+ # Undo last operation
2902
+ session.undo()
2903
+
2904
+ # Save for later
2905
+ session.save("incident_2024_01_15.json")
2906
+
2907
+ # Resume later
2908
+ session2 = InvestigationSession.load("incident_2024_01_15.json")
2909
+ """
2910
+
2911
+ def __init__(self, files: Optional[List[str]] = None, name: Optional[str] = None):
2912
+ self.files = files or []
2913
+ self.name = name or f"investigation_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
2914
+ self.history = []
2915
+ self.current_index = -1
2916
+ self.metadata = {}
2917
+
2918
+ if files:
2919
+ self._add_to_history("init", "Initialize investigation", {"files": files}, None)
2920
+
2921
+ def search(
2922
+ self,
2923
+ query: Optional[str] = None,
2924
+ level: Optional[str] = None,
2925
+ output_format: str = "summary",
2926
+ **kwargs,
2927
+ ) -> Dict[str, Any]:
2928
+ """Perform search and track in history"""
2929
+ params = {"query": query, "level": level, "output_format": output_format, **kwargs}
2930
+ result = search(self.files, query=query, level=level, output_format=output_format, **kwargs)
2931
+
2932
+ self._add_to_history(
2933
+ "search",
2934
+ f"Search for {level or 'all'} logs" + (f" matching '{query}'" if query else ""),
2935
+ params,
2936
+ result,
2937
+ )
2938
+
2939
+ return result
2940
+
2941
+ def follow_thread(
2942
+ self,
2943
+ thread_id: Optional[str] = None,
2944
+ correlation_id: Optional[str] = None,
2945
+ trace_id: Optional[str] = None,
2946
+ ) -> Dict[str, Any]:
2947
+ """Follow thread and track in history"""
2948
+ params = {"thread_id": thread_id, "correlation_id": correlation_id, "trace_id": trace_id}
2949
+ result = follow_thread(
2950
+ self.files, thread_id=thread_id, correlation_id=correlation_id, trace_id=trace_id
2951
+ )
2952
+
2953
+ thread_desc = thread_id or correlation_id or trace_id
2954
+ self._add_to_history("follow_thread", f"Follow thread: {thread_desc}", params, result)
2955
+
2956
+ return result
2957
+
2958
+ def find_patterns(self, min_occurrences: int = 3) -> Dict[str, Any]:
2959
+ """Find patterns and track in history"""
2960
+ params = {"min_occurrences": min_occurrences}
2961
+ result = find_patterns(self.files, min_occurrences=min_occurrences)
2962
+
2963
+ self._add_to_history(
2964
+ "find_patterns", f"Find patterns (min {min_occurrences} occurrences)", params, result
2965
+ )
2966
+
2967
+ return result
2968
+
2969
+ def compare_threads(self, **kwargs) -> Dict[str, Any]:
2970
+ """Compare threads and track in history"""
2971
+ result = compare_threads(self.files, **kwargs)
2972
+
2973
+ desc = f"Compare {kwargs.get('correlation_a', 'A')} vs {kwargs.get('correlation_b', 'B')}"
2974
+ self._add_to_history("compare_threads", desc, kwargs, result)
2975
+
2976
+ return result
2977
+
2978
+ def cross_service_timeline(
2979
+ self, service_files: Dict[str, List[str]], **kwargs
2980
+ ) -> Dict[str, Any]:
2981
+ """Create cross-service timeline and track in history"""
2982
+ result = cross_service_timeline(service_files, **kwargs)
2983
+
2984
+ desc = f"Cross-service timeline for {list(service_files.keys())}"
2985
+ self._add_to_history(
2986
+ "cross_service_timeline", desc, {"service_files": service_files, **kwargs}, result
2987
+ )
2988
+
2989
+ return result
2990
+
2991
+ def add_note(self, note: str):
2992
+ """Add a text note to the investigation"""
2993
+ self._add_to_history("note", f"Note: {note[:50]}...", {"note": note}, None)
2994
+
2995
+ def _add_to_history(
2996
+ self,
2997
+ operation_type: str,
2998
+ description: str,
2999
+ params: Dict[str, Any],
3000
+ result: Optional[Dict[str, Any]],
3001
+ ):
3002
+ """Add operation to history"""
3003
+ # Remove any operations after current index (for undo/redo)
3004
+ self.history = self.history[: self.current_index + 1]
3005
+
3006
+ entry = {
3007
+ "timestamp": datetime.now().isoformat(),
3008
+ "operation": operation_type,
3009
+ "description": description,
3010
+ "params": params,
3011
+ "result_summary": self._summarize_result(result) if result else None,
3012
+ }
3013
+
3014
+ self.history.append(entry)
3015
+ self.current_index = len(self.history) - 1
3016
+
3017
+ def _summarize_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
3018
+ """Create a compact summary of operation result"""
3019
+ if not result:
3020
+ return {}
3021
+
3022
+ summary = {}
3023
+
3024
+ # Common fields
3025
+ if "total_matches" in result:
3026
+ summary["total_matches"] = result["total_matches"]
3027
+ if "total_entries" in result:
3028
+ summary["total_entries"] = result["total_entries"]
3029
+ if "duration_ms" in result:
3030
+ summary["duration_ms"] = result["duration_ms"]
3031
+ if "summary" in result:
3032
+ summary["summary"] = result["summary"]
3033
+
3034
+ # Pattern results
3035
+ if "patterns" in result:
3036
+ summary["pattern_count"] = len(result["patterns"])
3037
+
3038
+ # Timeline results
3039
+ if "timeline" in result:
3040
+ summary["timeline_length"] = len(result["timeline"])
3041
+
3042
+ return summary
3043
+
3044
+ def get_history(self, include_results: bool = False) -> List[Dict[str, Any]]:
3045
+ """Get investigation history"""
3046
+ if include_results:
3047
+ return self.history
3048
+ else:
3049
+ # Return without full results (more token-efficient)
3050
+ return [
3051
+ {
3052
+ "timestamp": h["timestamp"],
3053
+ "operation": h["operation"],
3054
+ "description": h["description"],
3055
+ "result_summary": h.get("result_summary"),
3056
+ }
3057
+ for h in self.history
3058
+ ]
3059
+
3060
+ def undo(self) -> bool:
3061
+ """Undo last operation"""
3062
+ if self.current_index > 0:
3063
+ self.current_index -= 1
3064
+ return True
3065
+ return False
3066
+
3067
+ def redo(self) -> bool:
3068
+ """Redo previously undone operation"""
3069
+ if self.current_index < len(self.history) - 1:
3070
+ self.current_index += 1
3071
+ return True
3072
+ return False
3073
+
3074
+ def get_current_focus(self) -> Optional[Dict[str, Any]]:
3075
+ """Get the current operation being focused on"""
3076
+ if 0 <= self.current_index < len(self.history):
3077
+ return self.history[self.current_index]
3078
+ return None
3079
+
3080
+ def save(self, filepath: str):
3081
+ """Save session to file"""
3082
+ data = {
3085
+ "name": self.name,
3086
+ "files": self.files,
3087
+ "history": self.history,
3088
+ "current_index": self.current_index,
3089
+ "metadata": self.metadata,
3090
+ "saved_at": datetime.now().isoformat(),
3091
+ }
3092
+
3093
+ with open(filepath, "w") as f:
3094
+ json.dump(data, f, indent=2)
3095
+
3096
+ @classmethod
3097
+ def load(cls, filepath: str) -> "InvestigationSession":
3098
+ """Load session from file"""
3099
+ with open(filepath, "r") as f:
3102
+ data = json.load(f)
3103
+
3104
+ session = cls(files=data["files"], name=data["name"])
3105
+ session.history = data["history"]
3106
+ session.current_index = data["current_index"]
3107
+ session.metadata = data.get("metadata", {})
3108
+
3109
+ return session
3110
+
3111
+ def get_summary(self) -> str:
3112
+ """Get a human-readable summary of the investigation"""
3113
+ if not self.history:
3114
+ return "No investigation steps yet"
3115
+
3116
+ lines = [
3117
+ f"Investigation: {self.name}",
3118
+ f"Steps completed: {len(self.history)}",
3119
+ "",
3120
+ "Timeline:",
3121
+ ]
3122
+
3123
+ for i, entry in enumerate(self.history):
3124
+ marker = "→" if i == self.current_index else " "
3125
+ lines.append(f" {marker} {i+1}. {entry['description']}")
3126
+ if entry.get("result_summary"):
3127
+ for key, value in entry["result_summary"].items():
3128
+ lines.append(f" {key}: {value}")
3129
+
3130
+ return "\n".join(lines)
3131
+
3132
+ def generate_report(self, format: str = "markdown", include_evidence: bool = True) -> str:
3133
+ """
3134
+ Generate a comprehensive investigation report.
3135
+
3136
+ Args:
3137
+ format: Output format - "markdown", "text", or "json"
3138
+ include_evidence: Include example log entries as evidence
3139
+
3140
+ Returns:
3141
+ Formatted investigation report string
3142
+ """
3143
+ if format == "markdown":
3144
+ return self._generate_markdown_report(include_evidence)
3145
+ elif format == "text":
3146
+ return self._generate_text_report(include_evidence)
3147
+ elif format == "json":
3148
+ import json
3149
+
3150
+ return json.dumps(self._generate_json_report(include_evidence), indent=2)
3151
+ else:
3152
+ return self._generate_markdown_report(include_evidence)
3153
+
3154
+ def _generate_markdown_report(self, include_evidence: bool) -> str:
3155
+ """Generate Markdown format report"""
3156
+ lines = [
3157
+ f"# Investigation Report: {self.name}",
3158
+ "",
3159
+ f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
3160
+ f"**Files Analyzed:** {', '.join(self.files)}",
3161
+ f"**Steps Completed:** {len(self.history)}",
3162
+ "",
3163
+ "---",
3164
+ "",
3165
+ "## Executive Summary",
3166
+ "",
3167
+ ]
3168
+
3169
+ # Try to extract key findings
3170
+ error_counts = []
3171
+ patterns_found = []
3172
+ key_insights = []
3173
+
3174
+ for entry in self.history:
3175
+ summary = entry.get("result_summary") or {}
3176
+ if "total_matches" in summary and entry["operation"] == "search":
3177
+ error_counts.append(
3178
+ f"- Found {summary['total_matches']} matches in {entry['description']}"
3179
+ )
3180
+ if "pattern_count" in summary:
3181
+ patterns_found.append(f"- Identified {summary['pattern_count']} repeated patterns")
3182
+ if "summary" in summary:
3183
+ key_insights.append(f"- {summary['summary']}")
3184
+
3185
+ if error_counts:
3186
+ lines.extend(error_counts)
3187
+ if patterns_found:
3188
+ lines.extend(patterns_found)
3189
+ if key_insights:
3190
+ lines.append("")
3191
+ lines.append("### Key Findings")
3192
+ lines.extend(key_insights)
3193
+
3194
+ lines.extend(["", "---", "", "## Investigation Timeline", ""])
3195
+
3196
+ # Add detailed timeline
3197
+ for i, entry in enumerate(self.history):
3198
+ timestamp = entry.get("timestamp", "Unknown time")
3199
+ desc = entry["description"]
3200
+ operation = entry["operation"]
3201
+
3202
+ lines.append(f"### Step {i+1}: {desc}")
3203
+ lines.append("")
3204
+ lines.append(f"- **Time:** {timestamp}")
3205
+ lines.append(f"- **Operation:** `{operation}`")
3206
+
3207
+ # Add results
3208
+ if entry.get("result_summary"):
3209
+ lines.append("- **Results:**")
3210
+ for key, value in entry["result_summary"].items():
3211
+ lines.append(f" - {key}: {value}")
3212
+
3213
+ lines.append("")
3214
+
3215
+ lines.extend(
3216
+ [
3217
+ "---",
3218
+ "",
3219
+ "## Conclusions",
3220
+ "",
3221
+ "Based on the investigation steps above, review the key findings and error patterns.",
3222
+ "",
3223
+ "## Next Steps",
3224
+ "",
3225
+ "- [ ] Review identified error patterns",
3226
+ "- [ ] Investigate root causes",
3227
+ "- [ ] Implement fixes",
3228
+ "- [ ] Monitor for recurrence",
3229
+ "",
3230
+ ]
3231
+ )
3232
+
3233
+ return "\n".join(lines)
3234
+
3235
+ def _generate_text_report(self, include_evidence: bool) -> str:
3236
+ """Generate plain text format report"""
3237
+ lines = [
3238
+ "=" * 70,
3239
+ f"INVESTIGATION REPORT: {self.name}",
3240
+ "=" * 70,
3241
+ f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
3242
+ f"Files: {', '.join(self.files)}",
3243
+ f"Steps: {len(self.history)}",
3244
+ "=" * 70,
3245
+ "",
3246
+ "TIMELINE:",
3247
+ "",
3248
+ ]
3249
+
3250
+ for i, entry in enumerate(self.history):
3251
+ timestamp = entry.get("timestamp", "Unknown")
3252
+ lines.append(f"{i+1}. [{timestamp}] {entry['description']}")
3253
+
3254
+ if entry.get("result_summary"):
3255
+ for key, value in entry["result_summary"].items():
3256
+ lines.append(f" - {key}: {value}")
3257
+ lines.append("")
3258
+
3259
+ lines.extend(["=" * 70, "END OF REPORT", "=" * 70])
3260
+
3261
+ return "\n".join(lines)
3262
+
3263
+ def _generate_json_report(self, include_evidence: bool) -> Dict[str, Any]:
3264
+ """Generate JSON format report"""
3265
+ return {
3266
+ "name": self.name,
3267
+ "generated_at": datetime.now().isoformat(),
3268
+ "files": self.files,
3269
+ "steps_completed": len(self.history),
3270
+ "timeline": (
3271
+ self.history if include_evidence else self.get_history(include_results=False)
3272
+ ),
3273
+ "metadata": self.metadata,
3274
+ }
3275
+
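+ # Illustrative sketch: persist a finished investigation and emit a Markdown
+ # report for human reviewers. File names are hypothetical.
+ def _example_session_report():
+     session = InvestigationSession(files=["app.log"], name="checkout-incident")
+     session.search(level="ERROR")
+     session.add_note("Errors cluster around the 14:00 deploy")
+     session.save("checkout-incident.json")
+     with open("checkout-incident.md", "w") as f:
+         f.write(session.generate_report(format="markdown"))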
3276
+
3277
+ # Smart Sampling
3278
+
3279
+
3280
+ def smart_sample(
3281
+ files: List[str],
3282
+ level: Optional[str] = None,
3283
+ strategy: str = "representative",
3284
+ sample_size: int = 50,
3285
+ ) -> Dict[str, Any]:
3286
+ """
3287
+ Get a smart sample of log entries that represents the full dataset.
3288
+
3289
+ Instead of random sampling, this uses intelligent strategies to ensure
3290
+ the sample is informative and diverse.
3291
+
3292
+ Args:
3293
+ files: List of log file paths
3294
+ level: Optional log level filter
3295
+ strategy: Sampling strategy:
3296
+ - "representative": Balanced mix of levels, times, and patterns
3297
+ - "diverse": Maximum diversity (different messages, threads, etc.)
3298
+ - "chronological": Evenly spaced across time
3299
+ - "errors_focused": Prioritize errors with context
3300
+ sample_size: Target number of entries (default 50)
3301
+
3302
+ Returns:
3303
+ Dictionary with sampled entries:
3304
+ {
3305
+ "samples": [...], # Selected log entries
3306
+ "total_population": 15230,
3307
+ "sample_size": 50,
3308
+ "strategy": "representative",
3309
+ "coverage": {
3310
+ "time_coverage": 0.95, # % of time range covered
3311
+ "level_coverage": {"ERROR": 10, "INFO": 35, "WARN": 5},
3312
+ "thread_coverage": 23 # Number of unique threads
3313
+ }
3314
+ }
3315
+
3316
+ Example:
3317
+ # Get representative sample of 100 entries
3318
+ sample = smart_sample(
3319
+ files=["app.log"],
3320
+ strategy="representative",
3321
+ sample_size=100
3322
+ )
3323
+
3324
+ # Analyze the sample (much faster than full dataset)
3325
+ for entry in sample['samples']:
3326
+ print(entry['message'])
3327
+ """
3328
+ if not RUST_AVAILABLE:
3329
+ raise RuntimeError("Rust backend not available")
3330
+
3331
+ # Get all entries
3332
+ results = search(files, level=level, limit=None)
3333
+ all_entries = [r["entry"] for r in results.get("results", [])]
3334
+
3335
+ if not all_entries:
3336
+ return {
3337
+ "samples": [],
3338
+ "total_population": 0,
3339
+ "sample_size": 0,
3340
+ "strategy": strategy,
3341
+ "coverage": {},
3342
+ }
3343
+
3344
+ # Apply sampling strategy
3345
+ if strategy == "representative":
3346
+ samples = _sample_representative(all_entries, sample_size)
3347
+ elif strategy == "diverse":
3348
+ samples = _sample_diverse(all_entries, sample_size)
3349
+ elif strategy == "chronological":
3350
+ samples = _sample_chronological(all_entries, sample_size)
3351
+ elif strategy == "errors_focused":
3352
+ samples = _sample_errors_focused(all_entries, sample_size)
3353
+ else:
3354
+ # Default to representative
3355
+ samples = _sample_representative(all_entries, sample_size)
3356
+
3357
+ # Calculate coverage
3358
+ coverage = _calculate_coverage(all_entries, samples)
3359
+
3360
+ return {
3361
+ "samples": samples,
3362
+ "total_population": len(all_entries),
3363
+ "sample_size": len(samples),
3364
+ "strategy": strategy,
3365
+ "coverage": coverage,
3366
+ }
3367
+
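+ # Illustrative sketch: check how well a sample represents the population
+ # before drawing conclusions from it. The file name is hypothetical.
+ def _example_check_sample_coverage():
+     sample = smart_sample(["app.log"], strategy="errors_focused", sample_size=100)
+     coverage = sample["coverage"]
+     print(f"time coverage: {coverage.get('time_coverage', 0):.0%}")
+     print(
+         f"threads: {coverage.get('unique_threads_in_sample', 0)} of "
+         f"{coverage.get('unique_threads_in_population', 0)}"
+     )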
3368
+
3369
+ def _sample_representative(entries: List[Dict], size: int) -> List[Dict]:
3370
+ """Sample to represent overall distribution"""
3371
+ if len(entries) <= size:
3372
+ return entries
3373
+
3374
+ samples = []
3375
+
3376
+ # Group by level
3377
+ by_level = defaultdict(list)
3378
+ for entry in entries:
3379
+ level = entry.get("level", "INFO")
3380
+ by_level[level].append(entry)
3381
+
3382
+ # Calculate proportional samples per level
3383
+ for level, level_entries in by_level.items():
3384
+ proportion = len(level_entries) / len(entries)
3385
+ level_sample_size = max(1, int(size * proportion))
3386
+
3387
+ # Sample evenly across time
3388
+ if level_sample_size >= len(level_entries):
3389
+ samples.extend(level_entries)
3390
+ else:
3391
+ step = len(level_entries) / level_sample_size
3392
+ indices = [int(i * step) for i in range(level_sample_size)]
3393
+ samples.extend([level_entries[i] for i in indices])
3394
+
3395
+ # If we have too many, trim to size
3396
+ if len(samples) > size:
3397
+ step = len(samples) / size
3398
+ indices = [int(i * step) for i in range(size)]
3399
+ samples = [samples[i] for i in indices]
3400
+
3401
+ return samples[:size]
3402
+
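+ # Worked example of the proportional allocation above (hypothetical numbers):
+ # with 1,000 entries split 800 INFO / 150 WARN / 50 ERROR and size=50, the
+ # per-level budgets are max(1, int(50 * 0.80)) = 40 INFO,
+ # max(1, int(50 * 0.15)) = 7 WARN and max(1, int(50 * 0.05)) = 2 ERROR,
+ # each taken at an even stride across the entries for that level.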
3403
+
3404
+ def _sample_diverse(entries: List[Dict], size: int) -> List[Dict]:
3405
+ """Sample for maximum diversity"""
3406
+ if len(entries) <= size:
3407
+ return entries
3408
+
3409
+ samples = []
3410
+ used_messages = set()
3411
+ used_threads = set()
3412
+
3413
+ # First pass: unique messages
3414
+ for entry in entries:
3415
+ if len(samples) >= size:
3416
+ break
3417
+
3418
+ message = entry.get("message", "")
3419
+ if message and message not in used_messages:
3420
+ samples.append(entry)
3421
+ used_messages.add(message)
3422
+ thread = entry.get("thread_id") or entry.get("correlation_id")
3423
+ if thread:
3424
+ used_threads.add(thread)
3425
+
3426
+ # Second pass: unique threads
3427
+ if len(samples) < size:
3428
+ for entry in entries:
3429
+ if len(samples) >= size:
3430
+ break
3431
+
3432
+ thread = entry.get("thread_id") or entry.get("correlation_id")
3433
+ if thread and thread not in used_threads:
3434
+ samples.append(entry)
3435
+ used_threads.add(thread)
3436
+
3437
+ # Third pass: fill remaining slots with evenly spaced entries,
+ # skipping entries already selected in earlier passes
3438
+ if len(samples) < size:
3439
+ chosen = {id(e) for e in samples}
+ remaining = size - len(samples)
3440
+ step = len(entries) / remaining
3441
+ for i in range(remaining):
3442
+ idx = int(i * step)
3443
+ if idx < len(entries) and id(entries[idx]) not in chosen:
3444
+ samples.append(entries[idx])
+ chosen.add(id(entries[idx]))
3445
+
3446
+ return samples[:size]
3447
+
3448
+
3449
+ def _sample_chronological(entries: List[Dict], size: int) -> List[Dict]:
3450
+ """Sample evenly across time"""
3451
+ if len(entries) <= size:
3452
+ return entries
3453
+
3454
+ # Sort by timestamp
3455
+ sorted_entries = sorted(entries, key=lambda e: e.get("timestamp", ""))
3456
+
3457
+ # Sample evenly
3458
+ step = len(sorted_entries) / size
3459
+ indices = [int(i * step) for i in range(size)]
3460
+ return [sorted_entries[i] for i in indices]
3461
+
3462
+
3463
+ def _sample_errors_focused(entries: List[Dict], size: int) -> List[Dict]:
3464
+ """Sample focusing on errors with context"""
3465
+ if len(entries) <= size:
3466
+ return entries
3467
+
3468
+ samples = []
3469
+ error_indices = []
3470
+ non_error_indices = []
3471
+
3472
+ # Separate errors from non-errors
3473
+ for i, entry in enumerate(entries):
3474
+ level = entry.get("level", "INFO")
3475
+ if level in ["ERROR", "FATAL"]:
3476
+ error_indices.append(i)
3477
+ else:
3478
+ non_error_indices.append(i)
3479
+
3480
+ # Allocate 70% to errors, 30% to context
3481
+ error_budget = int(size * 0.7)
3482
+
3483
+ # Sample errors
3484
+ if error_indices:
3485
+ if len(error_indices) <= error_budget:
3486
+ # All errors + some context
3487
+ for idx in error_indices:
3488
+ samples.append(entries[idx])
3489
+ # Add 1-2 entries before error for context
3490
+ if idx > 0:
3491
+ samples.append(entries[idx - 1])
3492
+ else:
3493
+ # Sample errors evenly
3494
+ step = len(error_indices) / error_budget
3495
+ for i in range(error_budget):
3496
+ idx = error_indices[int(i * step)]
3497
+ samples.append(entries[idx])
3498
+
3499
+ # Sample non-errors for context
3500
+ if non_error_indices and len(samples) < size:
3501
+ remaining = size - len(samples)
3502
+ step = len(non_error_indices) / remaining
3503
+ for i in range(remaining):
3504
+ idx = non_error_indices[min(int(i * step), len(non_error_indices) - 1)]
3505
+ samples.append(entries[idx])
3506
+
3507
+ # Restore the samples' original file order
3508
+ entry_to_index = {id(e): i for i, e in enumerate(entries)}
3509
+ samples.sort(key=lambda e: entry_to_index.get(id(e), 0))
3510
+
3511
+ return samples[:size]
3512
+
3513
+
3514
+ def _calculate_coverage(population: List[Dict], sample: List[Dict]) -> Dict[str, Any]:
3515
+ """Calculate how well the sample covers the population"""
3516
+ # Time coverage
3517
+ pop_times = [e.get("timestamp") for e in population if e.get("timestamp")]
3518
+ sample_times = [e.get("timestamp") for e in sample if e.get("timestamp")]
3519
+
3520
+ time_coverage = 0.0
3521
+ if pop_times and sample_times:
3522
+ pop_times.sort()
3523
+ sample_times.sort()
3524
+ # Simple coverage: sample span / population span
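+ # Normalize trailing "Z" (UTC): fromisoformat() rejects it before Python 3.11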
3525
+ try:
3526
+ pop_start = datetime.fromisoformat(pop_times[0].replace("Z", "+00:00"))
3527
+ pop_end = datetime.fromisoformat(pop_times[-1].replace("Z", "+00:00"))
3528
+ sample_start = datetime.fromisoformat(sample_times[0].replace("Z", "+00:00"))
3529
+ sample_end = datetime.fromisoformat(sample_times[-1].replace("Z", "+00:00"))
3530
+
3531
+ pop_duration = (pop_end - pop_start).total_seconds()
3532
+ sample_duration = (sample_end - sample_start).total_seconds()
3533
+
3534
+ if pop_duration > 0:
3535
+ time_coverage = min(1.0, sample_duration / pop_duration)
3536
+ except (ValueError, TypeError, AttributeError):
3537
+ pass # Skip if timestamps are invalid
3538
+
3539
+ # Level coverage
3540
+ level_coverage = defaultdict(int)
3541
+ for entry in sample:
3542
+ level = entry.get("level", "INFO")
3543
+ level_coverage[level] += 1
3544
+
3545
+ # Thread coverage
3546
+ pop_threads = set()
3547
+ sample_threads = set()
3548
+ for entry in population:
3549
+ thread = entry.get("thread_id") or entry.get("correlation_id")
3550
+ if thread:
3551
+ pop_threads.add(thread)
3552
+ for entry in sample:
3553
+ thread = entry.get("thread_id") or entry.get("correlation_id")
3554
+ if thread:
3555
+ sample_threads.add(thread)
3556
+
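+ # Fraction of distinct population threads represented in the sample (0-1)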
3557
+ thread_coverage_pct = len(sample_threads) / len(pop_threads) if pop_threads else 0
3558
+
3559
+ return {
3560
+ "time_coverage": time_coverage,
3561
+ "level_distribution": dict(level_coverage),
3562
+ "unique_threads_in_sample": len(sample_threads),
3563
+ "unique_threads_in_population": len(pop_threads),
3564
+ "thread_coverage_pct": thread_coverage_pct,
3565
+ }
3566
+
3567
+
3568
+ # Automatic Insights and Suggestions
3569
+
3570
+
3571
+ def analyze_with_insights(
3572
+ files: List[str], level: Optional[str] = None, auto_investigate: bool = True
3573
+ ) -> Dict[str, Any]:
3574
+ """
3575
+ Analyze logs and automatically generate insights and suggestions.
3576
+
3577
+ This is the "smart mode" that does the thinking for you - perfect for
3578
+ LLMs that want quick actionable information.
3579
+
3580
+ Args:
3581
+ files: List of log file paths
3582
+ level: Optional log level filter (default: analyzes all levels)
3583
+ auto_investigate: Automatically perform follow-up investigations
3584
+
3585
+ Returns:
3586
+ Dictionary with insights:
3587
+ {
3588
+ "overview": {...}, # Quick stats
3589
+ "insights": [
3590
+ {
3591
+ "type": "error_spike",
3592
+ "severity": "high",
3593
+ "description": "Error rate 10x higher than normal",
3594
+ "evidence": [...],
3595
+ "suggestion": "Check database connections"
3596
+ },
3597
+ ...
3598
+ ],
3599
+ "suggestions": [
3600
+ "Follow thread req-12345 - appears to be failing consistently",
3601
+ "Check database connection pool size",
3602
+ ...
3603
+ ],
3604
+ "next_steps": [...] # Recommended next investigation steps
3605
+ }
3606
+
3607
+ Example:
3608
+ # One-shot analysis with insights
3609
+ result = analyze_with_insights(files=["app.log"])
3610
+
3611
+ for insight in result['insights']:
3612
+ print(f"[{insight['severity']}] {insight['description']}")
3613
+ print(f" → {insight['suggestion']}")
3614
+ """
3615
+ if not RUST_AVAILABLE:
3616
+ raise RuntimeError("Rust backend not available")
3617
+
3618
+ insights = []
3619
+ suggestions = []
3620
+ next_steps = []
3621
+
3622
+ # Get overview
3623
+ metadata = get_metadata(files)
3624
+ search_results = search(files, level=level, output_format="summary")
3625
+
3626
+ # Insight 1: Error rate analysis
3627
+ total = search_results.get("total_matches", 0)
3628
+ levels = search_results.get("log_levels", {})
3629
+ error_count = levels.get("ERROR", 0) + levels.get("FATAL", 0)
3630
+
3631
+ if total > 0:
3632
+ error_rate = error_count / total
3633
+ if error_rate > 0.1: # More than 10% errors
3634
+ insights.append(
3635
+ {
3636
+ "type": "high_error_rate",
3637
+ "severity": "high",
3638
+ "description": f"High error rate: {error_rate:.1%} ({error_count}/{total})",
3639
+ "evidence": {"error_count": error_count, "total": total, "rate": error_rate},
3640
+ "suggestion": "Investigate most common errors first",
3641
+ }
3642
+ )
3643
+ next_steps.append("Run: find_patterns(files, min_occurrences=3)")
3644
+
3645
+ # Insight 2: Pattern detection
3646
+ if auto_investigate and error_count > 0:
3647
+ patterns = find_patterns(files, min_occurrences=2)
3648
+ if patterns.get("patterns"):
3649
+ pattern_count = len(patterns["patterns"])
3650
+ insights.append(
3651
+ {
3652
+ "type": "repeated_patterns",
3653
+ "severity": "medium",
3654
+ "description": f"Found {pattern_count} repeated error patterns",
3655
+ "evidence": patterns["patterns"][:3], # Top 3
3656
+ "suggestion": "These errors are systematic, not random",
3657
+ }
3658
+ )
3659
+
3660
+ # Suggest investigating the most frequent pattern
3661
+ if patterns["patterns"]:
3662
+ top_pattern = patterns["patterns"][0]
3663
+ suggestions.append(
3664
+ f"Investigate pattern: '{top_pattern.get('pattern', '')[:50]}...'"
3665
+ )
3666
+
3667
+ # Insight 3: Check for cascading failures
3668
+ if error_count > 5:
3669
+ # Look for timing patterns
3670
+ top_messages = search_results.get("top_messages", [])
3671
+ if top_messages:
3672
+ # Proxy for clustering: the same message repeated many times
3673
+ time_clustered = any(msg.get("count", 0) > 3 for msg in top_messages)
3674
+ if time_clustered:
3675
+ insights.append(
3676
+ {
3677
+ "type": "possible_cascade",
3678
+ "severity": "high",
3679
+ "description": "Errors may be cascading (multiple errors in short time)",
3680
+ "evidence": top_messages[:2],
3681
+ "suggestion": "Look for root cause - later errors may be symptoms",
3682
+ }
3683
+ )
3684
+ suggestions.append("Check timestamps - investigate earliest error first")
3685
+
3686
+ # Insight 4: Thread analysis
3687
+ for meta in metadata:
3688
+ unique_correlations = meta.get("unique_correlation_ids", 0)
3689
+
3690
+ if error_count > 0 and unique_correlations > 0:
3691
+ # Some threads are failing
3692
+ insights.append(
3693
+ {
3694
+ "type": "thread_failures",
3695
+ "severity": "medium",
3696
+ "description": f"Errors across {unique_correlations} different requests",
3697
+ "evidence": {"unique_correlations": unique_correlations},
3698
+ "suggestion": "Compare successful vs failed requests",
3699
+ }
3700
+ )
3701
+ next_steps.append("Use: compare_threads() to find differences")
3702
+
3703
+ # Generate suggestions based on insights
3704
+ if not suggestions:
3705
+ if error_count > 0:
3706
+ suggestions.append("Start by examining the first error - it may be the root cause")
3707
+ suggestions.append("Use follow_thread() to see full request flow")
3708
+ else:
3709
+ suggestions.append("No errors found - logs look healthy")
3710
+
3711
+ # Overview
3712
+ overview = {
3713
+ "total_logs": total,
3714
+ "error_count": error_count,
3715
+ "error_rate": error_count / total if total > 0 else 0,
3716
+ "files_analyzed": len(files),
3717
+ "log_levels": levels,
3718
+ }
3719
+
3720
+ return {
3721
+ "overview": overview,
3722
+ "insights": insights,
3723
+ "suggestions": suggestions,
3724
+ "next_steps": next_steps,
3725
+ "investigated_automatically": auto_investigate,
3726
+ }
3727
+
3728
+
3729
+ def explain(
3730
+ entry: Optional[Dict[str, Any]] = None,
3731
+ error_message: Optional[str] = None,
3732
+ context: str = "general",
3733
+ ) -> str:
3734
+ """
3735
+ Explain a log entry or error message in simple terms.
3736
+
3737
+ Perfect for when you encounter cryptic errors or need to understand
3738
+ what's happening. Provides human-friendly explanations and next steps.
3739
+
3740
+ Args:
3741
+ entry: Log entry dictionary to explain
3742
+ error_message: Alternatively, a raw error message string
3743
+ context: Context for explanation ("production", "development", "general")
3744
+
3745
+ Returns:
3746
+ Human-friendly explanation string
3747
+
3748
+ Example:
3749
+ # Explain a log entry
3750
+ explanation = explain(
3751
+ entry=error_entry,
3752
+ context="production"
3753
+ )
3754
+ print(explanation)
3755
+
3756
+ # Explain just a message
3757
+ explanation = explain(
3758
+ error_message="Connection pool exhausted",
3759
+ context="production"
3760
+ )
3761
+ """
3762
+ if entry:
3763
+ message = entry.get("message", "")
3764
+ level = entry.get("level", "INFO")
3765
+ elif error_message:
3766
+ message = error_message
3767
+ level = "ERROR"
3768
+ else:
3769
+ return "No entry or message provided to explain"
3770
+
3771
+ # Build explanation
3772
+ lines = []
3773
+
3774
+ # What happened
3775
+ lines.append("## What This Means\n")
3776
+
3777
+ # Pattern matching for common errors
3778
+ message_lower = message.lower()
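+ # Heuristic: match well-known failure phrases by substring; the first match wins.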
3779
+
3780
+ if "timeout" in message_lower or "timed out" in message_lower:
3781
+ lines.append("A timeout means an operation took too long and was cancelled.")
3782
+ lines.append("\n**Common causes:**")
3783
+ lines.append("- Database query is too slow")
3784
+ lines.append("- Network latency issues")
3785
+ lines.append("- Service is overloaded")
3786
+ lines.append("- Deadlock or infinite loop")
3787
+ lines.append("\n**Next steps:**")
3788
+ lines.append("1. Check what operation was timing out")
3789
+ lines.append("2. Look at the service being called - is it slow or down?")
3790
+ lines.append("3. Review timeout configuration - is it too short?")
3791
+
3792
+ elif "connection" in message_lower and (
3793
+ "refused" in message_lower or "failed" in message_lower
3794
+ ):
3795
+ lines.append("A connection failure means the application couldn't reach another service.")
3796
+ lines.append("\n**Common causes:**")
3797
+ lines.append("- Service is down or not responding")
3798
+ lines.append("- Network connectivity issues")
3799
+ lines.append("- Firewall blocking the connection")
3800
+ lines.append("- Wrong hostname/port configuration")
3801
+ lines.append("\n**Next steps:**")
3802
+ lines.append("1. Check if the target service is running")
3803
+ lines.append("2. Verify network connectivity")
3804
+ lines.append("3. Check configuration (hostname, port, etc.)")
3805
+
3806
+ elif "pool exhausted" in message_lower or "too many connections" in message_lower:
3807
+ lines.append("The connection pool is exhausted - all available connections are in use.")
3808
+ lines.append("\n**Common causes:**")
3809
+ lines.append("- Traffic spike overwhelming the system")
3810
+ lines.append("- Connection leaks (not closing connections)")
3811
+ lines.append("- Pool size too small for the load")
3812
+ lines.append("- Slow queries holding connections too long")
3813
+ lines.append("\n**Next steps:**")
3814
+ lines.append("1. Check connection pool size configuration")
3815
+ lines.append("2. Look for connection leaks in code")
3816
+ lines.append("3. Identify slow operations holding connections")
3817
+ lines.append("4. Consider increasing pool size if load is legitimate")
3818
+
3819
+ elif "out of memory" in message_lower or "outofmemoryerror" in message_lower:
3820
+ lines.append("The application ran out of available memory.")
3821
+ lines.append("\n**Common causes:**")
3822
+ lines.append("- Memory leak (memory not being freed)")
3823
+ lines.append("- Processing too much data at once")
3824
+ lines.append("- Insufficient memory allocated")
3825
+ lines.append("- Caching too aggressively")
3826
+ lines.append("\n**Next steps:**")
3827
+ lines.append("1. Check memory allocation settings")
3828
+ lines.append("2. Look for memory leaks")
3829
+ lines.append("3. Review data processing - can it be batched/streamed?")
3830
+ lines.append("4. Check garbage collection logs")
3831
+
3832
+ elif "null" in message_lower and ("pointer" in message_lower or "reference" in message_lower):
3833
+ lines.append("Tried to use something that doesn't exist (null/None).")
3834
+ lines.append("\n**Common causes:**")
3835
+ lines.append("- Missing input validation")
3836
+ lines.append("- Unexpected missing data")
3837
+ lines.append("- Race condition")
3838
+ lines.append("- API returned unexpected null")
3839
+ lines.append("\n**Next steps:**")
3840
+ lines.append("1. Check the stack trace to find where it happened")
3841
+ lines.append("2. Add null checks and validation")
3842
+ lines.append("3. Review why the value was null")
3843
+
3844
+ elif (
3845
+ "permission" in message_lower
3846
+ or "access denied" in message_lower
3847
+ or "forbidden" in message_lower
3848
+ ):
3849
+ lines.append("The application doesn't have permission to perform this action.")
3850
+ lines.append("\n**Common causes:**")
3851
+ lines.append("- Incorrect file/resource permissions")
3852
+ lines.append("- Wrong user/service account")
3853
+ lines.append("- Missing IAM roles or policies")
3854
+ lines.append("- Authentication token expired")
3855
+ lines.append("\n**Next steps:**")
3856
+ lines.append("1. Check file/resource permissions")
3857
+ lines.append("2. Verify the application is running as the correct user")
3858
+ lines.append("3. Review access control policies")
3859
+
3860
+ else:
3861
+ # Generic explanation
3862
+ if level == "ERROR" or level == "FATAL":
3863
+ lines.append("This is an error that prevented normal operation.")
3864
+ lines.append(f"\nError message: `{message}`")
3865
+ lines.append("\n**Next steps:**")
3866
+ lines.append("1. Look at the full stack trace if available")
3867
+ lines.append("2. Check what operation was being performed")
3868
+ lines.append("3. Look for similar errors - is this a pattern?")
3869
+ lines.append("4. Check if there were recent changes (deployment, config)")
3870
+ elif level == "WARN":
3871
+ lines.append("This is a warning - not critical but worth investigating.")
3872
+ lines.append(f"\nMessage: `{message}`")
3873
+ lines.append("\n**Next steps:**")
3874
+ lines.append("1. Determine if this warning is expected")
3875
+ lines.append("2. Check if it's happening frequently")
3876
+ lines.append("3. Consider if it could become a problem")
3877
+ else:
3878
+ lines.append("This is an informational message.")
3879
+ lines.append(f"\nMessage: `{message}`")
3880
+
3881
+ # Context-specific advice
3882
+ if context == "production":
3883
+ lines.append("\n**Production Context:**")
3884
+ lines.append("- Check monitoring dashboards for patterns")
3885
+ lines.append("- Review recent deployments")
3886
+ lines.append("- Consider impact on users")
3887
+ lines.append("- Prepare rollback plan if needed")
3888
+
3889
+ return "\n".join(lines)
3890
+
3891
+
3892
+ def suggest_next_action(
3893
+ current_results: Dict[str, Any], investigation_context: Optional[Dict] = None
3894
+ ) -> List[str]:
3895
+ """
3896
+ Suggest what to investigate next based on current results.
3897
+
3898
+ Args:
3899
+ current_results: Results from previous operation (search, pattern finding, etc.)
3900
+ investigation_context: Optional context about what's been investigated so far
3901
+
3902
+ Returns:
3903
+ List of suggested next actions with example code
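+
+ Example:
+ # Illustrative sketch - assumes "app.log" exists
+ results = search(files=["app.log"], level="ERROR", output_format="summary")
+ for tip in suggest_next_action(results):
+ print(tip)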
3904
+ """
3905
+ suggestions = []
3906
+
3907
+ # Based on search results
3908
+ if "total_matches" in current_results:
3909
+ total = current_results["total_matches"]
3910
+ if total == 0:
3911
+ suggestions.append("No matches found. Try:")
3912
+ suggestions.append(" - Broaden search (remove filters)")
3913
+ suggestions.append(" - Check different log files")
3914
+ suggestions.append(" - Verify time range")
3915
+ elif total > 1000:
3916
+ suggestions.append(f"Large result set ({total} matches). Consider:")
3917
+ suggestions.append(" - Use output_format='summary' for token efficiency")
3918
+ suggestions.append(" - Add more filters (level, time range, thread_id)")
3919
+ suggestions.append(" - Use smart_sample() to get representative sample")
3920
+ elif total > 0:
3921
+ # Good result size, suggest next steps
3922
+ if "top_messages" in current_results:
3923
+ top_msg = (
3924
+ current_results["top_messages"][0] if current_results["top_messages"] else None
3925
+ )
3926
+ if top_msg and top_msg.get("count", 0) > 3:
3927
+ suggestions.append("Repeated errors detected. Next:")
3928
+ suggestions.append(" - find_patterns(files, min_occurrences=3)")
3929
+ suggestions.append(" - Follow one of these threads to see full context")
3930
+
3931
+ # Based on patterns
3932
+ if "patterns" in current_results:
3933
+ pattern_count = len(current_results.get("patterns", []))
3934
+ if pattern_count > 0:
3935
+ suggestions.append(f"Found {pattern_count} patterns. Next:")
3936
+ suggestions.append(" - Compare successful vs failed requests")
3937
+ suggestions.append(" - Check timestamps - are they clustered?")
3938
+
3939
+ # Default suggestions
3940
+ if not suggestions:
3941
+ suggestions.append("Continue investigation:")
3942
+ suggestions.append(" - analyze_with_insights(files) - Get automatic insights")
3943
+ suggestions.append(" - find_patterns(files) - Find repeated issues")
3944
+ suggestions.append(" - compare_time_periods() - Before/after analysis")
3945
+
3946
+ return suggestions
3947
+
3948
+
3949
+ def _load_files_with_config(
3950
+ inv: Any,
3951
+ files: List[str],
3952
+ parser_format: Optional[str] = None,
3953
+ custom_regex: Optional[str] = None,
3954
+ ):
3955
+ """Load files with optional parser config; falls back to plain load if config not supported."""
3956
+ try:
3957
+ if parser_format or custom_regex:
3958
+ return inv.load_files_with_config(files, parser_format, custom_regex)
3959
+ except Exception:
3960
+ # Fall back silently to default loader if enhanced path fails
3961
+ pass
3962
+ return inv.load_files(files)