ai-pipeline-core 0.2.6__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. ai_pipeline_core/__init__.py +78 -125
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +37 -82
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +130 -81
  37. ai_pipeline_core/llm/client.py +327 -193
  38. ai_pipeline_core/llm/model_options.py +14 -86
  39. ai_pipeline_core/llm/model_response.py +60 -103
  40. ai_pipeline_core/llm/model_types.py +16 -34
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -283
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -483
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/simple_runner/__init__.py +0 -14
  85. ai_pipeline_core/simple_runner/cli.py +0 -254
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -247
  87. ai_pipeline_core/storage/__init__.py +0 -8
  88. ai_pipeline_core/storage/storage.py +0 -628
  89. ai_pipeline_core/utils/__init__.py +0 -8
  90. ai_pipeline_core/utils/deploy.py +0 -373
  91. ai_pipeline_core/utils/remote_deployment.py +0 -269
  92. ai_pipeline_core-0.2.6.dist-info/METADATA +0 -500
  93. ai_pipeline_core-0.2.6.dist-info/RECORD +0 -41
  94. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/observability/_debug/_writer.py (new file)
@@ -0,0 +1,843 @@
+ """Local trace writer for filesystem-based debugging."""
+
+ import asyncio
+ import atexit
+ import hashlib
+ import importlib
+ import json
+ import os
+ import re
+ import shutil
+ import socket
+ from datetime import UTC, datetime
+ from queue import Empty, Queue
+ from threading import Lock, Thread
+ from typing import Any
+
+ import yaml
+
+ from ai_pipeline_core.logging import get_pipeline_logger
+
+ from ._config import TraceDebugConfig
+ from ._content import ArtifactStore, ContentWriter
+ from ._summary import generate_summary
+ from ._types import SpanInfo, TraceState, WriteJob
+
+ logger = get_pipeline_logger(__name__)
+
+
+ class LocalTraceWriter:
+     """Writes trace spans to the local filesystem via a background thread.
+
+     Uses a hierarchical directory structure where child spans are nested
+     inside parent span directories. Directory names use numeric prefixes
+     (0001_, 0002_, etc.) to preserve execution order when viewed with `tree`.
+     Generates index files and optionally produces _summary.md and
+     _auto_summary.md for trace analysis.
+     """
+
+     def __init__(self, config: TraceDebugConfig):
+         """Initialize trace writer with config."""
+         self._config = config
+         self._queue: Queue[WriteJob | None] = Queue()
+         self._traces: dict[str, TraceState] = {}
+         self._artifact_stores: dict[str, ArtifactStore] = {}  # One per trace for deduplication
+         self._lock = Lock()
+         self._shutdown = False
+
+         # Ensure base path exists
+         config.path.mkdir(parents=True, exist_ok=True)
+
+         # Clean up old traces if needed
+         self._cleanup_old_traces()
+
+         # Start background writer thread
+         self._writer_thread = Thread(
+             target=self._writer_loop,
+             name="trace-debug-writer",
+             daemon=True,
+         )
+         self._writer_thread.start()
+
+         # Register shutdown handler
+         atexit.register(self.shutdown)
+
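A minimal construction sketch (illustrative only; the TraceDebugConfig keyword arguments are assumptions inferred from the fields this file reads, and in practice the span processor owns the writer's lifecycle rather than user code):

    from pathlib import Path

    from ai_pipeline_core.observability._debug._config import TraceDebugConfig
    from ai_pipeline_core.observability._debug._writer import LocalTraceWriter

    # Assumed constructor kwargs: this file only reads config.path, config.max_traces, etc.
    config = TraceDebugConfig(path=Path(".trace_debug"), max_traces=20)
    writer = LocalTraceWriter(config)  # starts the daemon writer thread
    # ... spans arrive via on_span_start() and on_span_end() ...
    writer.shutdown()  # flushes the queue and finalizes remaining traces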
+     def on_span_start(
+         self,
+         trace_id: str,
+         span_id: str,
+         parent_id: str | None,
+         name: str,
+     ) -> None:
+         """Handle span start - create directories and record metadata.
+
+         Called from SpanProcessor.on_start() in the main thread.
+         Creates hierarchical directories nested under parent spans.
+         """
+         with self._lock:
+             trace = self._get_or_create_trace(trace_id, name)
+
+             # Determine parent path and depth
+             if parent_id and parent_id in trace.spans:
+                 parent_info = trace.spans[parent_id]
+                 parent_path = parent_info.path
+                 depth = parent_info.depth + 1
+             elif parent_id:
+                 # Parent ID provided but not found - orphan span, place at root
+                 logger.warning(f"Span {span_id} has unknown parent {parent_id}, placing at trace root")
+                 parent_path = trace.path
+                 depth = 0
+             else:
+                 parent_path = trace.path
+                 depth = 0
+
+             # Generate ordered directory name (4 digits supports up to 9999 spans)
+             trace.span_counter += 1
+             safe_name = self._sanitize_name(name)
+             dir_name = f"{trace.span_counter:04d}_{safe_name}"
+
+             # Create nested directory
+             span_dir = parent_path / dir_name
+             span_dir.mkdir(parents=True, exist_ok=True)
+
+             # Record span info
+             now = datetime.now(UTC)
+             span_info = SpanInfo(
+                 span_id=span_id,
+                 parent_id=parent_id,
+                 name=name,
+                 span_type="default",
+                 status="running",
+                 start_time=now,
+                 path=span_dir,
+                 depth=depth,
+                 order=trace.span_counter,
+             )
+             trace.spans[span_id] = span_info
+
+             # Track root span
+             if parent_id is None:
+                 trace.root_span_id = span_id
+
+             # Update parent's children list
+             if parent_id and parent_id in trace.spans:
+                 trace.spans[parent_id].children.append(span_id)
+
+     def on_span_end(self, job: WriteJob) -> None:
+         """Queue span end job for background processing.
+
+         Called from SpanProcessor.on_end() in the main thread.
+         """
+         if not self._shutdown:
+             self._queue.put(job)
+
+     def shutdown(self, timeout: float = 30.0) -> None:
+         """Flush queue and stop writer thread."""
+         if self._shutdown:
+             return
+
+         # Send sentinel before setting _shutdown so in-flight on_span_end calls
+         # can still queue their jobs (they check _shutdown before putting).
+         self._queue.put(None)
+         self._writer_thread.join(timeout=timeout)
+         self._shutdown = True
+
+         # Drain any jobs that arrived after the sentinel (race window between
+         # sentinel pickup and thread exit where on_span_end could still queue).
+         while True:
+             try:
+                 job = self._queue.get_nowait()
+                 if job is not None:
+                     self._process_job(job)
+             except Empty:
+                 break
+
+         # Finalize any remaining traces (ones that didn't have root span end yet)
+         with self._lock:
+             for trace in list(self._traces.values()):
+                 try:
+                     self._finalize_trace(trace)
+                 except Exception as e:
+                     logger.warning(f"Failed to finalize trace {trace.trace_id}: {e}")
+             self._traces.clear()
+
+     def _get_or_create_trace(self, trace_id: str, name: str) -> TraceState:
+         """Get existing trace or create new one."""
+         if trace_id in self._traces:
+             return self._traces[trace_id]
+
+         # Create new trace
+         timestamp = datetime.now(UTC)
+         safe_name = self._sanitize_name(name)
+         dir_name = f"{timestamp.strftime('%Y%m%d_%H%M%S')}_{trace_id[:8]}_{safe_name}"
+         trace_path = self._config.path / dir_name
+
+         trace_path.mkdir(parents=True, exist_ok=True)
+         # Note: No 'spans/' subdirectory - spans are nested hierarchically
+
+         trace = TraceState(
+             trace_id=trace_id,
+             name=name,
+             path=trace_path,
+             start_time=timestamp,
+         )
+         self._traces[trace_id] = trace
+
+         # Create artifact store for this trace
+         self._artifact_stores[trace_id] = ArtifactStore(trace_path)
+
+         # Write initial trace metadata
+         self._write_trace_yaml(trace)
+
+         return trace
+
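Together, _get_or_create_trace and on_span_start produce a layout like this hypothetical trace (names and IDs invented for illustration; the trace directory combines a timestamp, the first 8 characters of the trace ID, and the sanitized name):

    20250101_120000_ab12cd34_my_flow/
    ├── _trace.yaml
    ├── _tree.yaml
    └── 0001_my_flow/
        ├── _span.yaml
        ├── 0002_load_documents/
        │   └── _span.yaml
        └── 0003_generate_report/
            └── _span.yaml

Note that span_counter is per-trace rather than per-parent, so siblings and nested children share one global ordering.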
+     def _writer_loop(self) -> None:
+         """Background thread loop for processing write jobs."""
+         while True:
+             try:
+                 job = self._queue.get(timeout=1.0)
+             except Empty:
+                 continue
+
+             if job is None:
+                 # Shutdown signal
+                 break
+
+             try:
+                 self._process_job(job)
+             except Exception as e:
+                 logger.warning(f"Trace debug write failed for span {job.span_id}: {e}")
+
+     def _process_job(self, job: WriteJob) -> None:  # noqa: PLR0914
+         """Process a span end job - write all span data."""
+         with self._lock:
+             trace = self._traces.get(job.trace_id)
+             if not trace:
+                 logger.warning(f"Trace {job.trace_id} not found for span {job.span_id}")
+                 return
+
+             span_info = trace.spans.get(job.span_id)
+             if not span_info:
+                 logger.warning(f"Span {job.span_id} not found in trace {job.trace_id}")
+                 return
+
+             span_dir = span_info.path
+
+             # Extract input/output from attributes
+             input_content = self._extract_input(job.attributes)
+             output_content = self._extract_output(job.attributes)
+
+             # Get artifact store for this trace
+             artifact_store = self._artifact_stores.get(job.trace_id)
+
+             # Create content writer with artifact store
+             content_writer = ContentWriter(self._config, artifact_store)
+
+             # Write input/output
+             input_ref = content_writer.write(input_content, span_dir, "input")
+             output_ref = content_writer.write(output_content, span_dir, "output")
+
+             # Extract span type and metadata
+             span_type = self._extract_span_type(job.attributes)
+             llm_info = self._extract_llm_info(job.attributes)
+             prefect_info = self._extract_prefect_info(job.attributes)
+
+             # Update span info (span_info already validated above)
+             end_time = datetime.fromtimestamp(job.end_time_ns / 1e9, tz=UTC)
+             span_info.end_time = end_time
+             span_info.duration_ms = int((job.end_time_ns - job.start_time_ns) / 1e6)
+             span_info.status = "failed" if job.status_code == "ERROR" else "completed"
+             span_info.span_type = span_type
+             span_info.llm_info = llm_info
+             span_info.prefect_info = prefect_info
+
+             # Extract description and expected_cost from span attributes
+             span_info.description = job.attributes.get("description")
+             ec = job.attributes.get("expected_cost")
+             span_info.expected_cost = float(ec) if ec is not None else None
+
+             # Update trace stats
+             if llm_info:
+                 trace.llm_call_count += 1
+                 trace.total_tokens += llm_info.get("total_tokens", 0)
+                 trace.total_cost += llm_info.get("cost", 0.0)
+                 llm_expected = llm_info.get("expected_cost")
+                 if llm_expected is not None:
+                     trace.total_expected_cost += float(llm_expected)
+
+             # Build span metadata (input_ref and output_ref are now dicts)
+             span_meta = self._build_span_metadata_v3(job, input_ref, output_ref, span_type, llm_info, prefect_info)
+
+             # Write _span.yaml
+             span_yaml_path = span_dir / "_span.yaml"
+             span_yaml_path.write_text(
+                 yaml.dump(span_meta, default_flow_style=False, allow_unicode=True, sort_keys=False),
+                 encoding="utf-8",
+             )
+
+             # Write events.yaml (OTel span events including log records from the bridge)
+             if job.events:
+                 events_data = self._format_span_events(job.events)
+                 events_path = span_dir / "events.yaml"
+                 events_path.write_text(
+                     yaml.dump(events_data, default_flow_style=False, allow_unicode=True),
+                     encoding="utf-8",
+                 )
+
+             # Update index
+             self._write_index(trace)
+
+             # Finalize trace when ALL spans are completed (not just root)
+             # This handles the case where child span end jobs arrive after root
+             running_spans = [s for s in trace.spans.values() if s.status == "running"]
+             if not running_spans:
+                 self._finalize_trace(trace)
+                 # Remove from memory to prevent memory leak
+                 del self._traces[job.trace_id]
+                 if job.trace_id in self._artifact_stores:
+                     del self._artifact_stores[job.trace_id]
+
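WriteJob is defined in ._types and is not part of this hunk; from the fields accessed above, it is approximately the following sketch (a hypothetical reconstruction, not the actual definition):

    from dataclasses import dataclass, field
    from typing import Any

    @dataclass
    class WriteJob:  # hypothetical; see ai_pipeline_core/observability/_debug/_types.py
        trace_id: str
        span_id: str
        parent_id: str | None
        name: str
        start_time_ns: int
        end_time_ns: int
        status_code: str                      # "OK" or "ERROR"
        status_description: str | None = None
        attributes: dict[str, Any] = field(default_factory=dict)
        events: list[Any] = field(default_factory=list)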
+     @staticmethod
+     def _extract_input(attributes: dict[str, Any]) -> Any:
+         """Extract input from span attributes."""
+         input_str = attributes.get("lmnr.span.input")
+         if input_str:
+             try:
+                 return json.loads(input_str)
+             except (json.JSONDecodeError, TypeError):
+                 return input_str
+         return None
+
+     @staticmethod
+     def _extract_output(attributes: dict[str, Any]) -> Any:
+         """Extract output from span attributes."""
+         output_str = attributes.get("lmnr.span.output")
+         if output_str:
+             try:
+                 return json.loads(output_str)
+             except (json.JSONDecodeError, TypeError):
+                 return output_str
+         return None
+
+     @staticmethod
+     def _extract_span_type(attributes: dict[str, Any]) -> str:
+         """Extract span type from attributes."""
+         span_type = attributes.get("lmnr.span.type", "DEFAULT")
+         # Map to our types
+         type_map = {
+             "LLM": "llm",
+             "TOOL": "tool",
+             "DEFAULT": "default",
+         }
+         return type_map.get(span_type, "default")
+
+     @staticmethod
+     def _extract_llm_info(attributes: dict[str, Any]) -> dict[str, Any] | None:
+         """Extract LLM-specific info from attributes."""
+         # Check for LLM attributes
+         input_tokens = attributes.get("gen_ai.usage.input_tokens") or attributes.get("gen_ai.usage.prompt_tokens")
+         output_tokens = attributes.get("gen_ai.usage.output_tokens") or attributes.get("gen_ai.usage.completion_tokens")
+
+         if input_tokens is None and output_tokens is None:
+             return None
+
+         return {
+             "model": attributes.get("gen_ai.response.model") or attributes.get("gen_ai.request.model"),
+             "provider": attributes.get("gen_ai.system"),
+             "input_tokens": input_tokens or 0,
+             "output_tokens": output_tokens or 0,
+             "total_tokens": (input_tokens or 0) + (output_tokens or 0),
+             "cost": attributes.get("gen_ai.usage.cost", 0.0),
+             "expected_cost": attributes.get("expected_cost"),
+             "purpose": attributes.get("purpose"),
+         }
+
+     @staticmethod
+     def _extract_prefect_info(attributes: dict[str, Any]) -> dict[str, Any] | None:
+         """Extract Prefect-specific info from attributes."""
+         run_id = attributes.get("prefect.run.id")
+         if not run_id:
+             return None
+
+         return {
+             "run_id": run_id,
+             "run_name": attributes.get("prefect.run.name"),
+             "run_type": attributes.get("prefect.run.type"),
+             "tags": attributes.get("prefect.tags", []),
+         }
+
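As a worked example (hypothetical attribute values), these extractors map OpenTelemetry gen_ai.* attributes as follows:

    attrs = {
        "lmnr.span.type": "LLM",
        "gen_ai.request.model": "gpt-4o",
        "gen_ai.system": "openai",
        "gen_ai.usage.input_tokens": 1000,
        "gen_ai.usage.output_tokens": 300,
        "gen_ai.usage.cost": 0.0125,
    }
    assert LocalTraceWriter._extract_span_type(attrs) == "llm"
    info = LocalTraceWriter._extract_llm_info(attrs)
    assert info is not None and info["total_tokens"] == 1300

One quirk worth knowing: token counts are combined with `or`, so an explicit count of 0 in gen_ai.usage.input_tokens falls through to the legacy prompt_tokens key rather than being used directly.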
+     _EXCLUDED_ATTRIBUTES: frozenset[str] = frozenset({"lmnr.span.input", "lmnr.span.output"})
+
+     @staticmethod
+     def _build_span_metadata_v3(  # noqa: PLR0917
+         job: WriteJob,
+         input_ref: dict[str, Any],
+         output_ref: dict[str, Any],
+         span_type: str,
+         llm_info: dict[str, Any] | None,
+         prefect_info: dict[str, Any] | None,
+     ) -> dict[str, Any]:
+         """Build span metadata dictionary (V3 format with dict refs)."""
+         start_time = datetime.fromtimestamp(job.start_time_ns / 1e9, tz=UTC)
+         end_time = datetime.fromtimestamp(job.end_time_ns / 1e9, tz=UTC)
+         duration_ms = int((job.end_time_ns - job.start_time_ns) / 1e6)
+
+         meta: dict[str, Any] = {
+             "span_id": job.span_id,
+             "trace_id": job.trace_id,
+             "parent_id": job.parent_id,
+             "name": job.name,
+             "type": span_type,
+             "timing": {
+                 "start": start_time.isoformat(),
+                 "end": end_time.isoformat(),
+                 "duration_ms": duration_ms,
+             },
+             "status": "failed" if job.status_code == "ERROR" else "completed",
+         }
+
+         # Add type-specific metadata
+         if prefect_info:
+             meta["prefect"] = prefect_info
+
+         if llm_info:
+             meta["llm"] = llm_info
+
+         # Add observability metadata
+         description = job.attributes.get("description")
+         if description:
+             meta["description"] = description
+         expected_cost = job.attributes.get("expected_cost")
+         if expected_cost is not None:
+             meta["expected_cost"] = float(expected_cost)
+
+         # Add content references (input_ref and output_ref are dicts from ContentWriter.write())
+         meta["input"] = input_ref
+         meta["output"] = output_ref
+
+         # Add error info if failed
+         if job.status_code != "OK" and job.status_description:
+             meta["error"] = {
+                 "message": job.status_description,
+             }
+
+         # Add raw span attributes (excluding input/output which are in separate files)
+         filtered_attrs = {k: v for k, v in job.attributes.items() if k not in LocalTraceWriter._EXCLUDED_ATTRIBUTES}
+         if filtered_attrs:
+             meta["attributes"] = filtered_attrs
+
+         return meta
+
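For a successful LLM span, the resulting _span.yaml looks roughly like this (illustrative values; the input/output ref dicts come from ContentWriter.write() in ._content, and only their "type" key is relied on elsewhere in this file):

    span_id: 1a2b3c4d
    trace_id: ab12cd34
    parent_id: 0f9e8d7c
    name: generate_report
    type: llm
    timing:
      start: "2025-01-01T12:00:00+00:00"
      end: "2025-01-01T12:00:05+00:00"
      duration_ms: 5000
    status: completed
    llm:
      model: gpt-4o
      input_tokens: 1000
      output_tokens: 300
      total_tokens: 1300
    input: {type: json}
    output: {type: json}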
+     @staticmethod
+     def _format_span_events(events: list[Any]) -> list[dict[str, Any]]:
+         """Format span events for YAML output."""
+         result: list[dict[str, Any]] = []
+         for event in events:
+             try:
+                 event_dict = {
+                     "name": event.name,
+                     "timestamp": datetime.fromtimestamp(event.timestamp / 1e9, tz=UTC).isoformat(),
+                 }
+                 if event.attributes:
+                     event_dict["attributes"] = dict(event.attributes)
+                 result.append(event_dict)
+             except Exception:
+                 continue
+         return result
+
+     @staticmethod
+     def _write_trace_yaml(trace: TraceState) -> None:
+         """Write _trace.yaml file."""
+         trace_meta = {
+             "trace_id": trace.trace_id,
+             "name": trace.name,
+             "start_time": trace.start_time.isoformat(),
+             "end_time": None,
+             "duration_seconds": None,
+             "status": "running",
+             "correlation": {
+                 "hostname": socket.gethostname(),
+                 "pid": os.getpid(),
+             },
+             "stats": {
+                 "total_spans": len(trace.spans),
+                 "llm_calls": trace.llm_call_count,
+                 "total_tokens": trace.total_tokens,
+                 "total_cost": round(trace.total_cost, 6),
+             },
+         }
+
+         trace_yaml_path = trace.path / "_trace.yaml"
+         trace_yaml_path.write_text(
+             yaml.dump(trace_meta, default_flow_style=False, allow_unicode=True, sort_keys=False),
+             encoding="utf-8",
+         )
+
+     def _write_index(self, trace: TraceState) -> None:
+         """Write split index files: _tree.yaml, _llm_calls.yaml, _errors.yaml."""
+         # Sort spans by execution order
+         sorted_spans = sorted(trace.spans.values(), key=lambda s: s.order)
+
+         # Write lightweight tree index (always)
+         self._write_tree_index(trace, sorted_spans)
+
+         # Write LLM calls index (if enabled)
+         if self._config.include_llm_index:
+             self._write_llm_index(trace, sorted_spans)
+
+         # Write errors index (if enabled)
+         if self._config.include_error_index:
+             self._write_errors_index(trace, sorted_spans)
+
+     @staticmethod
+     def _write_tree_index(trace: TraceState, sorted_spans: list[SpanInfo]) -> None:
+         """Write _tree.yaml - lightweight tree structure (~5KB)."""
+         span_paths: dict[str, str] = {}
+         tree_entries: list[dict[str, Any]] = []
+
+         for span in sorted_spans:
+             # Skip spans that were identified as wrappers during merge
+             if span.span_id in trace.merged_wrapper_ids:
+                 continue
+
+             relative_path = span.path.relative_to(trace.path).as_posix() + "/"
+             span_paths[span.span_id] = relative_path
+
+             # Minimal entry - just hierarchy and navigation
+             entry: dict[str, Any] = {
+                 "span_id": span.span_id,
+                 "name": span.name,
+                 "type": span.span_type,
+                 "status": span.status,
+                 "path": relative_path,
+             }
+
+             # Add parent_id if not root
+             if span.parent_id:
+                 entry["parent_id"] = span.parent_id
+
+             # Add children if any
+             if span.children:
+                 entry["children"] = span.children
+
+             tree_entries.append(entry)
+
+         tree_data: dict[str, Any] = {
+             "format_version": 3,
+             "trace_id": trace.trace_id,
+             "root_span_id": trace.root_span_id,
+             "span_count": len(tree_entries),
+             "span_paths": span_paths,
+             "tree": tree_entries,
+         }
+
+         tree_path = trace.path / "_tree.yaml"
+         tree_path.write_text(
+             yaml.dump(tree_data, default_flow_style=False, allow_unicode=True, sort_keys=False),
+             encoding="utf-8",
+         )
+
+     @staticmethod
+     def _write_llm_index(trace: TraceState, sorted_spans: list[SpanInfo]) -> None:
+         """Write _llm_calls.yaml - LLM-specific details."""
+         llm_calls: list[dict[str, Any]] = []
+
+         for span in sorted_spans:
+             if span.llm_info:
+                 relative_path = span.path.relative_to(trace.path).as_posix() + "/"
+
+                 # Get parent context for better identification
+                 parent_context = ""
+                 if span.parent_id and span.parent_id in trace.spans:
+                     parent_span = trace.spans[span.parent_id]
+                     parent_context = f" (in {parent_span.name})"
+
+                 llm_entry = {
+                     "span_id": span.span_id,
+                     "name": span.name + parent_context,  # Add context to distinguish
+                     "model": span.llm_info.get("model"),
+                     "provider": span.llm_info.get("provider"),
+                     "input_tokens": span.llm_info.get("input_tokens", 0),
+                     "output_tokens": span.llm_info.get("output_tokens", 0),
+                     "total_tokens": span.llm_info.get("total_tokens", 0),
+                     "cost": span.llm_info.get("cost", 0.0),
+                     "expected_cost": span.llm_info.get("expected_cost"),
+                     "purpose": span.llm_info.get("purpose"),
+                     "duration_ms": span.duration_ms,
+                     "status": span.status,
+                     "path": relative_path,
+                 }
+
+                 if span.start_time:
+                     llm_entry["start_time"] = span.start_time.isoformat()
+
+                 llm_calls.append(llm_entry)
+
+         llm_data: dict[str, Any] = {
+             "format_version": 3,
+             "trace_id": trace.trace_id,
+             "llm_call_count": len(llm_calls),
+             "total_tokens": trace.total_tokens,
+             "total_cost": round(trace.total_cost, 6),
+             "total_expected_cost": round(trace.total_expected_cost, 6),
+             "calls": llm_calls,
+         }
+
+         llm_path = trace.path / "_llm_calls.yaml"
+         llm_path.write_text(
+             yaml.dump(llm_data, default_flow_style=False, allow_unicode=True, sort_keys=False),
+             encoding="utf-8",
+         )
+
+     @staticmethod
+     def _write_errors_index(trace: TraceState, sorted_spans: list[SpanInfo]) -> None:
+         """Write _errors.yaml - failed spans only."""
+         error_spans: list[dict[str, Any]] = []
+
+         for span in sorted_spans:
+             if span.status == "failed":
+                 relative_path = span.path.relative_to(trace.path).as_posix() + "/"
+
+                 error_entry: dict[str, Any] = {
+                     "span_id": span.span_id,
+                     "name": span.name,
+                     "type": span.span_type,
+                     "depth": span.depth,
+                     "duration_ms": span.duration_ms,
+                     "path": relative_path,
+                 }
+
+                 if span.start_time:
+                     error_entry["start_time"] = span.start_time.isoformat()
+                 if span.end_time:
+                     error_entry["end_time"] = span.end_time.isoformat()
+
+                 # Get parent chain for context
+                 parent_chain: list[str] = []
+                 current_id = span.parent_id
+                 while current_id and current_id in trace.spans:
+                     parent = trace.spans[current_id]
+                     parent_chain.append(parent.name)
+                     current_id = parent.parent_id
+                 if parent_chain:
+                     error_entry["parent_chain"] = list(reversed(parent_chain))
+
+                 error_spans.append(error_entry)
+
+         if error_spans:  # Only write if there are errors
+             errors_data: dict[str, Any] = {
+                 "format_version": 3,
+                 "trace_id": trace.trace_id,
+                 "error_count": len(error_spans),
+                 "errors": error_spans,
+             }
+
+             errors_path = trace.path / "_errors.yaml"
+             errors_path.write_text(
+                 yaml.dump(errors_data, default_flow_style=False, allow_unicode=True, sort_keys=False),
+                 encoding="utf-8",
+             )
+
+     @staticmethod
+     def _detect_wrapper_spans(trace: TraceState) -> set[str]:
+         """Detect Prefect wrapper spans that should be merged with their inner spans.
+
+         Detection criteria:
+         1. Parent has exactly one child
+         2. Names match after stripping hash suffix (e.g., "task-abc123" matches "task")
+         3. Parent has no I/O (input type is "none")
+         4. Parent has prefect.run.id; the child either has none or shares the same run_id
+         """
+         wrappers: set[str] = set()
+
+         for span_id, span in trace.spans.items():
+             # Must have exactly one child
+             if len(span.children) != 1:
+                 continue
+
+             child_id = span.children[0]
+             child = trace.spans.get(child_id)
+             if not child:
+                 continue
+
+             # Names must match after stripping hash suffix
+             parent_base = re.sub(r"-[a-f0-9]{3,}$", "", span.name)
+             child_base = re.sub(r"-[a-f0-9]{3,}$", "", child.name)
+             if parent_base != child_base:
+                 continue
+
+             # Parent must have no I/O (check _span.yaml)
+             span_yaml = span.path / "_span.yaml"
+             if span_yaml.exists():
+                 try:
+                     span_meta = yaml.safe_load(span_yaml.read_text())
+                     if span_meta.get("input", {}).get("type") != "none":
+                         continue
+                 except Exception:
+                     continue
+
+             # Parent must have prefect info
+             if not span.prefect_info:
+                 continue
+
+             # Child may have prefect_info if it inherited context from Prefect wrapper
+             # Only skip merge if child has DIFFERENT run_id (indicates nested task/flow)
+             if child.prefect_info:
+                 child_run_id = child.prefect_info.get("run_id")
+                 parent_run_id = span.prefect_info.get("run_id")
+                 if child_run_id != parent_run_id:
+                     # Different run IDs = truly nested Prefect task/flow, don't merge
+                     continue
+
+             wrappers.add(span_id)
+
+         return wrappers
+
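For example, in the hypothetical chain below the wrapper satisfies all four criteria and is merged out of the logical tree (its directory stays on disk):

    before:  0001_pipeline
             └── 0002_generate_report-a1b2c3   (Prefect wrapper: run_id R1, input type "none")
                 └── 0003_generate_report      (same run_id R1, real I/O)

    after:   0001_pipeline
             └── 0003_generate_report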
+     def _merge_wrapper_spans(self, trace: TraceState) -> None:
+         """Merge wrapper spans with their inner spans (virtual merge).
+
+         This modifies the span hierarchy so wrappers are skipped in index output.
+         Physical directories remain unchanged - only the logical view changes.
+         """
+         if not self._config.merge_wrapper_spans:
+             return
+
+         wrappers = self._detect_wrapper_spans(trace)
+         if not wrappers:
+             return
+
+         logger.debug(f"Merging {len(wrappers)} wrapper spans in trace {trace.trace_id}")
+
+         # Cache wrapper IDs for use in tree index writing
+         trace.merged_wrapper_ids = wrappers
+
+         # For each wrapper, reparent its child to the wrapper's parent
+         for wrapper_id in wrappers:
+             wrapper = trace.spans[wrapper_id]
+             child_id = wrapper.children[0]
+             child = trace.spans[child_id]
+             grandparent_id = wrapper.parent_id
+
+             # Update child's parent
+             child.parent_id = grandparent_id
+
+             # Update grandparent's children (if grandparent exists)
+             if grandparent_id and grandparent_id in trace.spans:
+                 grandparent = trace.spans[grandparent_id]
+                 # Remove wrapper, add child
+                 if wrapper_id in grandparent.children:
+                     idx = grandparent.children.index(wrapper_id)
+                     grandparent.children[idx] = child_id
+             # Wrapper was root - child becomes new root
+             elif trace.root_span_id == wrapper_id:
+                 trace.root_span_id = child_id
+
+             # Mark wrapper as merged (used in index generation)
+             wrapper.children = []  # Clear to indicate it's merged
+
+     def _finalize_trace(self, trace: TraceState) -> None:
+         """Finalize a trace - update metadata and generate summary."""
+         end_time = datetime.now(UTC)
+         duration = (end_time - trace.start_time).total_seconds()
+
+         # Determine final status
+         failed_spans = [s for s in trace.spans.values() if s.status == "failed"]
+         status = "failed" if failed_spans else "completed"
+
+         # Merge wrapper spans before generating indexes
+         self._merge_wrapper_spans(trace)
+
+         # Update _trace.yaml
+         trace_meta = {
+             "trace_id": trace.trace_id,
+             "name": trace.name,
+             "start_time": trace.start_time.isoformat(),
+             "end_time": end_time.isoformat(),
+             "duration_seconds": round(duration, 2),
+             "status": status,
+             "correlation": {
+                 "hostname": socket.gethostname(),
+                 "pid": os.getpid(),
+             },
+             "stats": {
+                 "total_spans": len(trace.spans),
+                 "llm_calls": trace.llm_call_count,
+                 "total_tokens": trace.total_tokens,
+                 "total_cost": round(trace.total_cost, 6),
+                 "total_expected_cost": round(trace.total_expected_cost, 6),
+             },
+         }
+
+         trace_yaml_path = trace.path / "_trace.yaml"
+         trace_yaml_path.write_text(
+             yaml.dump(trace_meta, default_flow_style=False, allow_unicode=True, sort_keys=False),
+             encoding="utf-8",
+         )
+
+         # Final index update
+         self._write_index(trace)
+
+         # Generate summary if enabled
+         summary: str | None = None
+         if self._config.generate_summary:
+             summary = generate_summary(trace)
+             summary_path = trace.path / "_summary.md"
+             summary_path.write_text(summary, encoding="utf-8")
+
+         # Generate LLM-powered auto-summary if enabled.
+         # asyncio.run() is unsafe when the current thread already has a running event loop.
+         # Skip if static summary is unavailable: auto-summary uses it as context input.
+         has_running_loop = False
+         try:
+             asyncio.get_running_loop()
+             has_running_loop = True
+         except RuntimeError:
+             pass
+         if self._config.auto_summary_enabled and not has_running_loop and summary is not None:
+             try:
+                 auto_mod = importlib.import_module("ai_pipeline_core.observability._debug._auto_summary")
+                 auto_summary_text = asyncio.run(auto_mod.generate_auto_summary(trace, summary, self._config.auto_summary_model))
+                 if auto_summary_text:
+                     auto_summary_path = trace.path / "_auto_summary.md"
+                     auto_summary_path.write_text(auto_summary_text, encoding="utf-8")
+             except Exception as e:
+                 logger.warning(f"Auto-summary generation failed: {e}")
+
+     def _cleanup_old_traces(self) -> None:
+         """Delete old traces beyond max_traces limit."""
+         if self._config.max_traces is None:
+             return
+
+         # Get all trace directories sorted by modification time
+         trace_dirs = [(path.stat().st_mtime, path) for path in self._config.path.iterdir() if path.is_dir() and (path / "_trace.yaml").exists()]
+
+         trace_dirs.sort(reverse=True)  # Newest first
+
+         # Delete excess traces
+         for _, path in trace_dirs[self._config.max_traces :]:
+             try:
+                 shutil.rmtree(path)
+             except Exception as e:
+                 logger.warning(f"Failed to delete old trace {path}: {e}")
+
+     @staticmethod
+     def _sanitize_name(name: str) -> str:
+         """Sanitize name for safe filesystem use.
+
+         Truncates to 24 chars + 4-char hash to avoid collisions and keep
+         paths manageable with deep nesting.
+         """
+         safe = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", name)
+         safe = safe.strip(". ")
+
+         # Handle Windows reserved names (CON, PRN, AUX, NUL, COM1-9, LPT1-9)
+         reserved = {"CON", "PRN", "AUX", "NUL"} | {f"COM{i}" for i in range(1, 10)} | {f"LPT{i}" for i in range(1, 10)}
+         if safe.upper() in reserved:
+             safe = f"_{safe}"
+
+         # Truncate with hash suffix to avoid collisions
+         if len(safe) > 28:
+             name_hash = hashlib.md5(name.encode()).hexdigest()[:4]
+             safe = f"{safe[:24]}_{name_hash}"
+
+         return safe or "span"
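Illustrative _sanitize_name behavior (the 4-hex-digit hash suffix depends on the full original name):

    LocalTraceWriter._sanitize_name("fetch/data:v2")  # -> "fetch_data_v2"
    LocalTraceWriter._sanitize_name("CON")            # -> "_CON"
    LocalTraceWriter._sanitize_name("a" * 40)         # -> "a" * 24 + "_" + 4 hex chars
    LocalTraceWriter._sanitize_name("...")            # -> "span" (stripped to empty)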