ai-pipeline-core 0.3.0__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,705 @@
+ """Content writing and extraction for trace debugging V3.
+
+ Uses hash-based artifact storage with automatic deduplication.
+ """
+
+ import base64
+ import hashlib
+ import json
+ import re
+ from datetime import datetime
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any
+ from uuid import UUID
+
+ import yaml
+ from pydantic import BaseModel, ConfigDict, SecretStr
+
+ from .config import TraceDebugConfig
+
+
+ class ContentRef(BaseModel):
+     """Reference to content in artifact store."""
+
+     hash: str  # "sha256:abcdef..."
+     path: str  # "artifacts/sha256/ab/cd/abcdef...1234.txt"
+     size_bytes: int
+     mime_type: str | None = None
+     encoding: str | None = None  # "utf-8" | "binary"
+
+     model_config = ConfigDict(frozen=True)
+
+
+ class ArtifactStore:
+     """Hash-based artifact storage with automatic deduplication.
+
+     Stores large content elements in artifacts/sha256/<first2>/<next2>/<hash>.<ext>
+     Identical content automatically deduplicates (same hash = same file).
+     """
+
+     def __init__(self, trace_path: Path):
+         """Initialize artifact store for given trace path."""
+         self._artifacts_path = trace_path / "artifacts" / "sha256"
+         self._artifacts_path.mkdir(parents=True, exist_ok=True)
+         self._known_hashes: dict[str, ContentRef] = {}
+         self._trace_path = trace_path
+
+     def store_text(self, text: str, mime_type: str = "text/plain") -> ContentRef:
+         """Store text content, return reference."""
+         data = text.encode("utf-8")
+         content_hash = hashlib.sha256(data).hexdigest()
+
+         if content_hash in self._known_hashes:
+             return self._known_hashes[content_hash]
+
+         # Create sharded path: ab/cd/abcdef...1234.txt
+         file_path = (
+             self._artifacts_path / content_hash[:2] / content_hash[2:4] / f"{content_hash}.txt"
+         )
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         if not file_path.exists():
+             file_path.write_bytes(data)
+
+         ref = ContentRef(
+             hash=f"sha256:{content_hash}",
+             path=str(file_path.relative_to(self._trace_path)),
+             size_bytes=len(data),
+             mime_type=mime_type,
+             encoding="utf-8",
+         )
+
+         self._known_hashes[content_hash] = ref
+         return ref
+
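
A minimal usage sketch of the sharded layout and deduplication (the trace directory is hypothetical):

    store = ArtifactStore(Path("/tmp/trace"))
    a = store.store_text("hello world")
    b = store.store_text("hello world")  # same hash: cached ref is returned, nothing is rewritten
    assert a is b
    # a.path == "artifacts/sha256/b9/4d/b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9.txt"
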
+     def store_binary(self, data: bytes, mime_type: str = "application/octet-stream") -> ContentRef:
+         """Store binary content, return reference."""
+         content_hash = hashlib.sha256(data).hexdigest()
+
+         if content_hash in self._known_hashes:
+             return self._known_hashes[content_hash]
+
+         # Determine extension from mime type
+         ext_map = {
+             "image/png": ".png",
+             "image/jpeg": ".jpg",
+             "image/gif": ".gif",
+             "application/pdf": ".pdf",
+         }
+         ext = ext_map.get(mime_type, ".bin")
+
+         file_path = (
+             self._artifacts_path / content_hash[:2] / content_hash[2:4] / f"{content_hash}{ext}"
+         )
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         if not file_path.exists():
+             file_path.write_bytes(data)
+
+         ref = ContentRef(
+             hash=f"sha256:{content_hash}",
+             path=str(file_path.relative_to(self._trace_path)),
+             size_bytes=len(data),
+             mime_type=mime_type,
+             encoding="binary",
+         )
+
+         self._known_hashes[content_hash] = ref
+         return ref
+
+     def get_stats(self) -> dict[str, int | float]:
+         """Get deduplication statistics."""
+         # Walk the store once and count only regular files, so directories
+         # with dots in their names are not miscounted as artifacts.
+         files = [f for f in self._artifacts_path.rglob("*.*") if f.is_file()]
+         total_files = len(files)
+         total_size = sum(f.stat().st_size for f in files)
+         total_refs = len(self._known_hashes)
+
+         return {
+             "unique_artifacts": total_files,
+             "total_references": total_refs,
+             "total_bytes": total_size,
+             "dedup_ratio": total_refs / total_files if total_files > 0 else 1.0,
+         }
+
+
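
Continuing the sketch above, get_stats() would report something like the following; note that total_references counts hashes seen by this process, while unique_artifacts counts files on disk, which may include artifacts left by earlier runs:

    store.get_stats()
    # {"unique_artifacts": 1, "total_references": 1, "total_bytes": 11, "dedup_ratio": 1.0}
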
+ class ContentWriter:
+     """Writes content as input.yaml / output.yaml with artifact externalization."""
+
+     def __init__(self, config: TraceDebugConfig, artifact_store: ArtifactStore | None = None):
+         """Initialize content writer with config and optional artifact store."""
+         self._config = config
+         self._compiled_patterns = [re.compile(p) for p in config.redact_patterns]
+         self._artifact_store = artifact_store
+
+     def write(self, content: Any, span_dir: Path, name: str) -> dict[str, Any]:
+         """Write content as {name}.yaml with artifact externalization.
+
+         Args:
+             content: Raw content (LLM messages, documents, dicts, etc.)
+             span_dir: Span directory
+             name: "input" or "output"
+
+         Returns:
+             Metadata dict with type, path, size_bytes, breakdown
+         """
+         if content is None:
+             return {"type": "none", "size_bytes": 0}
+
+         # Structure content (recursive processing with externalization)
+         structured = self._structure_content(content)
+
+         # Serialize to YAML
+         serialized = yaml.dump(
+             structured,
+             default_flow_style=False,
+             allow_unicode=True,
+             sort_keys=False,
+         )
+         serialized = self._redact(serialized)
+         size = len(serialized.encode("utf-8"))
+
+         # Check file size limit
+         if size > self._config.max_file_bytes:
+             # Reduce preview sizes to fit under limit
+             structured = self._reduce_previews(structured)
+             serialized = yaml.dump(
+                 structured, default_flow_style=False, allow_unicode=True, sort_keys=False
+             )
+             serialized = self._redact(serialized)
+             size = len(serialized.encode("utf-8"))
+
+             # If still over, truncate with warning
+             if size > self._config.max_file_bytes:
+                 max_bytes = self._config.max_file_bytes
+                 # Truncate by bytes, not characters, so multi-byte UTF-8 text
+                 # cannot push the file past the limit; drop any split character.
+                 serialized = serialized.encode("utf-8")[:max_bytes].decode(
+                     "utf-8", errors="ignore"
+                 )
+                 serialized += (
+                     f"\n\n# [TRUNCATED: original {size} bytes exceeded {max_bytes} limit]\n"
+                 )
+                 size = len(serialized.encode("utf-8"))
+
+         # Write file
+         file_path = span_dir / f"{name}.yaml"
+         file_path.write_text(serialized, encoding="utf-8")
+
+         return {
+             "type": "file",
+             "path": f"{name}.yaml",
+             "size_bytes": size,
+             "breakdown": self._extract_breakdown(structured),
+         }
+
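
A hedged sketch of calling write(); the config values and span directory are assumptions:

    writer = ContentWriter(config, artifact_store=store)
    meta = writer.write([{"role": "user", "content": "hi"}], span_dir, "input")
    # meta == {"type": "file", "path": "input.yaml", "size_bytes": ...,
    #          "breakdown": {"text_bytes": 2, "image_bytes": 0, "tool_bytes": 0}}
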
+     def _structure_content(self, content: Any) -> dict[str, Any]:
+         """Convert raw content to structured YAML-ready format."""
+         if self._is_llm_messages(content):
+             return self._structure_llm_messages(content)
+         elif self._is_document_list(content):
+             return self._structure_documents(content)
+         else:
+             return self._structure_generic(content)
+
+     def _is_llm_messages(self, content: Any) -> bool:
+         """Check if content looks like LLM messages."""
+         if not isinstance(content, list):
+             return False
+         if not content:
+             return False
+         first = content[0]
+         if not isinstance(first, dict):
+             return False
+         return "role" in first and "content" in first
+
+     def _is_document_list(self, content: Any) -> bool:
+         """Check if content looks like a DocumentList."""
+         if not isinstance(content, list):
+             return False
+         if not content:
+             return False
+         first = content[0]
+         if not isinstance(first, dict):
+             return False
+         return "base_type" in first and "content" in first
+
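
Both checks classify a list by its first element only, so mixed lists follow the head; a sketch using the private helpers directly:

    writer._is_llm_messages([{"role": "user", "content": "hi"}])       # True
    writer._is_document_list([{"base_type": "text", "content": "x"}])  # True
    writer._is_llm_messages([])                                        # False: empty lists fall through to generic
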
+     def _structure_llm_messages(self, messages: list[Any]) -> dict[str, Any]:
+         """Structure LLM messages preserving ALL parts losslessly."""
+         message_entries: list[dict[str, Any]] = []
+
+         total_text_bytes = 0
+         total_image_bytes = 0
+         total_tool_bytes = 0
+
+         for i, msg in enumerate(messages):
+             role = msg.get("role", "unknown")
+             content = msg.get("content")
+
+             msg_entry: dict[str, Any] = {
+                 "index": i,
+                 "role": role,
+             }
+
+             if isinstance(content, list):
+                 # Multimodal: preserve each part separately
+                 msg_entry["parts"] = []
+                 for j, part in enumerate(content):
+                     structured_part, part_bytes = self._structure_message_part(part, j)
+                     msg_entry["parts"].append(structured_part)
+                     part_type = structured_part.get("type", "")
+                     if part_type == "text":
+                         total_text_bytes += part_bytes
+                     elif part_type == "image":
+                         total_image_bytes += part_bytes
+                     elif part_type in ("tool_use", "tool_result"):
+                         total_tool_bytes += part_bytes
+             elif isinstance(content, str):
+                 # Simple text message
+                 text_entry = self._structure_text_element(content, 0)
+                 msg_entry["parts"] = [text_entry]
+                 total_text_bytes += text_entry.get("size_bytes", 0)
+             elif content is None:
+                 msg_entry["parts"] = []
+             else:
+                 msg_entry["parts"] = [{"type": "unknown", "sequence": 0, "raw": str(content)}]
+
+             # Preserve tool_calls at message level (OpenAI format)
+             if "tool_calls" in msg:
+                 msg_entry["tool_calls"] = self._convert_types(msg["tool_calls"])
+             if "function_call" in msg:
+                 msg_entry["function_call"] = self._convert_types(msg["function_call"])
+             if "tool_call_id" in msg:
+                 msg_entry["tool_call_id"] = msg["tool_call_id"]
+             if "name" in msg:
+                 msg_entry["name"] = msg["name"]
+
+             message_entries.append(msg_entry)
+
+         return {
+             "format_version": 3,
+             "type": "llm_messages",
+             "message_count": len(messages),
+             "messages": message_entries,
+             "metadata": {
+                 "total_text_bytes": total_text_bytes,
+                 "total_image_bytes": total_image_bytes,
+                 "total_tool_bytes": total_tool_bytes,
+             },
+             "size_bytes": total_text_bytes + total_image_bytes + total_tool_bytes,
+         }
+
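
For a single plain-text message, the returned structure has roughly this shape (a sketch, sizes illustrative):

    {
        "format_version": 3,
        "type": "llm_messages",
        "message_count": 1,
        "messages": [
            {
                "index": 0,
                "role": "user",
                "parts": [{"type": "text", "sequence": 0, "size_bytes": 2, "content": "hi"}],
            }
        ],
        "metadata": {"total_text_bytes": 2, "total_image_bytes": 0, "total_tool_bytes": 0},
        "size_bytes": 2,
    }
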
+     def _structure_message_part(
+         self, part: dict[str, Any], sequence: int
+     ) -> tuple[dict[str, Any], int]:
+         """Structure a single message part losslessly.
+
+         Returns:
+             Tuple of (structured_dict, size_bytes)
+         """
+         part_type = part.get("type", "")
+
+         if part_type == "text":
+             entry = self._structure_text_element(part.get("text", ""), sequence)
+             return entry, entry.get("size_bytes", 0)
+         elif part_type == "image_url":
+             entry = self._structure_image_openai(part, sequence)
+             return entry, entry.get("size_bytes", 0)
+         elif part_type == "image":
+             entry = self._structure_image_anthropic(part, sequence)
+             return entry, entry.get("size_bytes", 0)
+         elif part_type == "tool_use":
+             input_str = json.dumps(part.get("input", {}))
+             size = len(input_str.encode("utf-8"))
+             return {
+                 "type": "tool_use",
+                 "sequence": sequence,
+                 "id": part.get("id"),
+                 "name": part.get("name"),
+                 "input": self._convert_types(part.get("input")),
+             }, size
+         elif part_type == "tool_result":
+             result_content = part.get("content")
+             entry: dict[str, Any] = {
+                 "type": "tool_result",
+                 "sequence": sequence,
+                 "tool_use_id": part.get("tool_use_id"),
+                 "is_error": part.get("is_error", False),
+             }
+             size = 0
+             if isinstance(result_content, str):
+                 text_entry = self._structure_text_element(result_content, 0)
+                 entry["content"] = text_entry
+                 size = text_entry.get("size_bytes", 0)
+             elif isinstance(result_content, list):
+                 entry["content"] = []
+                 for k, p in enumerate(result_content):
+                     part_entry, part_size = self._structure_message_part(p, k)
+                     entry["content"].append(part_entry)
+                     size += part_size
+             else:
+                 entry["content"] = self._convert_types(result_content)
+             return entry, size
+         else:
+             # Unknown type - preserve raw data, never drop
+             raw = self._convert_types(part)
+             raw_str = json.dumps(raw)
+             size = len(raw_str.encode("utf-8"))
+             return {
+                 "type": "unknown",
+                 "sequence": sequence,
+                 "original_type": part_type,
+                 "raw_data": raw,
+             }, size
+
+     def _structure_text_element(self, text: str, sequence: int) -> dict[str, Any]:
+         """Structure a text element, optionally externalizing large content."""
+         text = self._redact(text)
+         text_bytes = len(text.encode("utf-8"))
+
+         entry: dict[str, Any] = {
+             "type": "text",
+             "sequence": sequence,
+             "size_bytes": text_bytes,
+         }
+
+         if text_bytes > self._config.max_element_bytes:
+             # Store full content in artifact store
+             if self._artifact_store:
+                 ref = self._artifact_store.store_text(text, "text/plain")
+                 excerpt_len = self._config.element_excerpt_bytes
+                 entry["content_ref"] = {
+                     "hash": ref.hash,
+                     "path": ref.path,
+                     "mime_type": ref.mime_type,
+                     "encoding": ref.encoding,
+                 }
+                 entry["excerpt"] = (
+                     text[:excerpt_len] + "\n[TRUNCATED - see artifact for full content]"
+                 )
+             else:
+                 # No artifact store - truncate with marker
+                 entry["content"] = text[: self._config.max_element_bytes]
+                 entry["truncated"] = True
+                 entry["original_size_bytes"] = text_bytes
+         else:
+             entry["content"] = text
+
+         return entry
+
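
The externalization threshold in action, assuming max_element_bytes=100 and element_excerpt_bytes=50 (both hypothetical config values):

    writer._structure_text_element("short", 0)
    # {"type": "text", "sequence": 0, "size_bytes": 5, "content": "short"}
    writer._structure_text_element("x" * 10_000, 0)
    # keys: type, sequence, size_bytes, content_ref {hash, path, mime_type, encoding}, excerpt
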
+     def _structure_image_openai(self, part: dict[str, Any], sequence: int) -> dict[str, Any]:
+         """Structure OpenAI format image part."""
+         url = part.get("image_url", {}).get("url", "")
+         detail = part.get("image_url", {}).get("detail", "auto")
+
+         if not url.startswith("data:image/"):
+             return {
+                 "type": "image_url",
+                 "sequence": sequence,
+                 "url": url,
+                 "detail": detail,
+                 "size_bytes": 0,
+             }
+
+         match = re.match(r"data:image/(\w+);base64,(.+)", url)
+         if not match:
+             return {
+                 "type": "image_parse_error",
+                 "sequence": sequence,
+                 "url_preview": url[:100],
+                 "size_bytes": 0,
+             }
+
+         ext, b64_data = match.groups()
+         estimated_size = len(b64_data) * 3 // 4
+         content_hash = hashlib.sha256(b64_data.encode()).hexdigest()
+
+         entry: dict[str, Any] = {
+             "type": "image",
+             "sequence": sequence,
+             "format": ext,
+             "size_bytes": estimated_size,
+             "hash": content_hash[:16],
+             "detail": detail,
+         }
+
+         # Extract if configured
+         if self._config.extract_base64_images and self._artifact_store:
+             try:
+                 image_bytes = base64.b64decode(b64_data)
+                 ref = self._artifact_store.store_binary(image_bytes, f"image/{ext}")
+                 entry["content_ref"] = {
+                     "hash": ref.hash,
+                     "path": ref.path,
+                     "mime_type": ref.mime_type,
+                     "encoding": ref.encoding,
+                 }
+                 entry["preview"] = f"[{ext.upper()} image, {estimated_size} bytes]"
+                 entry["extracted"] = True
+             except Exception as e:
+                 entry["extract_error"] = str(e)
+                 entry["extracted"] = False
+         else:
+             entry["extracted"] = False
+
+         return entry
+
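
The data-URL parsing step in isolation; the 3/4 factor is the usual base64 overhead estimate and ignores padding, so it can overshoot slightly:

    url = "data:image/png;base64,iVBORw0KGgo="
    ext, b64_data = re.match(r"data:image/(\w+);base64,(.+)", url).groups()
    # ext == "png"; len(b64_data) * 3 // 4 == 9, actual decoded size is 8 bytes
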
442
+ def _structure_image_anthropic(self, part: dict[str, Any], sequence: int) -> dict[str, Any]:
443
+ """Structure Anthropic format image part."""
444
+ source = part.get("source", {})
445
+ media_type = source.get("media_type", "image/png")
446
+ ext = media_type.split("/")[-1] if "/" in media_type else "png"
447
+
448
+ if source.get("type") != "base64":
449
+ return {
450
+ "type": "image",
451
+ "sequence": sequence,
452
+ "source_type": source.get("type"),
453
+ "format": ext,
454
+ "size_bytes": 0,
455
+ }
456
+
457
+ b64_data = source.get("data", "")
458
+ estimated_size = len(b64_data) * 3 // 4 if b64_data else 0
459
+ content_hash = hashlib.sha256(b64_data.encode()).hexdigest() if b64_data else "empty"
460
+
461
+ entry: dict[str, Any] = {
462
+ "type": "image",
463
+ "sequence": sequence,
464
+ "format": ext,
465
+ "size_bytes": estimated_size,
466
+ "hash": content_hash[:16],
467
+ }
468
+
469
+ if self._config.extract_base64_images and self._artifact_store and b64_data:
470
+ try:
471
+ image_bytes = base64.b64decode(b64_data)
472
+ ref = self._artifact_store.store_binary(image_bytes, media_type)
473
+ entry["content_ref"] = {
474
+ "hash": ref.hash,
475
+ "path": ref.path,
476
+ "mime_type": ref.mime_type,
477
+ "encoding": ref.encoding,
478
+ }
479
+ entry["preview"] = f"[{ext.upper()} image, {estimated_size} bytes]"
480
+ entry["extracted"] = True
481
+ except Exception as e:
482
+ entry["extract_error"] = str(e)
483
+ entry["extracted"] = False
484
+ else:
485
+ entry["extracted"] = False
486
+
487
+ return entry
488
+
489
+ def _structure_documents(self, docs: list[Any]) -> dict[str, Any]:
490
+ """Structure document list."""
491
+ doc_entries: list[dict[str, Any]] = []
492
+
493
+ for i, doc in enumerate(docs):
494
+ doc_name = doc.get("name", f"doc_{i}")
495
+ base_type = doc.get("base_type", "unknown")
496
+ content = doc.get("content", "")
497
+ content_encoding = doc.get("content_encoding", "utf-8")
498
+
499
+ doc_entry: dict[str, Any] = {
500
+ "index": i,
501
+ "name": doc_name,
502
+ "base_type": base_type,
503
+ }
504
+
505
+ if content_encoding == "base64":
506
+ # Binary content
507
+ try:
508
+ binary_data = base64.b64decode(content)
509
+ size = len(binary_data)
510
+ doc_entry["size_bytes"] = size
511
+ doc_entry["encoding"] = "base64"
512
+
513
+ if size > self._config.max_element_bytes and self._artifact_store:
514
+ # Externalize binary
515
+ mime_type = doc.get("mime_type", "application/octet-stream")
516
+ ref = self._artifact_store.store_binary(binary_data, mime_type)
517
+ doc_entry["content_ref"] = {
518
+ "hash": ref.hash,
519
+ "path": ref.path,
520
+ "mime_type": ref.mime_type,
521
+ "encoding": ref.encoding,
522
+ }
523
+ doc_entry["preview"] = f"[Binary content, {size} bytes]"
524
+ else:
525
+ doc_entry["content"] = content # Keep base64 inline
526
+ except Exception:
527
+ doc_entry["content"] = "[binary content - decode failed]"
528
+ doc_entry["size_bytes"] = 0
529
+ else:
530
+ # Text content
531
+ text = self._redact(str(content))
532
+ text_bytes = len(text.encode("utf-8"))
533
+ doc_entry["size_bytes"] = text_bytes
534
+
535
+ if text_bytes > self._config.max_element_bytes and self._artifact_store:
536
+ ref = self._artifact_store.store_text(text)
537
+ excerpt_len = self._config.element_excerpt_bytes
538
+ doc_entry["content_ref"] = {
539
+ "hash": ref.hash,
540
+ "path": ref.path,
541
+ "mime_type": ref.mime_type,
542
+ "encoding": ref.encoding,
543
+ }
544
+ doc_entry["excerpt"] = (
545
+ text[:excerpt_len] + "\n[TRUNCATED - see artifact for full content]"
546
+ )
547
+ else:
548
+ doc_entry["content"] = text
549
+
550
+ doc_entries.append(doc_entry)
551
+
552
+ return {
553
+ "format_version": 3,
554
+ "type": "document_list",
555
+ "document_count": len(docs),
556
+ "documents": doc_entries,
557
+ }
558
+
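
The input shape this method expects, sketched with two hypothetical documents (text inline, binary marked via content_encoding):

    docs = [
        {"name": "notes.txt", "base_type": "text", "content": "plain text body"},
        {"name": "blob.bin", "base_type": "binary", "content": "aGVsbG8=",
         "content_encoding": "base64", "mime_type": "application/octet-stream"},
    ]
    writer._structure_documents(docs)
    # {"format_version": 3, "type": "document_list", "document_count": 2, "documents": [...]}
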
+     def _structure_generic(self, content: Any) -> dict[str, Any]:
+         """Structure generic content."""
+         converted = self._convert_types(content)
+         serialized = json.dumps(converted)
+         size = len(serialized.encode("utf-8"))
+
+         return {
+             "format_version": 3,
+             "type": "generic",
+             "size_bytes": size,
+             "content": converted,
+         }
+
+     def _extract_breakdown(self, structured: dict[str, Any]) -> dict[str, int]:
+         """Extract size breakdown from already-structured content.
+
+         Uses metadata computed during structuring (which has access to full
+         image data) rather than recalculating from LMNR attributes (where
+         base64 image data is stripped).
+         """
+         if structured.get("type") == "llm_messages":
+             metadata = structured.get("metadata", {})
+             return {
+                 "text_bytes": metadata.get("total_text_bytes", 0),
+                 "image_bytes": metadata.get("total_image_bytes", 0),
+                 "tool_bytes": metadata.get("total_tool_bytes", 0),
+             }
+         elif "size_bytes" in structured:
+             return {"total_bytes": structured["size_bytes"]}
+         else:
+             serialized = json.dumps(self._convert_types(structured))
+             return {"total_bytes": len(serialized.encode("utf-8"))}
+
+     def _reduce_previews(self, structured: dict[str, Any]) -> dict[str, Any]:
+         """Reduce preview/excerpt sizes to fit file under max_file_bytes."""
+         if structured.get("type") == "llm_messages":
+             # Reduce excerpt sizes in messages
+             for msg in structured.get("messages", []):
+                 for part in msg.get("parts", []):
+                     if "excerpt" in part:
+                         # Cut each excerpt down to 500 characters
+                         part["excerpt"] = part["excerpt"][:500] + "\n[TRUNCATED]"
+         return structured
+
+     def _redact(self, text: str) -> str:
+         """Apply redaction patterns to text."""
+         for pattern in self._compiled_patterns:
+             text = pattern.sub("[REDACTED]", text)
+         return text
+
+     def _convert_types(self, value: Any, seen: set[int] | None = None) -> Any:
+         """Convert non-serializable types recursively with cycle detection."""
+         # Cycle detection
+         if seen is None:
+             seen = set()
+
+         obj_id = id(value)
+         if obj_id in seen:
+             return "[circular reference]"
+
+         match value:
+             case None | bool() | int() | float() | str():
+                 return value
+             case SecretStr():
+                 return "[REDACTED:SecretStr]"
+             case bytes():
+                 if len(value) < 100:
+                     return f"[bytes: {len(value)} bytes, preview: {value[:50].hex()}...]"
+                 return f"[bytes: {len(value)} bytes]"
+             case Path():
+                 return str(value)
+             case UUID():
+                 return str(value)
+             case datetime():
+                 return value.isoformat()
+             case Enum():
+                 return value.value
+             case set() | frozenset():
+                 return sorted(str(x) for x in value)
+             case BaseModel():
+                 try:
+                     return value.model_dump(mode="json")
+                 except Exception:
+                     return str(value)
+             case dict():
+                 seen.add(obj_id)
+                 result = {str(k): self._convert_types(v, seen) for k, v in value.items()}
+                 seen.discard(obj_id)
+                 return result
+             case list() | tuple():
+                 seen.add(obj_id)
+                 result = [self._convert_types(x, seen) for x in value]
+                 seen.discard(obj_id)
+                 return result
+             case _:
+                 # Try str() as fallback
+                 try:
+                     return str(value)
+                 except Exception:
+                     return f"<{type(value).__name__}>"
+
+
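
The conversion on a mixed payload, chosen to hit several match arms (values arbitrary):

    payload = {
        "when": datetime(2025, 1, 1),
        "id": UUID("12345678-1234-5678-1234-567812345678"),
        "tags": {"b", "a"},
        "secret": SecretStr("token"),
    }
    writer._convert_types(payload)
    # {"when": "2025-01-01T00:00:00",
    #  "id": "12345678-1234-5678-1234-567812345678",
    #  "tags": ["a", "b"],
    #  "secret": "[REDACTED:SecretStr]"}
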
+ def reconstruct_span_content(trace_root: Path, span_dir: Path, content_type: str) -> dict[str, Any]:
+     """Reconstruct full content from input.yaml/output.yaml + artifacts.
+
+     Args:
+         trace_root: Trace root directory
+         span_dir: Span directory containing input.yaml or output.yaml
+         content_type: "input" or "output"
+
+     Returns:
+         Complete reconstructed content with all artifact refs resolved
+     """
+     content_path = span_dir / f"{content_type}.yaml"
+     if not content_path.exists():
+         return {}
+
+     content = yaml.safe_load(content_path.read_text(encoding="utf-8"))
+     return _rehydrate(content, trace_root)
+
+
+ def _rehydrate(obj: Any, trace_root: Path) -> Any:
+     """Recursively replace content_ref entries with actual content."""
+     if isinstance(obj, dict):
+         if "content_ref" in obj:
+             # This is an artifact reference - load the full content
+             ref = obj["content_ref"]
+             artifact_path = trace_root / ref["path"]
+
+             if ref.get("encoding") == "utf-8":
+                 full_content = artifact_path.read_text(encoding="utf-8")
+             else:
+                 full_content = artifact_path.read_bytes()
+
+             # Replace ref with full content
+             obj = obj.copy()
+             obj["content"] = full_content
+             del obj["content_ref"]
+             if "excerpt" in obj:
+                 del obj["excerpt"]
+
+         return {k: _rehydrate(v, trace_root) for k, v in obj.items()}
+
+     elif isinstance(obj, list):
+         return [_rehydrate(v, trace_root) for v in obj]
+
+     return obj
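
A round-trip sketch tying writer and reader together (paths hypothetical): content written with write() above can be fully restored, with every content_ref node replaced by the artifact's text or bytes and its excerpt dropped:

    trace_root = Path("/tmp/trace")
    full = reconstruct_span_content(trace_root, trace_root / "spans" / "span_0", "input")
    # full mirrors input.yaml, but externalized elements carry their complete content again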