ai-pipeline-core 0.3.0__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,706 @@
+ """Content writing and extraction for trace debugging V3.
+
+ Uses hash-based artifact storage with automatic deduplication.
+ """
+
+ import base64
+ import hashlib
+ import json
+ import re
+ from datetime import datetime
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any
+ from uuid import UUID
+
+ import yaml
+ from pydantic import BaseModel, ConfigDict, SecretStr
+
+ from .config import TraceDebugConfig
+
+
+ class ContentRef(BaseModel):
+     """Reference to content in artifact store."""
+
+     hash: str  # "sha256:abcdef..."
+     path: str  # "artifacts/sha256/ab/cd/abcdef...1234.txt"
+     size_bytes: int
+     mime_type: str | None = None
+     encoding: str | None = None  # "utf-8" | "binary"
+
+     model_config = ConfigDict(frozen=True)
+
+
+ class ArtifactStore:
+     """Hash-based artifact storage with automatic deduplication.
+
+     Stores large content elements in artifacts/sha256/<first2>/<next2>/<hash>.<ext>
+     Identical content automatically deduplicates (same hash = same file).
+     """
+
+     def __init__(self, trace_path: Path):
+         """Initialize artifact store for given trace path."""
+         self._artifacts_path = trace_path / "artifacts" / "sha256"
+         self._artifacts_path.mkdir(parents=True, exist_ok=True)
+         self._known_hashes: dict[str, ContentRef] = {}
+         self._trace_path = trace_path
+
+     def store_text(self, text: str, mime_type: str = "text/plain") -> ContentRef:
+         """Store text content, return reference."""
+         data = text.encode("utf-8")
+         content_hash = hashlib.sha256(data).hexdigest()
+
+         if content_hash in self._known_hashes:
+             return self._known_hashes[content_hash]
+
+         # Create sharded path: ab/cd/abcdef...1234.txt
+         file_path = (
+             self._artifacts_path / content_hash[:2] / content_hash[2:4] / f"{content_hash}.txt"
+         )
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         if not file_path.exists():
+             file_path.write_bytes(data)
+
+         ref = ContentRef(
+             hash=f"sha256:{content_hash}",
+             path=str(file_path.relative_to(self._trace_path)),
+             size_bytes=len(data),
+             mime_type=mime_type,
+             encoding="utf-8",
+         )
+
+         self._known_hashes[content_hash] = ref
+         return ref
+
+     def store_binary(self, data: bytes, mime_type: str = "application/octet-stream") -> ContentRef:
+         """Store binary content, return reference."""
+         content_hash = hashlib.sha256(data).hexdigest()
+
+         if content_hash in self._known_hashes:
+             return self._known_hashes[content_hash]
+
+         # Determine extension from mime type
+         ext_map = {
+             "image/png": ".png",
+             "image/jpeg": ".jpg",
+             "image/gif": ".gif",
+             "image/webp": ".webp",
+             "application/pdf": ".pdf",
+         }
+         ext = ext_map.get(mime_type, ".bin")
+
+         file_path = (
+             self._artifacts_path / content_hash[:2] / content_hash[2:4] / f"{content_hash}{ext}"
+         )
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         if not file_path.exists():
+             file_path.write_bytes(data)
+
+         ref = ContentRef(
+             hash=f"sha256:{content_hash}",
+             path=str(file_path.relative_to(self._trace_path)),
+             size_bytes=len(data),
+             mime_type=mime_type,
+             encoding="binary",
+         )
+
+         self._known_hashes[content_hash] = ref
+         return ref
+
+     def get_stats(self) -> dict[str, int | float]:
+         """Get deduplication statistics."""
+         total_files = len(list(self._artifacts_path.rglob("*.*")))
+         total_size = sum(f.stat().st_size for f in self._artifacts_path.rglob("*.*") if f.is_file())
+         total_refs = len(self._known_hashes)
+
+         return {
+             "unique_artifacts": total_files,
+             "total_references": total_refs,
+             "total_bytes": total_size,
+             "dedup_ratio": total_refs / total_files if total_files > 0 else 1.0,
+         }
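+
+     # A minimal usage sketch, not part of the released module; the trace path
+     # is an assumed scratch directory. Storing identical text twice yields one
+     # sharded file and the same frozen ContentRef:
+     #
+     #     store = ArtifactStore(Path("/tmp/trace"))
+     #     ref1 = store.store_text("hello world")
+     #     ref2 = store.store_text("hello world")
+     #     assert ref1 == ref2  # same sha256 -> same artifact file
+     #     assert ref1.path.startswith("artifacts/sha256/")
+     #     store.get_stats()  # {"unique_artifacts": 1, "total_references": 1, ...}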
+
+
+ class ContentWriter:
+     """Writes content as input.yaml / output.yaml with artifact externalization."""
+
+     def __init__(self, config: TraceDebugConfig, artifact_store: ArtifactStore | None = None):
+         """Initialize content writer with config and optional artifact store."""
+         self._config = config
+         self._compiled_patterns = [re.compile(p) for p in config.redact_patterns]
+         self._artifact_store = artifact_store
+
+     def write(self, content: Any, span_dir: Path, name: str) -> dict[str, Any]:
+         """Write content as {name}.yaml with artifact externalization.
+
+         Args:
+             content: Raw content (LLM messages, documents, dicts, etc.)
+             span_dir: Span directory
+             name: "input" or "output"
+
+         Returns:
+             Metadata dict with type, path, size_bytes, breakdown
+         """
+         if content is None:
+             return {"type": "none", "size_bytes": 0}
+
+         # Structure content (recursive processing with externalization)
+         structured = self._structure_content(content)
+
+         # Serialize to YAML
+         serialized = yaml.dump(
+             structured,
+             default_flow_style=False,
+             allow_unicode=True,
+             sort_keys=False,
+         )
+         serialized = self._redact(serialized)
+         size = len(serialized.encode("utf-8"))
+
+         # Check file size limit
+         if size > self._config.max_file_bytes:
+             # Reduce preview sizes to fit under limit
+             structured = self._reduce_previews(structured)
+             serialized = yaml.dump(
+                 structured, default_flow_style=False, allow_unicode=True, sort_keys=False
+             )
+             serialized = self._redact(serialized)
+             size = len(serialized.encode("utf-8"))
+
+             # If still over, truncate with warning
+             if size > self._config.max_file_bytes:
+                 max_bytes = self._config.max_file_bytes
+                 # Truncate on a byte boundary: a character slice could still
+                 # exceed the byte limit for multi-byte UTF-8 text.
+                 serialized = serialized.encode("utf-8")[:max_bytes].decode("utf-8", errors="ignore")
+                 serialized += (
+                     f"\n\n# [TRUNCATED: original {size} bytes exceeded {max_bytes} limit]\n"
+                 )
+                 size = len(serialized.encode("utf-8"))
+
+         # Write file
+         file_path = span_dir / f"{name}.yaml"
+         file_path.write_text(serialized, encoding="utf-8")
+
+         return {
+             "type": "file",
+             "path": f"{name}.yaml",
+             "size_bytes": size,
+             "breakdown": self._extract_breakdown(structured),
+         }
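+
+     # A sketch of the metadata write() returns for message input; writer,
+     # config, and span_dir are assumed to exist, and sizes are illustrative:
+     #
+     #     writer = ContentWriter(config, artifact_store)
+     #     meta = writer.write(messages, span_dir, "input")
+     #     # meta == {"type": "file", "path": "input.yaml", "size_bytes": 2048,
+     #     #          "breakdown": {"text_bytes": ..., "image_bytes": ..., "tool_bytes": ...}}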
+
+     def _structure_content(self, content: Any) -> dict[str, Any]:
+         """Convert raw content to structured YAML-ready format."""
+         if self._is_llm_messages(content):
+             return self._structure_llm_messages(content)
+         elif self._is_document_list(content):
+             return self._structure_documents(content)
+         else:
+             return self._structure_generic(content)
+
+     def _is_llm_messages(self, content: Any) -> bool:
+         """Check if content looks like LLM messages."""
+         if not isinstance(content, list):
+             return False
+         if not content:
+             return False
+         first = content[0]
+         if not isinstance(first, dict):
+             return False
+         return "role" in first and "content" in first
+
+     def _is_document_list(self, content: Any) -> bool:
+         """Check if content looks like a DocumentList."""
+         if not isinstance(content, list):
+             return False
+         if not content:
+             return False
+         first = content[0]
+         if not isinstance(first, dict):
+             return False
+         return "base_type" in first and "content" in first
+
+     def _structure_llm_messages(self, messages: list[Any]) -> dict[str, Any]:
+         """Structure LLM messages preserving ALL parts losslessly."""
+         message_entries: list[dict[str, Any]] = []
+
+         total_text_bytes = 0
+         total_image_bytes = 0
+         total_tool_bytes = 0
+
+         for i, msg in enumerate(messages):
+             role = msg.get("role", "unknown")
+             content = msg.get("content")
+
+             msg_entry: dict[str, Any] = {
+                 "index": i,
+                 "role": role,
+             }
+
+             if isinstance(content, list):
+                 # Multimodal: preserve each part separately
+                 msg_entry["parts"] = []
+                 for j, part in enumerate(content):
+                     structured_part, part_bytes = self._structure_message_part(part, j)
+                     msg_entry["parts"].append(structured_part)
+                     part_type = structured_part.get("type", "")
+                     if part_type == "text":
+                         total_text_bytes += part_bytes
+                     elif part_type == "image":
+                         total_image_bytes += part_bytes
+                     elif part_type in ("tool_use", "tool_result"):
+                         total_tool_bytes += part_bytes
+             elif isinstance(content, str):
+                 # Simple text message
+                 text_entry = self._structure_text_element(content, 0)
+                 msg_entry["parts"] = [text_entry]
+                 total_text_bytes += text_entry.get("size_bytes", 0)
+             elif content is None:
+                 msg_entry["parts"] = []
+             else:
+                 msg_entry["parts"] = [{"type": "unknown", "sequence": 0, "raw": str(content)}]
+
+             # Preserve tool_calls at message level (OpenAI format)
+             if "tool_calls" in msg:
+                 msg_entry["tool_calls"] = self._convert_types(msg["tool_calls"])
+             if "function_call" in msg:
+                 msg_entry["function_call"] = self._convert_types(msg["function_call"])
+             if "tool_call_id" in msg:
+                 msg_entry["tool_call_id"] = msg["tool_call_id"]
+             if "name" in msg:
+                 msg_entry["name"] = msg["name"]
+
+             message_entries.append(msg_entry)
+
+         return {
+             "format_version": 3,
+             "type": "llm_messages",
+             "message_count": len(messages),
+             "messages": message_entries,
+             "metadata": {
+                 "total_text_bytes": total_text_bytes,
+                 "total_image_bytes": total_image_bytes,
+                 "total_tool_bytes": total_tool_bytes,
+             },
+             "size_bytes": total_text_bytes + total_image_bytes + total_tool_bytes,
+         }
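+
+     # For a single plain-text message, the structure above serializes to YAML
+     # roughly as follows (sizes illustrative; the metadata and size_bytes
+     # totals follow the messages key):
+     #
+     #     format_version: 3
+     #     type: llm_messages
+     #     message_count: 1
+     #     messages:
+     #       - index: 0
+     #         role: user
+     #         parts:
+     #           - type: text
+     #             sequence: 0
+     #             size_bytes: 2
+     #             content: hi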
+
+     def _structure_message_part(
+         self, part: dict[str, Any], sequence: int
+     ) -> tuple[dict[str, Any], int]:
+         """Structure a single message part losslessly.
+
+         Returns:
+             Tuple of (structured_dict, size_bytes)
+         """
+         part_type = part.get("type", "")
+
+         if part_type == "text":
+             entry = self._structure_text_element(part.get("text", ""), sequence)
+             return entry, entry.get("size_bytes", 0)
+         elif part_type == "image_url":
+             entry = self._structure_image_openai(part, sequence)
+             return entry, entry.get("size_bytes", 0)
+         elif part_type == "image":
+             entry = self._structure_image_anthropic(part, sequence)
+             return entry, entry.get("size_bytes", 0)
+         elif part_type == "tool_use":
+             input_str = json.dumps(part.get("input", {}))
+             size = len(input_str.encode("utf-8"))
+             return {
+                 "type": "tool_use",
+                 "sequence": sequence,
+                 "id": part.get("id"),
+                 "name": part.get("name"),
+                 "input": self._convert_types(part.get("input")),
+             }, size
+         elif part_type == "tool_result":
+             result_content = part.get("content")
+             entry = {
+                 "type": "tool_result",
+                 "sequence": sequence,
+                 "tool_use_id": part.get("tool_use_id"),
+                 "is_error": part.get("is_error", False),
+             }
+             size = 0
+             if isinstance(result_content, str):
+                 text_entry = self._structure_text_element(result_content, 0)
+                 entry["content"] = text_entry
+                 size = text_entry.get("size_bytes", 0)
+             elif isinstance(result_content, list):
+                 entry["content"] = []
+                 for k, p in enumerate(result_content):
+                     part_entry, part_size = self._structure_message_part(p, k)
+                     entry["content"].append(part_entry)
+                     size += part_size
+             else:
+                 entry["content"] = self._convert_types(result_content)
+             return entry, size
+         else:
+             # Unknown type: preserve raw data, never drop
+             raw = self._convert_types(part)
+             raw_str = json.dumps(raw)
+             size = len(raw_str.encode("utf-8"))
+             return {
+                 "type": "unknown",
+                 "sequence": sequence,
+                 "original_type": part_type,
+                 "raw_data": raw,
+             }, size
+
+     def _structure_text_element(self, text: str, sequence: int) -> dict[str, Any]:
+         """Structure a text element, optionally externalizing large content."""
+         text = self._redact(text)
+         text_bytes = len(text.encode("utf-8"))
+
+         entry: dict[str, Any] = {
+             "type": "text",
+             "sequence": sequence,
+             "size_bytes": text_bytes,
+         }
+
+         if text_bytes > self._config.max_element_bytes:
+             # Store full content in artifact store
+             if self._artifact_store:
+                 ref = self._artifact_store.store_text(text, "text/plain")
+                 excerpt_len = self._config.element_excerpt_bytes
+                 entry["content_ref"] = {
+                     "hash": ref.hash,
+                     "path": ref.path,
+                     "mime_type": ref.mime_type,
+                     "encoding": ref.encoding,
+                 }
+                 entry["excerpt"] = (
+                     text[:excerpt_len] + "\n[TRUNCATED - see artifact for full content]"
+                 )
+             else:
+                 # No artifact store: truncate with marker
+                 entry["content"] = text[: self._config.max_element_bytes]
+                 entry["truncated"] = True
+                 entry["original_size_bytes"] = text_bytes
+         else:
+             entry["content"] = text
+
+         return entry
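+
+     # Behavior sketch; the threshold comes from config and the value here is
+     # an assumption:
+     #
+     #     # with max_element_bytes = 1024
+     #     self._structure_text_element("short", 0)
+     #     # -> {"type": "text", "sequence": 0, "size_bytes": 5, "content": "short"}
+     #     self._structure_text_element("x" * 10_000, 0)
+     #     # -> inline "content" becomes "content_ref" + "excerpt" when an
+     #     #    artifact store is attached; otherwise truncated with a marker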
+
+     def _structure_image_openai(self, part: dict[str, Any], sequence: int) -> dict[str, Any]:
+         """Structure OpenAI format image part."""
+         url = part.get("image_url", {}).get("url", "")
+         detail = part.get("image_url", {}).get("detail", "auto")
+
+         if not url.startswith("data:image/"):
+             return {
+                 "type": "image_url",
+                 "sequence": sequence,
+                 "url": url,
+                 "detail": detail,
+                 "size_bytes": 0,
+             }
+
+         match = re.match(r"data:image/(\w+);base64,(.+)", url)
+         if not match:
+             return {
+                 "type": "image_parse_error",
+                 "sequence": sequence,
+                 "url_preview": url[:100],
+                 "size_bytes": 0,
+             }
+
+         ext, b64_data = match.groups()
+         estimated_size = len(b64_data) * 3 // 4
+         content_hash = hashlib.sha256(b64_data.encode()).hexdigest()
+
+         entry: dict[str, Any] = {
+             "type": "image",
+             "sequence": sequence,
+             "format": ext,
+             "size_bytes": estimated_size,
+             "hash": content_hash[:16],
+             "detail": detail,
+         }
+
+         # Extract if configured
+         if self._config.extract_base64_images and self._artifact_store:
+             try:
+                 image_bytes = base64.b64decode(b64_data)
+                 ref = self._artifact_store.store_binary(image_bytes, f"image/{ext}")
+                 entry["content_ref"] = {
+                     "hash": ref.hash,
+                     "path": ref.path,
+                     "mime_type": ref.mime_type,
+                     "encoding": ref.encoding,
+                 }
+                 entry["preview"] = f"[{ext.upper()} image, {estimated_size} bytes]"
+                 entry["extracted"] = True
+             except Exception as e:
+                 entry["extract_error"] = str(e)
+                 entry["extracted"] = False
+         else:
+             entry["extracted"] = False
+
+         return entry
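+
+     # The data-URL branch expects OpenAI-style base64 images, e.g. (truncated):
+     #
+     #     {"type": "image_url",
+     #      "image_url": {"url": "data:image/png;base64,iVBORw0K...", "detail": "auto"}}
+     #
+     # Non-data URLs are recorded by reference only (size_bytes 0); remote
+     # bytes are never fetched.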
+
+     def _structure_image_anthropic(self, part: dict[str, Any], sequence: int) -> dict[str, Any]:
+         """Structure Anthropic format image part."""
+         source = part.get("source", {})
+         media_type = source.get("media_type", "image/png")
+         ext = media_type.split("/")[-1] if "/" in media_type else "png"
+
+         if source.get("type") != "base64":
+             return {
+                 "type": "image",
+                 "sequence": sequence,
+                 "source_type": source.get("type"),
+                 "format": ext,
+                 "size_bytes": 0,
+             }
+
+         b64_data = source.get("data", "")
+         estimated_size = len(b64_data) * 3 // 4 if b64_data else 0
+         content_hash = hashlib.sha256(b64_data.encode()).hexdigest() if b64_data else "empty"
+
+         entry: dict[str, Any] = {
+             "type": "image",
+             "sequence": sequence,
+             "format": ext,
+             "size_bytes": estimated_size,
+             "hash": content_hash[:16],
+         }
+
+         if self._config.extract_base64_images and self._artifact_store and b64_data:
+             try:
+                 image_bytes = base64.b64decode(b64_data)
+                 ref = self._artifact_store.store_binary(image_bytes, media_type)
+                 entry["content_ref"] = {
+                     "hash": ref.hash,
+                     "path": ref.path,
+                     "mime_type": ref.mime_type,
+                     "encoding": ref.encoding,
+                 }
+                 entry["preview"] = f"[{ext.upper()} image, {estimated_size} bytes]"
+                 entry["extracted"] = True
+             except Exception as e:
+                 entry["extract_error"] = str(e)
+                 entry["extracted"] = False
+         else:
+             entry["extracted"] = False
+
+         return entry
+
+     def _structure_documents(self, docs: list[Any]) -> dict[str, Any]:
+         """Structure document list."""
+         doc_entries: list[dict[str, Any]] = []
+
+         for i, doc in enumerate(docs):
+             doc_name = doc.get("name", f"doc_{i}")
+             base_type = doc.get("base_type", "unknown")
+             content = doc.get("content", "")
+             content_encoding = doc.get("content_encoding", "utf-8")
+
+             doc_entry: dict[str, Any] = {
+                 "index": i,
+                 "name": doc_name,
+                 "base_type": base_type,
+             }
+
+             if content_encoding == "base64":
+                 # Binary content
+                 try:
+                     binary_data = base64.b64decode(content)
+                     size = len(binary_data)
+                     doc_entry["size_bytes"] = size
+                     doc_entry["encoding"] = "base64"
+
+                     if size > self._config.max_element_bytes and self._artifact_store:
+                         # Externalize binary
+                         mime_type = doc.get("mime_type", "application/octet-stream")
+                         ref = self._artifact_store.store_binary(binary_data, mime_type)
+                         doc_entry["content_ref"] = {
+                             "hash": ref.hash,
+                             "path": ref.path,
+                             "mime_type": ref.mime_type,
+                             "encoding": ref.encoding,
+                         }
+                         doc_entry["preview"] = f"[Binary content, {size} bytes]"
+                     else:
+                         doc_entry["content"] = content  # Keep base64 inline
+                 except Exception:
+                     doc_entry["content"] = "[binary content - decode failed]"
+                     doc_entry["size_bytes"] = 0
+             else:
+                 # Text content
+                 text = self._redact(str(content))
+                 text_bytes = len(text.encode("utf-8"))
+                 doc_entry["size_bytes"] = text_bytes
+
+                 if text_bytes > self._config.max_element_bytes and self._artifact_store:
+                     ref = self._artifact_store.store_text(text)
+                     excerpt_len = self._config.element_excerpt_bytes
+                     doc_entry["content_ref"] = {
+                         "hash": ref.hash,
+                         "path": ref.path,
+                         "mime_type": ref.mime_type,
+                         "encoding": ref.encoding,
+                     }
+                     doc_entry["excerpt"] = (
+                         text[:excerpt_len] + "\n[TRUNCATED - see artifact for full content]"
+                     )
+                 else:
+                     doc_entry["content"] = text
+
+             doc_entries.append(doc_entry)
+
+         return {
+             "format_version": 3,
+             "type": "document_list",
+             "document_count": len(docs),
+             "documents": doc_entries,
+         }
+
+     def _structure_generic(self, content: Any) -> dict[str, Any]:
+         """Structure generic content."""
+         converted = self._convert_types(content)
+         serialized = json.dumps(converted)
+         size = len(serialized.encode("utf-8"))
+
+         return {
+             "format_version": 3,
+             "type": "generic",
+             "size_bytes": size,
+             "content": converted,
+         }
+
+     def _extract_breakdown(self, structured: dict[str, Any]) -> dict[str, int]:
+         """Extract size breakdown from already-structured content.
+
+         Uses metadata computed during structuring (which has access to full
+         image data) rather than recalculating from LMNR attributes (where
+         base64 image data is stripped).
+         """
+         if structured.get("type") == "llm_messages":
+             metadata = structured.get("metadata", {})
+             return {
+                 "text_bytes": metadata.get("total_text_bytes", 0),
+                 "image_bytes": metadata.get("total_image_bytes", 0),
+                 "tool_bytes": metadata.get("total_tool_bytes", 0),
+             }
+         elif "size_bytes" in structured:
+             return {"total_bytes": structured["size_bytes"]}
+         else:
+             serialized = json.dumps(self._convert_types(structured))
+             return {"total_bytes": len(serialized.encode("utf-8"))}
+
+     def _reduce_previews(self, structured: dict[str, Any]) -> dict[str, Any]:
+         """Reduce preview/excerpt sizes to fit file under max_file_bytes."""
+         if structured.get("type") == "llm_messages":
+             # Reduce excerpt sizes in messages
+             for msg in structured.get("messages", []):
+                 for part in msg.get("parts", []):
+                     if "excerpt" in part:
+                         # Reduce to 500 characters
+                         part["excerpt"] = part["excerpt"][:500] + "\n[TRUNCATED]"
+         return structured
+
+     def _redact(self, text: str) -> str:
+         """Apply redaction patterns to text."""
+         for pattern in self._compiled_patterns:
+             text = pattern.sub("[REDACTED]", text)
+         return text
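+
+     # Patterns are applied in order. A sketch, assuming config.redact_patterns
+     # contains an API-key-style regex (the pattern is illustrative):
+     #
+     #     # redact_patterns = [r"sk-[A-Za-z0-9]{20,}"]
+     #     self._redact("token sk-abcdefghijklmnopqrstu")  # -> "token [REDACTED]"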
+
+     def _convert_types(self, value: Any, seen: set[int] | None = None) -> Any:
+         """Convert non-serializable types recursively with cycle detection."""
+         # Cycle detection
+         if seen is None:
+             seen = set()
+
+         obj_id = id(value)
+         if obj_id in seen:
+             return "[circular reference]"
+
+         match value:
+             case None | bool() | int() | float() | str():
+                 return value
+             case SecretStr():
+                 return "[REDACTED:SecretStr]"
+             case bytes():
+                 if len(value) < 100:
+                     return f"[bytes: {len(value)} bytes, preview: {value[:50].hex()}...]"
+                 return f"[bytes: {len(value)} bytes]"
+             case Path():
+                 return str(value)
+             case UUID():
+                 return str(value)
+             case datetime():
+                 return value.isoformat()
+             case Enum():
+                 return value.value
+             case set() | frozenset():
+                 return sorted(str(x) for x in value)
+             case BaseModel():
+                 try:
+                     return value.model_dump(mode="json")
+                 except Exception:
+                     return str(value)
+             case dict():
+                 seen.add(obj_id)
+                 result = {str(k): self._convert_types(v, seen) for k, v in value.items()}
+                 seen.discard(obj_id)
+                 return result
+             case list() | tuple():
+                 seen.add(obj_id)
+                 result = [self._convert_types(x, seen) for x in value]
+                 seen.discard(obj_id)
+                 return result
+             case _:
+                 # Try str() as fallback
+                 try:
+                     return str(value)
+                 except Exception:
+                     return f"<{type(value).__name__}>"
+
+
+ def reconstruct_span_content(trace_root: Path, span_dir: Path, content_type: str) -> dict[str, Any]:
+     """Reconstruct full content from input.yaml/output.yaml + artifacts.
+
+     Args:
+         trace_root: Trace root directory
+         span_dir: Span directory containing input.yaml or output.yaml
+         content_type: "input" or "output"
+
+     Returns:
+         Complete reconstructed content with all artifact refs resolved
+     """
+     content_path = span_dir / f"{content_type}.yaml"
+     if not content_path.exists():
+         return {}
+
+     content = yaml.safe_load(content_path.read_text(encoding="utf-8"))
+     return _rehydrate(content, trace_root)
+
+
+ def _rehydrate(obj: Any, trace_root: Path) -> Any:
+     """Recursively replace content_ref entries with actual content."""
+     if isinstance(obj, dict):
+         if "content_ref" in obj:
+             # This is an artifact reference - load the full content
+             ref = obj["content_ref"]
+             artifact_path = trace_root / ref["path"]
+
+             if ref.get("encoding") == "utf-8":
+                 full_content = artifact_path.read_text(encoding="utf-8")
+             else:
+                 full_content = artifact_path.read_bytes()
+
+             # Replace ref with full content
+             obj = obj.copy()
+             obj["content"] = full_content
+             del obj["content_ref"]
+             if "excerpt" in obj:
+                 del obj["excerpt"]
+
+         return {k: _rehydrate(v, trace_root) for k, v in obj.items()}
+
+     elif isinstance(obj, list):
+         return [_rehydrate(v, trace_root) for v in obj]
+
+     return obj
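+
+
+ # Round-trip sketch, assuming a trace_root whose span_dir was written by
+ # ContentWriter above (names illustrative, not part of this module):
+ #
+ #     writer = ContentWriter(config, ArtifactStore(trace_root))
+ #     writer.write(messages, span_dir, "input")  # externalizes large elements
+ #     full = reconstruct_span_content(trace_root, span_dir, "input")
+ #     # every content_ref in input.yaml is replaced by the artifact's full
+ #     # text or bytes, and excerpts are dropped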