ai-pipeline-core 0.2.6__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. ai_pipeline_core/__init__.py +78 -125
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +37 -82
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +130 -81
  37. ai_pipeline_core/llm/client.py +327 -193
  38. ai_pipeline_core/llm/model_options.py +14 -86
  39. ai_pipeline_core/llm/model_response.py +60 -103
  40. ai_pipeline_core/llm/model_types.py +16 -34
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -283
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -483
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/simple_runner/__init__.py +0 -14
  85. ai_pipeline_core/simple_runner/cli.py +0 -254
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -247
  87. ai_pipeline_core/storage/__init__.py +0 -8
  88. ai_pipeline_core/storage/storage.py +0 -628
  89. ai_pipeline_core/utils/__init__.py +0 -8
  90. ai_pipeline_core/utils/deploy.py +0 -373
  91. ai_pipeline_core/utils/remote_deployment.py +0 -269
  92. ai_pipeline_core-0.2.6.dist-info/METADATA +0 -500
  93. ai_pipeline_core-0.2.6.dist-info/RECORD +0 -41
  94. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/observability/_debug/_content.py
@@ -0,0 +1,764 @@
+ """Content writing and extraction for trace debugging V3.
+
+ Uses hash-based artifact storage with automatic deduplication.
+ Handles Document attachments by externalizing large binary/text attachments to the artifact store.
+ """
+
+ import base64
+ import hashlib
+ import json
+ import re
+ from datetime import datetime
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, cast
+ from uuid import UUID
+
+ import yaml
+ from pydantic import BaseModel, ConfigDict, SecretStr
+
+ from ._config import TraceDebugConfig
+
+
+ class ContentRef(BaseModel):
+     """Reference to content in artifact store."""
+
+     hash: str  # "sha256:abcdef..."
+     path: str  # "artifacts/sha256/ab/cd/abcdef...1234.txt"
+     size_bytes: int
+     mime_type: str | None = None
+     encoding: str | None = None  # "utf-8" | "binary"
+
+     model_config = ConfigDict(frozen=True)
+
+
+ class ArtifactStore:
+     """Hash-based artifact storage with automatic deduplication.
+
+     Stores large content elements in artifacts/sha256/<first2>/<next2>/<hash>.<ext>
+     Identical content automatically deduplicates (same hash = same file).
+     """
+
+     def __init__(self, trace_path: Path):
+         """Initialize artifact store for given trace path."""
+         self._artifacts_path = trace_path / "artifacts" / "sha256"
+         self._artifacts_path.mkdir(parents=True, exist_ok=True)
+         self._known_hashes: dict[str, ContentRef] = {}
+         self._trace_path = trace_path
+
+     def store_text(self, text: str, mime_type: str = "text/plain") -> ContentRef:
+         """Store text content, return reference."""
+         data = text.encode("utf-8")
+         content_hash = hashlib.sha256(data).hexdigest()
+
+         if content_hash in self._known_hashes:
+             return self._known_hashes[content_hash]
+
+         # Create sharded path: ab/cd/abcdef...1234.txt
+         file_path = self._artifacts_path / content_hash[:2] / content_hash[2:4] / f"{content_hash}.txt"
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         if not file_path.exists():
+             file_path.write_bytes(data)
+
+         ref = ContentRef(
+             hash=f"sha256:{content_hash}",
+             path=str(file_path.relative_to(self._trace_path)),
+             size_bytes=len(data),
+             mime_type=mime_type,
+             encoding="utf-8",
+         )
+
+         self._known_hashes[content_hash] = ref
+         return ref
+
+     def store_binary(self, data: bytes, mime_type: str = "application/octet-stream") -> ContentRef:
+         """Store binary content, return reference."""
+         content_hash = hashlib.sha256(data).hexdigest()
+
+         if content_hash in self._known_hashes:
+             return self._known_hashes[content_hash]
+
+         # Determine extension from mime type
+         ext_map = {
+             "image/png": ".png",
+             "image/jpeg": ".jpg",
+             "image/gif": ".gif",
+             "image/webp": ".webp",
+             "application/pdf": ".pdf",
+         }
+         ext = ext_map.get(mime_type, ".bin")
+
+         file_path = self._artifacts_path / content_hash[:2] / content_hash[2:4] / f"{content_hash}{ext}"
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         if not file_path.exists():
+             file_path.write_bytes(data)
+
+         ref = ContentRef(
+             hash=f"sha256:{content_hash}",
+             path=str(file_path.relative_to(self._trace_path)),
+             size_bytes=len(data),
+             mime_type=mime_type,
+             encoding="binary",
+         )
+
+         self._known_hashes[content_hash] = ref
+         return ref
+
+     def get_stats(self) -> dict[str, int | float]:
+         """Get deduplication statistics."""
+         total_files = len(list(self._artifacts_path.rglob("*.*")))
+         total_size = sum(f.stat().st_size for f in self._artifacts_path.rglob("*.*") if f.is_file())
+         total_refs = len(self._known_hashes)
+
+         return {
+             "unique_artifacts": total_files,
+             "total_references": total_refs,
+             "total_bytes": total_size,
+             "dedup_ratio": total_refs / total_files if total_files > 0 else 1.0,
+         }
+
+
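A minimal usage sketch of the store above; the import path follows the file list at the top of this diff, and the target directory and payloads are illustrative:

from pathlib import Path

# import path as listed in this diff (private module)
from ai_pipeline_core.observability._debug._content import ArtifactStore

store = ArtifactStore(Path("/tmp/trace-demo"))

ref_a = store.store_text("hello world")   # written to artifacts/sha256/<ab>/<cd>/<hash>.txt
ref_b = store.store_text("hello world")   # same digest, so the same ContentRef comes back, no second write
assert ref_a.hash == ref_b.hash and ref_a.path == ref_b.path

pdf_ref = store.store_binary(b"%PDF-1.7 ...", "application/pdf")  # extension chosen from the mime type
print(store.get_stats())  # unique_artifacts, total_references, total_bytes, dedup_ratio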
+ class ContentWriter:
+     """Writes content as input.yaml / output.yaml with artifact externalization."""
+
+     def __init__(self, config: TraceDebugConfig, artifact_store: ArtifactStore | None = None):
+         """Initialize content writer with config and optional artifact store."""
+         self._config = config
+         self._compiled_patterns = [re.compile(p) for p in config.redact_patterns]
+         self._artifact_store = artifact_store
+
+     def write(self, content: Any, span_dir: Path, name: str) -> dict[str, Any]:
+         """Write content as {name}.yaml with artifact externalization.
+
+         Args:
+             content: Raw content (LLM messages, documents, dicts, etc.)
+             span_dir: Span directory
+             name: "input" or "output"
+
+         Returns:
+             Metadata dict with type, path, size_bytes, breakdown
+         """
+         if content is None:
+             return {"type": "none", "size_bytes": 0}
+
+         # Structure content (recursive processing with externalization)
+         structured = self._structure_content(content)
+
+         # Serialize to YAML
+         serialized = yaml.dump(
+             structured,
+             default_flow_style=False,
+             allow_unicode=True,
+             sort_keys=False,
+         )
+         serialized = self._redact(serialized)
+         size = len(serialized.encode("utf-8"))
+
+         # Check file size limit
+         if size > self._config.max_file_bytes:
+             # Reduce preview sizes to fit under limit
+             structured = self._reduce_previews(structured)
+             serialized = yaml.dump(structured, default_flow_style=False, allow_unicode=True, sort_keys=False)
+             serialized = self._redact(serialized)
+             size = len(serialized.encode("utf-8"))
+
+             # If still over, truncate with warning
+             if size > self._config.max_file_bytes:
+                 serialized = serialized[: self._config.max_file_bytes]
+                 max_bytes = self._config.max_file_bytes
+                 serialized += f"\n\n# [TRUNCATED: original {size} bytes exceeded {max_bytes} limit]\n"
+                 size = len(serialized.encode("utf-8"))
+
+         # Write file
+         file_path = span_dir / f"{name}.yaml"
+         file_path.write_text(serialized, encoding="utf-8")
+
+         return {
+             "type": "file",
+             "path": f"{name}.yaml",
+             "size_bytes": size,
+             "breakdown": self._extract_breakdown(structured),
+         }
+
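For orientation, the metadata dict that write() returns for message content has this shape (byte counts are illustrative):

# Illustrative return value of ContentWriter.write(content, span_dir, "input")
write_result = {
    "type": "file",
    "path": "input.yaml",
    "size_bytes": 18432,
    "breakdown": {"text_bytes": 12288, "image_bytes": 6144, "tool_bytes": 0},
}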
+     def _structure_content(self, content: Any) -> dict[str, Any]:
+         """Convert raw content to structured YAML-ready format."""
+         if self._is_llm_messages(content):
+             return self._structure_llm_messages(content)
+         if self._is_document_list(content):
+             return self._structure_documents(content)
+         return self._structure_generic(content)
+
+     def _is_llm_messages(self, content: Any) -> bool:
+         """Check if content looks like LLM messages."""
+         if not isinstance(content, list):
+             return False
+         if not content:
+             return False
+         first = cast(Any, content[0])
+         if not isinstance(first, dict):
+             return False
+         return "role" in first and "content" in first
+
+     def _is_document_list(self, content: Any) -> bool:
+         """Check if content looks like a list of serialized documents."""
+         if not isinstance(content, list):
+             return False
+         if not content:
+             return False
+         first = cast(Any, content[0])
+         if not isinstance(first, dict):
+             return False
+         return "class_name" in first and "content" in first
+
+     def _structure_llm_messages(self, messages: list[Any]) -> dict[str, Any]:
+         """Structure LLM messages preserving ALL parts losslessly."""
+         message_entries: list[dict[str, Any]] = []
+
+         total_text_bytes = 0
+         total_image_bytes = 0
+         total_tool_bytes = 0
+
+         for i, msg in enumerate(messages):
+             role = msg.get("role", "unknown")
+             content = msg.get("content")
+
+             msg_entry: dict[str, Any] = {
+                 "index": i,
+                 "role": role,
+             }
+
+             if isinstance(content, list):
+                 # Multimodal: preserve each part separately
+                 content_parts = cast(list[Any], content)
+                 msg_parts: list[dict[str, Any]] = []
+                 msg_entry["parts"] = msg_parts
+                 for j, part in enumerate(content_parts):
+                     structured_part, part_bytes = self._structure_message_part(part, j)
+                     msg_parts.append(structured_part)
+                     part_type = structured_part.get("type", "")
+                     if part_type == "text":
+                         total_text_bytes += part_bytes
+                     elif part_type == "image":
+                         total_image_bytes += part_bytes
+                     elif part_type in {"tool_use", "tool_result"}:
+                         total_tool_bytes += part_bytes
+             elif isinstance(content, str):
+                 # Simple text message
+                 text_entry = self._structure_text_element(content, 0)
+                 msg_entry["parts"] = [text_entry]
+                 total_text_bytes += text_entry.get("size_bytes", 0)
+             elif content is None:
+                 msg_entry["parts"] = []
+             else:
+                 msg_entry["parts"] = [{"type": "unknown", "sequence": 0, "raw": str(content)}]
+
+             # Preserve tool_calls at message level (OpenAI format)
+             if "tool_calls" in msg:
+                 msg_entry["tool_calls"] = self._convert_types(msg["tool_calls"])
+             if "function_call" in msg:
+                 msg_entry["function_call"] = self._convert_types(msg["function_call"])
+             if "tool_call_id" in msg:
+                 msg_entry["tool_call_id"] = msg["tool_call_id"]
+             if "name" in msg:
+                 msg_entry["name"] = msg["name"]
+
+             message_entries.append(msg_entry)
+
+         return {
+             "format_version": 3,
+             "type": "llm_messages",
+             "message_count": len(messages),
+             "messages": message_entries,
+             "metadata": {
+                 "total_text_bytes": total_text_bytes,
+                 "total_image_bytes": total_image_bytes,
+                 "total_tool_bytes": total_tool_bytes,
+             },
+             "size_bytes": total_text_bytes + total_image_bytes + total_tool_bytes,
+         }
+
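For a single plain-text user message, and assuming the text stays under max_element_bytes, the mapping returned above reduces to:

structured_messages = {
    "format_version": 3,
    "type": "llm_messages",
    "message_count": 1,
    "messages": [
        {
            "index": 0,
            "role": "user",
            "parts": [{"type": "text", "sequence": 0, "size_bytes": 5, "content": "hello"}],
        }
    ],
    "metadata": {"total_text_bytes": 5, "total_image_bytes": 0, "total_tool_bytes": 0},
    "size_bytes": 5,
}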
+     def _structure_message_part(self, part: dict[str, Any], sequence: int) -> tuple[dict[str, Any], int]:
+         """Structure a single message part losslessly.
+
+         Returns:
+             Tuple of (structured_dict, size_bytes)
+         """
+         part_type = part.get("type", "")
+
+         if part_type == "text":
+             entry = self._structure_text_element(part.get("text", ""), sequence)
+             return entry, entry.get("size_bytes", 0)
+         if part_type == "image_url":
+             entry = self._structure_image_openai(part, sequence)
+             return entry, entry.get("size_bytes", 0)
+         if part_type == "image":
+             entry = self._structure_image_anthropic(part, sequence)
+             return entry, entry.get("size_bytes", 0)
+         if part_type == "tool_use":
+             input_str = json.dumps(part.get("input", {}))
+             size = len(input_str.encode("utf-8"))
+             return {
+                 "type": "tool_use",
+                 "sequence": sequence,
+                 "id": part.get("id"),
+                 "name": part.get("name"),
+                 "input": self._convert_types(part.get("input")),
+             }, size
+         if part_type == "tool_result":
+             result_content = part.get("content")
+             entry: dict[str, Any] = {
+                 "type": "tool_result",
+                 "sequence": sequence,
+                 "tool_use_id": part.get("tool_use_id"),
+                 "is_error": part.get("is_error", False),
+             }
+             size = 0
+             if isinstance(result_content, str):
+                 text_entry = self._structure_text_element(result_content, 0)
+                 entry["content"] = text_entry
+                 size = text_entry.get("size_bytes", 0)
+             elif isinstance(result_content, list):
+                 result_parts = cast(list[Any], result_content)
+                 content_list: list[dict[str, Any]] = []
+                 entry["content"] = content_list
+                 for k, p in enumerate(result_parts):
+                     part_entry, part_size = self._structure_message_part(p, k)
+                     content_list.append(part_entry)
+                     size += part_size
+             else:
+                 entry["content"] = self._convert_types(result_content)
+             return entry, size
+         # Unknown type — preserve raw data, never drop
+         raw = self._convert_types(part)
+         raw_str = json.dumps(raw)
+         size = len(raw_str.encode("utf-8"))
+         return {
+             "type": "unknown",
+             "sequence": sequence,
+             "original_type": part_type,
+             "raw_data": raw,
+         }, size
+
+     def _structure_text_element(self, text: str, sequence: int) -> dict[str, Any]:
+         """Structure a text element, optionally externalizing large content."""
+         text = self._redact(text)
+         text_bytes = len(text.encode("utf-8"))
+
+         entry: dict[str, Any] = {
+             "type": "text",
+             "sequence": sequence,
+             "size_bytes": text_bytes,
+         }
+
+         if text_bytes > self._config.max_element_bytes:
+             # Store full content in artifact store
+             if self._artifact_store:
+                 ref = self._artifact_store.store_text(text, "text/plain")
+                 excerpt_len = self._config.element_excerpt_bytes
+                 entry["content_ref"] = {
+                     "hash": ref.hash,
+                     "path": ref.path,
+                     "mime_type": ref.mime_type,
+                     "encoding": ref.encoding,
+                 }
+                 entry["excerpt"] = text[:excerpt_len] + "\n[TRUNCATED - see artifact for full content]"
+             else:
+                 # No artifact store — truncate with marker
+                 entry["content"] = text[: self._config.max_element_bytes]
+                 entry["truncated"] = True
+                 entry["original_size_bytes"] = text_bytes
+         else:
+             entry["content"] = text
+
+         return entry
+
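The inline and externalized shapes a text entry can take, mirroring the branches above (hash and sizes illustrative):

inline_text_entry = {"type": "text", "sequence": 0, "size_bytes": 42, "content": "short text stays inline"}

externalized_text_entry = {
    "type": "text",
    "sequence": 1,
    "size_bytes": 250000,
    "content_ref": {
        "hash": "sha256:ab12...",                     # illustrative digest
        "path": "artifacts/sha256/ab/12/ab12....txt",
        "mime_type": "text/plain",
        "encoding": "utf-8",
    },
    "excerpt": "first element_excerpt_bytes of the text...\n[TRUNCATED - see artifact for full content]",
}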
+     def _structure_image_openai(self, part: dict[str, Any], sequence: int) -> dict[str, Any]:
+         """Structure OpenAI format image part."""
+         url = part.get("image_url", {}).get("url", "")
+         detail = part.get("image_url", {}).get("detail", "auto")
+
+         if not url.startswith("data:image/"):
+             return {
+                 "type": "image_url",
+                 "sequence": sequence,
+                 "url": url,
+                 "detail": detail,
+                 "size_bytes": 0,
+             }
+
+         match = re.match(r"data:image/(\w+);base64,(.+)", url)
+         if not match:
+             return {
+                 "type": "image_parse_error",
+                 "sequence": sequence,
+                 "url_preview": url[:100],
+                 "size_bytes": 0,
+             }
+
+         ext, b64_data = match.groups()
+         estimated_size = len(b64_data) * 3 // 4
+         content_hash = hashlib.sha256(b64_data.encode()).hexdigest()
+
+         entry: dict[str, Any] = {
+             "type": "image",
+             "sequence": sequence,
+             "format": ext,
+             "size_bytes": estimated_size,
+             "hash": content_hash[:16],
+             "detail": detail,
+         }
+
+         # Extract if configured
+         if self._config.extract_base64_images and self._artifact_store:
+             try:
+                 image_bytes = base64.b64decode(b64_data)
+                 ref = self._artifact_store.store_binary(image_bytes, f"image/{ext}")
+                 entry["content_ref"] = {
+                     "hash": ref.hash,
+                     "path": ref.path,
+                     "mime_type": ref.mime_type,
+                     "encoding": ref.encoding,
+                 }
+                 entry["preview"] = f"[{ext.upper()} image, {estimated_size} bytes]"
+                 entry["extracted"] = True
+             except Exception as e:
+                 entry["extract_error"] = str(e)
+                 entry["extracted"] = False
+         else:
+             entry["extracted"] = False
+
+         return entry
+
+     def _structure_image_anthropic(self, part: dict[str, Any], sequence: int) -> dict[str, Any]:
+         """Structure Anthropic format image part."""
+         source = part.get("source", {})
+         media_type = source.get("media_type", "image/png")
+         ext = media_type.split("/")[-1] if "/" in media_type else "png"
+
+         if source.get("type") != "base64":
+             return {
+                 "type": "image",
+                 "sequence": sequence,
+                 "source_type": source.get("type"),
+                 "format": ext,
+                 "size_bytes": 0,
+             }
+
+         b64_data = source.get("data", "")
+         estimated_size = len(b64_data) * 3 // 4 if b64_data else 0
+         content_hash = hashlib.sha256(b64_data.encode()).hexdigest() if b64_data else "empty"
+
+         entry: dict[str, Any] = {
+             "type": "image",
+             "sequence": sequence,
+             "format": ext,
+             "size_bytes": estimated_size,
+             "hash": content_hash[:16],
+         }
+
+         if self._config.extract_base64_images and self._artifact_store and b64_data:
+             try:
+                 image_bytes = base64.b64decode(b64_data)
+                 ref = self._artifact_store.store_binary(image_bytes, media_type)
+                 entry["content_ref"] = {
+                     "hash": ref.hash,
+                     "path": ref.path,
+                     "mime_type": ref.mime_type,
+                     "encoding": ref.encoding,
+                 }
+                 entry["preview"] = f"[{ext.upper()} image, {estimated_size} bytes]"
+                 entry["extracted"] = True
+             except Exception as e:
+                 entry["extract_error"] = str(e)
+                 entry["extracted"] = False
+         else:
+             entry["extracted"] = False
+
+         return entry
+
+     def _structure_documents(self, docs: list[Any]) -> dict[str, Any]:  # noqa: PLR0914
+         """Structure document list with attachment externalization."""
+         doc_entries: list[dict[str, Any]] = []
+
+         for i, doc in enumerate(docs):
+             doc_name = doc.get("name", f"doc_{i}")
+             class_name = doc.get("class_name", "Document")
+             content = doc.get("content", "")
+             content_encoding = doc.get("content_encoding", "utf-8")
+
+             doc_entry: dict[str, Any] = {
+                 "index": i,
+                 "name": doc_name,
+                 "class_name": class_name,
+             }
+
+             if content_encoding == "base64":
+                 # Binary content
+                 try:
+                     binary_data = base64.b64decode(content)
+                     size = len(binary_data)
+                     doc_entry["size_bytes"] = size
+                     doc_entry["encoding"] = "base64"
+
+                     if size > self._config.max_element_bytes and self._artifact_store:
+                         # Externalize binary
+                         mime_type = doc.get("mime_type", "application/octet-stream")
+                         ref = self._artifact_store.store_binary(binary_data, mime_type)
+                         doc_entry["content_ref"] = {
+                             "hash": ref.hash,
+                             "path": ref.path,
+                             "mime_type": ref.mime_type,
+                             "encoding": ref.encoding,
+                         }
+                         doc_entry["preview"] = f"[Binary content, {size} bytes]"
+                     else:
+                         doc_entry["content"] = content  # Keep base64 inline
+                 except Exception:
+                     doc_entry["content"] = "[binary content - decode failed]"
+                     doc_entry["size_bytes"] = 0
+             else:
+                 # Text content
+                 text = self._redact(str(content))
+                 text_bytes = len(text.encode("utf-8"))
+                 doc_entry["size_bytes"] = text_bytes
+
+                 if text_bytes > self._config.max_element_bytes and self._artifact_store:
+                     ref = self._artifact_store.store_text(text)
+                     excerpt_len = self._config.element_excerpt_bytes
+                     doc_entry["content_ref"] = {
+                         "hash": ref.hash,
+                         "path": ref.path,
+                         "mime_type": ref.mime_type,
+                         "encoding": ref.encoding,
+                     }
+                     doc_entry["excerpt"] = text[:excerpt_len] + "\n[TRUNCATED - see artifact for full content]"
+                 else:
+                     doc_entry["content"] = text
+
+             # Structure attachments if present
+             raw_attachments = doc.get("attachments")
+             if isinstance(raw_attachments, list) and raw_attachments:
+                 att_entries: list[dict[str, Any]] = []
+                 attachments_list = cast(list[Any], raw_attachments)
+                 for j, att in enumerate(attachments_list):
+                     if not isinstance(att, dict):
+                         continue
+                     att_dict = cast(dict[str, Any], att)
+                     att_name = att_dict.get("name", f"attachment_{j}")
+                     att_encoding = att_dict.get("content_encoding", "utf-8")
+                     att_content = att_dict.get("content", "")
+
+                     att_entry: dict[str, Any] = {
+                         "index": j,
+                         "name": att_name,
+                     }
+                     if att_dict.get("description"):
+                         att_entry["description"] = att_dict["description"]
+
+                     if att_encoding == "base64":
+                         try:
+                             binary_data = base64.b64decode(att_content)
+                             size = len(binary_data)
+                             att_entry["size_bytes"] = size
+                             att_entry["encoding"] = "base64"
+                             mime_type = att_dict.get("mime_type", "application/octet-stream")
+
+                             if size > self._config.max_element_bytes and self._artifact_store:
+                                 ref = self._artifact_store.store_binary(binary_data, mime_type)
+                                 att_entry["content_ref"] = {
+                                     "hash": ref.hash,
+                                     "path": ref.path,
+                                     "mime_type": ref.mime_type,
+                                     "encoding": ref.encoding,
+                                 }
+                                 att_entry["preview"] = f"[Binary attachment, {size} bytes]"
+                             else:
+                                 att_entry["content"] = att_content
+                         except Exception:
+                             att_entry["content"] = "[binary content - decode failed]"
+                             att_entry["size_bytes"] = 0
+                     else:
+                         text = self._redact(str(att_content))
+                         text_bytes = len(text.encode("utf-8"))
+                         att_entry["size_bytes"] = text_bytes
+
+                         if text_bytes > self._config.max_element_bytes and self._artifact_store:
+                             ref = self._artifact_store.store_text(text)
+                             excerpt_len = self._config.element_excerpt_bytes
+                             att_entry["content_ref"] = {
+                                 "hash": ref.hash,
+                                 "path": ref.path,
+                                 "mime_type": ref.mime_type,
+                                 "encoding": ref.encoding,
+                             }
+                             att_entry["excerpt"] = text[:excerpt_len] + "\n[TRUNCATED - see artifact for full content]"
+                         else:
+                             att_entry["content"] = text
+
+                     att_entries.append(att_entry)
+
+                 doc_entry["attachment_count"] = len(att_entries)
+                 doc_entry["attachments"] = att_entries
+
+             doc_entries.append(doc_entry)
+
+         return {
+             "format_version": 3,
+             "type": "document_list",
+             "document_count": len(docs),
+             "documents": doc_entries,
+         }
+
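This path is only taken for documents that have already been serialized to plain dicts; the _is_document_list check keys on class_name and content. An illustrative input element (field values are hypothetical):

serialized_doc = {
    "class_name": "Document",              # any value satisfies the _is_document_list check
    "name": "report.md",
    "content": "# Quarterly report ...",   # str for text, base64-encoded str for binary
    "content_encoding": "utf-8",           # or "base64"
    "mime_type": "text/markdown",
    "attachments": [
        {
            "name": "chart.png",
            "content_encoding": "base64",
            "content": "iVBORw0KGgo...",   # truncated base64, illustrative only
            "mime_type": "image/png",
        }
    ],
}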
+     def _structure_generic(self, content: Any) -> dict[str, Any]:
+         """Structure generic content."""
+         converted = self._convert_types(content)
+         serialized = json.dumps(converted)
+         size = len(serialized.encode("utf-8"))
+
+         return {
+             "format_version": 3,
+             "type": "generic",
+             "size_bytes": size,
+             "content": converted,
+         }
+
+     def _extract_breakdown(self, structured: dict[str, Any]) -> dict[str, int]:
+         """Extract size breakdown from already-structured content.
+
+         Uses metadata computed during structuring (which has access to full
+         image data) rather than recalculating from LMNR attributes (where
+         base64 image data is stripped).
+         """
+         if structured.get("type") == "llm_messages":
+             metadata = structured.get("metadata", {})
+             return {
+                 "text_bytes": metadata.get("total_text_bytes", 0),
+                 "image_bytes": metadata.get("total_image_bytes", 0),
+                 "tool_bytes": metadata.get("total_tool_bytes", 0),
+             }
+         if "size_bytes" in structured:
+             return {"total_bytes": structured["size_bytes"]}
+         serialized = json.dumps(self._convert_types(structured))
+         return {"total_bytes": len(serialized.encode("utf-8"))}
+
+     def _reduce_previews(self, structured: dict[str, Any]) -> dict[str, Any]:
+         """Reduce preview/excerpt sizes to fit file under max_file_bytes."""
+         if structured.get("type") == "llm_messages":
+             # Reduce excerpt sizes in messages
+             for msg in structured.get("messages", []):
+                 for part in msg.get("parts", []):
+                     if "excerpt" in part:
+                         # Reduce to 500 bytes
+                         part["excerpt"] = part["excerpt"][:500] + "\n[TRUNCATED]"
+         return structured
+
+     def _redact(self, text: str) -> str:
+         """Apply redaction patterns to text."""
+         for pattern in self._compiled_patterns:
+             text = pattern.sub("[REDACTED]", text)
+         return text
+
+     def _convert_types(self, value: Any, seen: set[int] | None = None) -> Any:  # noqa: PLR0911
+         """Convert non-serializable types recursively with cycle detection."""
+         # Cycle detection
+         if seen is None:
+             seen = set()
+
+         obj_id = id(value)
+         if obj_id in seen:
+             return "[circular reference]"
+
+         match value:
+             case None | bool() | int() | float() | str():
+                 return value
+             case SecretStr():
+                 return "[REDACTED:SecretStr]"
+             case bytes():
+                 if len(value) < 100:
+                     return f"[bytes: {len(value)} bytes, preview: {value[:50].hex()}...]"
+                 return f"[bytes: {len(value)} bytes]"
+             case Path():
+                 return str(value)
+             case UUID():
+                 return str(value)
+             case datetime():
+                 return value.isoformat()
+             case Enum():
+                 return value.value
+             case set() | frozenset():
+                 return sorted(str(x) for x in cast(set[Any] | frozenset[Any], value))
+             case BaseModel():
+                 try:
+                     return value.model_dump(mode="json")
+                 except Exception:
+                     return str(value)
+             case dict():
+                 seen.add(obj_id)
+                 typed_dict = cast(dict[Any, Any], value)
+                 result = {str(k): self._convert_types(v, seen) for k, v in typed_dict.items()}
+                 seen.discard(obj_id)
+                 return result
+             case list() | tuple():
+                 seen.add(obj_id)
+                 typed_seq = cast(list[Any] | tuple[Any, ...], value)
+                 result = [self._convert_types(x, seen) for x in typed_seq]
+                 seen.discard(obj_id)
+                 return result
+             case _:
+                 # Try str() as fallback
+                 try:
+                     return str(value)
+                 except Exception:
+                     return f"<{type(value).__name__}>"
+
+
+ def reconstruct_span_content(trace_root: Path, span_dir: Path, content_type: str) -> dict[str, Any]:
+     """Reconstruct full content from input.yaml/output.yaml + artifacts.
+
+     Args:
+         trace_root: Trace root directory
+         span_dir: Span directory containing input.yaml or output.yaml
+         content_type: "input" or "output"
+
+     Returns:
+         Complete reconstructed content with all artifact refs resolved
+     """
+     content_path = span_dir / f"{content_type}.yaml"
+     if not content_path.exists():
+         return {}
+
+     content = yaml.safe_load(content_path.read_text(encoding="utf-8"))
+     return _rehydrate(content, trace_root)
+
+
+ def _rehydrate(obj: Any, trace_root: Path) -> Any:
+     """Recursively replace content_ref entries with actual content."""
+     if isinstance(obj, dict):
+         obj_dict = cast(dict[str, Any], obj)
+         if "content_ref" in obj_dict:
+             # This is an artifact reference - load the full content
+             ref: dict[str, Any] = obj_dict["content_ref"]
+             artifact_path: Path = trace_root / ref["path"]
+
+             full_content: str | bytes
+             if ref.get("encoding") == "utf-8":
+                 full_content = artifact_path.read_text(encoding="utf-8")
+             else:
+                 full_content = artifact_path.read_bytes()
+
+             # Replace ref with full content
+             obj_dict = obj_dict.copy()
+             obj_dict["content"] = full_content
+             del obj_dict["content_ref"]
+             if "excerpt" in obj_dict:
+                 del obj_dict["excerpt"]
+
+         return {k: _rehydrate(v, trace_root) for k, v in obj_dict.items()}
+
+     if isinstance(obj, list):
+         obj_list = cast(list[Any], obj)
+         return [_rehydrate(v, trace_root) for v in obj_list]
+
+     return obj
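A round-trip sketch tying the pieces together. It assumes TraceDebugConfig can be constructed with its defaults and uses the module paths from the file list above; the span directory layout is illustrative:

from pathlib import Path

# module paths as listed in this diff (private modules)
from ai_pipeline_core.observability._debug._config import TraceDebugConfig
from ai_pipeline_core.observability._debug._content import (
    ArtifactStore,
    ContentWriter,
    reconstruct_span_content,
)

trace_root = Path("/tmp/trace-demo")
span_dir = trace_root / "spans" / "0001"   # illustrative span layout
span_dir.mkdir(parents=True, exist_ok=True)

# assumes a default-constructible config
writer = ContentWriter(TraceDebugConfig(), ArtifactStore(trace_root))
meta = writer.write([{"role": "user", "content": "hello"}], span_dir, "input")

rehydrated = reconstruct_span_content(trace_root, span_dir, "input")
assert rehydrated["type"] == "llm_messages"
print(meta["path"], meta["size_bytes"])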