ai-pipeline-core 0.2.9__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +32 -5
- ai_pipeline_core/debug/__init__.py +26 -0
- ai_pipeline_core/debug/config.py +91 -0
- ai_pipeline_core/debug/content.py +705 -0
- ai_pipeline_core/debug/processor.py +99 -0
- ai_pipeline_core/debug/summary.py +236 -0
- ai_pipeline_core/debug/writer.py +913 -0
- ai_pipeline_core/deployment/__init__.py +46 -0
- ai_pipeline_core/deployment/base.py +681 -0
- ai_pipeline_core/deployment/contract.py +84 -0
- ai_pipeline_core/deployment/helpers.py +98 -0
- ai_pipeline_core/documents/flow_document.py +1 -1
- ai_pipeline_core/documents/task_document.py +1 -1
- ai_pipeline_core/documents/temporary_document.py +1 -1
- ai_pipeline_core/flow/config.py +13 -2
- ai_pipeline_core/flow/options.py +4 -4
- ai_pipeline_core/images/__init__.py +362 -0
- ai_pipeline_core/images/_processing.py +157 -0
- ai_pipeline_core/llm/ai_messages.py +25 -4
- ai_pipeline_core/llm/client.py +15 -19
- ai_pipeline_core/llm/model_response.py +5 -5
- ai_pipeline_core/llm/model_types.py +10 -13
- ai_pipeline_core/logging/logging_mixin.py +2 -2
- ai_pipeline_core/pipeline.py +1 -1
- ai_pipeline_core/progress.py +127 -0
- ai_pipeline_core/prompt_builder/__init__.py +5 -0
- ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +23 -0
- ai_pipeline_core/prompt_builder/global_cache.py +78 -0
- ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +6 -0
- ai_pipeline_core/prompt_builder/prompt_builder.py +253 -0
- ai_pipeline_core/prompt_builder/system_prompt.jinja2 +41 -0
- ai_pipeline_core/tracing.py +54 -2
- ai_pipeline_core/utils/deploy.py +214 -6
- ai_pipeline_core/utils/remote_deployment.py +37 -187
- {ai_pipeline_core-0.2.9.dist-info → ai_pipeline_core-0.3.3.dist-info}/METADATA +96 -27
- ai_pipeline_core-0.3.3.dist-info/RECORD +57 -0
- {ai_pipeline_core-0.2.9.dist-info → ai_pipeline_core-0.3.3.dist-info}/WHEEL +1 -1
- ai_pipeline_core/simple_runner/__init__.py +0 -14
- ai_pipeline_core/simple_runner/cli.py +0 -254
- ai_pipeline_core/simple_runner/simple_runner.py +0 -247
- ai_pipeline_core-0.2.9.dist-info/RECORD +0 -41
- {ai_pipeline_core-0.2.9.dist-info → ai_pipeline_core-0.3.3.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/debug/content.py (new file)

@@ -0,0 +1,705 @@

```python
"""Content writing and extraction for trace debugging V3.

Uses hash-based artifact storage with automatic deduplication.
"""

import base64
import hashlib
import json
import re
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any
from uuid import UUID

import yaml
from pydantic import BaseModel, ConfigDict, SecretStr

from .config import TraceDebugConfig


class ContentRef(BaseModel):
    """Reference to content in artifact store."""

    hash: str  # "sha256:abcdef..."
    path: str  # "artifacts/sha256/ab/cd/abcdef...1234.txt"
    size_bytes: int
    mime_type: str | None = None
    encoding: str | None = None  # "utf-8" | "binary"

    model_config = ConfigDict(frozen=True)


class ArtifactStore:
    """Hash-based artifact storage with automatic deduplication.

    Stores large content elements in artifacts/sha256/<first2>/<next2>/<hash>.<ext>
    Identical content automatically deduplicates (same hash = same file).
    """

    def __init__(self, trace_path: Path):
        """Initialize artifact store for given trace path."""
        self._artifacts_path = trace_path / "artifacts" / "sha256"
        self._artifacts_path.mkdir(parents=True, exist_ok=True)
        self._known_hashes: dict[str, ContentRef] = {}
        self._trace_path = trace_path

    def store_text(self, text: str, mime_type: str = "text/plain") -> ContentRef:
        """Store text content, return reference."""
        data = text.encode("utf-8")
        content_hash = hashlib.sha256(data).hexdigest()

        if content_hash in self._known_hashes:
            return self._known_hashes[content_hash]

        # Create sharded path: ab/cd/abcdef...1234.txt
        file_path = (
            self._artifacts_path / content_hash[:2] / content_hash[2:4] / f"{content_hash}.txt"
        )
        file_path.parent.mkdir(parents=True, exist_ok=True)

        if not file_path.exists():
            file_path.write_bytes(data)

        ref = ContentRef(
            hash=f"sha256:{content_hash}",
            path=str(file_path.relative_to(self._trace_path)),
            size_bytes=len(data),
            mime_type=mime_type,
            encoding="utf-8",
        )

        self._known_hashes[content_hash] = ref
        return ref

    def store_binary(self, data: bytes, mime_type: str = "application/octet-stream") -> ContentRef:
        """Store binary content, return reference."""
        content_hash = hashlib.sha256(data).hexdigest()

        if content_hash in self._known_hashes:
            return self._known_hashes[content_hash]

        # Determine extension from mime type
        ext_map = {
            "image/png": ".png",
            "image/jpeg": ".jpg",
            "image/gif": ".gif",
            "application/pdf": ".pdf",
        }
        ext = ext_map.get(mime_type, ".bin")

        file_path = (
            self._artifacts_path / content_hash[:2] / content_hash[2:4] / f"{content_hash}{ext}"
        )
        file_path.parent.mkdir(parents=True, exist_ok=True)

        if not file_path.exists():
            file_path.write_bytes(data)

        ref = ContentRef(
            hash=f"sha256:{content_hash}",
            path=str(file_path.relative_to(self._trace_path)),
            size_bytes=len(data),
            mime_type=mime_type,
            encoding="binary",
        )

        self._known_hashes[content_hash] = ref
        return ref

    def get_stats(self) -> dict[str, int | float]:
        """Get deduplication statistics."""
        total_files = len(list(self._artifacts_path.rglob("*.*")))
        total_size = sum(f.stat().st_size for f in self._artifacts_path.rglob("*.*") if f.is_file())
        total_refs = len(self._known_hashes)

        return {
            "unique_artifacts": total_files,
            "total_references": total_refs,
            "total_bytes": total_size,
            "dedup_ratio": total_refs / total_files if total_files > 0 else 1.0,
        }
```
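The sharding scheme (first two hex characters, then the next two) bounds directory fan-out, and using the hash as the filename makes deduplication automatic. A quick sanity sketch using only what this file defines, run against a temp directory (assuming the module is importable as `ai_pipeline_core.debug.content`, per the file list above):

```python
import tempfile
from pathlib import Path

from ai_pipeline_core.debug.content import ArtifactStore

with tempfile.TemporaryDirectory() as tmp:
    store = ArtifactStore(Path(tmp))
    a = store.store_text("hello world")
    b = store.store_text("hello world")  # identical content, served from the in-memory cache
    assert a.hash == b.hash and a.path == b.path  # one file on disk
    # Sharded layout: artifacts/sha256/<first 2>/<next 2>/<full hash>.txt
    print(a.path)
    print(store.get_stats())
```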
```python
class ContentWriter:
    """Writes content as input.yaml / output.yaml with artifact externalization."""

    def __init__(self, config: TraceDebugConfig, artifact_store: ArtifactStore | None = None):
        """Initialize content writer with config and optional artifact store."""
        self._config = config
        self._compiled_patterns = [re.compile(p) for p in config.redact_patterns]
        self._artifact_store = artifact_store

    def write(self, content: Any, span_dir: Path, name: str) -> dict[str, Any]:
        """Write content as {name}.yaml with artifact externalization.

        Args:
            content: Raw content (LLM messages, documents, dicts, etc.)
            span_dir: Span directory
            name: "input" or "output"

        Returns:
            Metadata dict with type, path, size_bytes, breakdown
        """
        if content is None:
            return {"type": "none", "size_bytes": 0}

        # Structure content (recursive processing with externalization)
        structured = self._structure_content(content)

        # Serialize to YAML
        serialized = yaml.dump(
            structured,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
        serialized = self._redact(serialized)
        size = len(serialized.encode("utf-8"))

        # Check file size limit
        if size > self._config.max_file_bytes:
            # Reduce preview sizes to fit under limit
            structured = self._reduce_previews(structured)
            serialized = yaml.dump(
                structured, default_flow_style=False, allow_unicode=True, sort_keys=False
            )
            serialized = self._redact(serialized)
            size = len(serialized.encode("utf-8"))

        # If still over, truncate with warning
        if size > self._config.max_file_bytes:
            serialized = serialized[: self._config.max_file_bytes]
            max_bytes = self._config.max_file_bytes
            serialized += (
                f"\n\n# [TRUNCATED: original {size} bytes exceeded {max_bytes} limit]\n"
            )
            size = len(serialized.encode("utf-8"))

        # Write file
        file_path = span_dir / f"{name}.yaml"
        file_path.write_text(serialized, encoding="utf-8")

        return {
            "type": "file",
            "path": f"{name}.yaml",
            "size_bytes": size,
            "breakdown": self._extract_breakdown(structured),
        }
```
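`write()` structures, redacts, and size-guards in that order: previews shrink first, and hard truncation only kicks in if the file is still over the limit. A minimal end-to-end sketch; `TraceDebugConfig` lives in `debug/config.py`, which this diff adds but does not show, so the keyword arguments below are assumptions inferred from the fields this file reads (`redact_patterns`, `max_file_bytes`, `max_element_bytes`, `element_excerpt_bytes`, `extract_base64_images`):

```python
import tempfile
from pathlib import Path

from ai_pipeline_core.debug.config import TraceDebugConfig
from ai_pipeline_core.debug.content import ArtifactStore, ContentWriter

# Assumed constructor: field names taken from the attribute accesses in this file.
config = TraceDebugConfig(
    redact_patterns=[r"sk-[A-Za-z0-9]+"],
    max_file_bytes=1_000_000,
    max_element_bytes=10_000,
    element_excerpt_bytes=500,
    extract_base64_images=True,
)

with tempfile.TemporaryDirectory() as tmp:
    span_dir = Path(tmp)
    writer = ContentWriter(config, ArtifactStore(span_dir))
    meta = writer.write({"note": "key sk-abc123 goes in"}, span_dir, "input")
    # meta -> {"type": "file", "path": "input.yaml", "size_bytes": ..., "breakdown": {...}}
    assert "sk-abc123" not in (span_dir / "input.yaml").read_text()  # redacted on the way out
```

The class continues below.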
```python
    def _structure_content(self, content: Any) -> dict[str, Any]:
        """Convert raw content to structured YAML-ready format."""
        if self._is_llm_messages(content):
            return self._structure_llm_messages(content)
        elif self._is_document_list(content):
            return self._structure_documents(content)
        else:
            return self._structure_generic(content)

    def _is_llm_messages(self, content: Any) -> bool:
        """Check if content looks like LLM messages."""
        if not isinstance(content, list):
            return False
        if not content:
            return False
        first = content[0]
        if not isinstance(first, dict):
            return False
        return "role" in first and "content" in first

    def _is_document_list(self, content: Any) -> bool:
        """Check if content looks like a DocumentList."""
        if not isinstance(content, list):
            return False
        if not content:
            return False
        first = content[0]
        if not isinstance(first, dict):
            return False
        return "base_type" in first and "content" in first

    def _structure_llm_messages(self, messages: list[Any]) -> dict[str, Any]:
        """Structure LLM messages preserving ALL parts losslessly."""
        message_entries: list[dict[str, Any]] = []

        total_text_bytes = 0
        total_image_bytes = 0
        total_tool_bytes = 0

        for i, msg in enumerate(messages):
            role = msg.get("role", "unknown")
            content = msg.get("content")

            msg_entry: dict[str, Any] = {
                "index": i,
                "role": role,
            }

            if isinstance(content, list):
                # Multimodal: preserve each part separately
                msg_entry["parts"] = []
                for j, part in enumerate(content):
                    structured_part, part_bytes = self._structure_message_part(part, j)
                    msg_entry["parts"].append(structured_part)
                    part_type = structured_part.get("type", "")
                    if part_type == "text":
                        total_text_bytes += part_bytes
                    elif part_type == "image":
                        total_image_bytes += part_bytes
                    elif part_type in ("tool_use", "tool_result"):
                        total_tool_bytes += part_bytes
            elif isinstance(content, str):
                # Simple text message
                text_entry = self._structure_text_element(content, 0)
                msg_entry["parts"] = [text_entry]
                total_text_bytes += text_entry.get("size_bytes", 0)
            elif content is None:
                msg_entry["parts"] = []
            else:
                msg_entry["parts"] = [{"type": "unknown", "sequence": 0, "raw": str(content)}]

            # Preserve tool_calls at message level (OpenAI format)
            if "tool_calls" in msg:
                msg_entry["tool_calls"] = self._convert_types(msg["tool_calls"])
            if "function_call" in msg:
                msg_entry["function_call"] = self._convert_types(msg["function_call"])
            if "tool_call_id" in msg:
                msg_entry["tool_call_id"] = msg["tool_call_id"]
            if "name" in msg:
                msg_entry["name"] = msg["name"]

            message_entries.append(msg_entry)

        return {
            "format_version": 3,
            "type": "llm_messages",
            "message_count": len(messages),
            "messages": message_entries,
            "metadata": {
                "total_text_bytes": total_text_bytes,
                "total_image_bytes": total_image_bytes,
                "total_tool_bytes": total_tool_bytes,
            },
            "size_bytes": total_text_bytes + total_image_bytes + total_tool_bytes,
        }
```
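Detection is duck-typed: a non-empty list whose first element is a dict with `role` and `content` is treated as chat messages, `base_type` plus `content` means documents, and everything else falls through to generic. Continuing the sketch above, exercising the private helper purely for illustration:

```python
messages = [
    {"role": "user", "content": "Describe this image."},
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Looking now."},
            {"type": "tool_use", "id": "t1", "name": "zoom", "input": {"x": 1}},
        ],
    },
]
doc = writer._structure_llm_messages(messages)  # writer from the previous sketch
assert doc["type"] == "llm_messages" and doc["message_count"] == 2
# Each message keeps index/role plus a "parts" list; byte totals land in doc["metadata"].
assert [p["type"] for p in doc["messages"][1]["parts"]] == ["text", "tool_use"]
```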
```python
    def _structure_message_part(
        self, part: dict[str, Any], sequence: int
    ) -> tuple[dict[str, Any], int]:
        """Structure a single message part losslessly.

        Returns:
            Tuple of (structured_dict, size_bytes)
        """
        part_type = part.get("type", "")

        if part_type == "text":
            entry = self._structure_text_element(part.get("text", ""), sequence)
            return entry, entry.get("size_bytes", 0)
        elif part_type == "image_url":
            entry = self._structure_image_openai(part, sequence)
            return entry, entry.get("size_bytes", 0)
        elif part_type == "image":
            entry = self._structure_image_anthropic(part, sequence)
            return entry, entry.get("size_bytes", 0)
        elif part_type == "tool_use":
            input_str = json.dumps(part.get("input", {}))
            size = len(input_str.encode("utf-8"))
            return {
                "type": "tool_use",
                "sequence": sequence,
                "id": part.get("id"),
                "name": part.get("name"),
                "input": self._convert_types(part.get("input")),
            }, size
        elif part_type == "tool_result":
            result_content = part.get("content")
            entry: dict[str, Any] = {
                "type": "tool_result",
                "sequence": sequence,
                "tool_use_id": part.get("tool_use_id"),
                "is_error": part.get("is_error", False),
            }
            size = 0
            if isinstance(result_content, str):
                text_entry = self._structure_text_element(result_content, 0)
                entry["content"] = text_entry
                size = text_entry.get("size_bytes", 0)
            elif isinstance(result_content, list):
                entry["content"] = []
                for k, p in enumerate(result_content):
                    part_entry, part_size = self._structure_message_part(p, k)
                    entry["content"].append(part_entry)
                    size += part_size
            else:
                entry["content"] = self._convert_types(result_content)
            return entry, size
        else:
            # Unknown type — preserve raw data, never drop
            raw = self._convert_types(part)
            raw_str = json.dumps(raw)
            size = len(raw_str.encode("utf-8"))
            return {
                "type": "unknown",
                "sequence": sequence,
                "original_type": part_type,
                "raw_data": raw,
            }, size

    def _structure_text_element(self, text: str, sequence: int) -> dict[str, Any]:
        """Structure a text element, optionally externalizing large content."""
        text = self._redact(text)
        text_bytes = len(text.encode("utf-8"))

        entry: dict[str, Any] = {
            "type": "text",
            "sequence": sequence,
            "size_bytes": text_bytes,
        }

        if text_bytes > self._config.max_element_bytes:
            # Store full content in artifact store
            if self._artifact_store:
                ref = self._artifact_store.store_text(text, "text/plain")
                excerpt_len = self._config.element_excerpt_bytes
                entry["content_ref"] = {
                    "hash": ref.hash,
                    "path": ref.path,
                    "mime_type": ref.mime_type,
                    "encoding": ref.encoding,
                }
                entry["excerpt"] = (
                    text[:excerpt_len] + "\n[TRUNCATED - see artifact for full content]"
                )
            else:
                # No artifact store — truncate with marker
                entry["content"] = text[: self._config.max_element_bytes]
                entry["truncated"] = True
                entry["original_size_bytes"] = text_bytes
        else:
            entry["content"] = text

        return entry

    def _structure_image_openai(self, part: dict[str, Any], sequence: int) -> dict[str, Any]:
        """Structure OpenAI format image part."""
        url = part.get("image_url", {}).get("url", "")
        detail = part.get("image_url", {}).get("detail", "auto")

        if not url.startswith("data:image/"):
            return {
                "type": "image_url",
                "sequence": sequence,
                "url": url,
                "detail": detail,
                "size_bytes": 0,
            }

        match = re.match(r"data:image/(\w+);base64,(.+)", url)
        if not match:
            return {
                "type": "image_parse_error",
                "sequence": sequence,
                "url_preview": url[:100],
                "size_bytes": 0,
            }

        ext, b64_data = match.groups()
        estimated_size = len(b64_data) * 3 // 4
        content_hash = hashlib.sha256(b64_data.encode()).hexdigest()

        entry: dict[str, Any] = {
            "type": "image",
            "sequence": sequence,
            "format": ext,
            "size_bytes": estimated_size,
            "hash": content_hash[:16],
            "detail": detail,
        }

        # Extract if configured
        if self._config.extract_base64_images and self._artifact_store:
            try:
                image_bytes = base64.b64decode(b64_data)
                ref = self._artifact_store.store_binary(image_bytes, f"image/{ext}")
                entry["content_ref"] = {
                    "hash": ref.hash,
                    "path": ref.path,
                    "mime_type": ref.mime_type,
                    "encoding": ref.encoding,
                }
                entry["preview"] = f"[{ext.upper()} image, {estimated_size} bytes]"
                entry["extracted"] = True
            except Exception as e:
                entry["extract_error"] = str(e)
                entry["extracted"] = False
        else:
            entry["extracted"] = False

        return entry

    def _structure_image_anthropic(self, part: dict[str, Any], sequence: int) -> dict[str, Any]:
        """Structure Anthropic format image part."""
        source = part.get("source", {})
        media_type = source.get("media_type", "image/png")
        ext = media_type.split("/")[-1] if "/" in media_type else "png"

        if source.get("type") != "base64":
            return {
                "type": "image",
                "sequence": sequence,
                "source_type": source.get("type"),
                "format": ext,
                "size_bytes": 0,
            }

        b64_data = source.get("data", "")
        estimated_size = len(b64_data) * 3 // 4 if b64_data else 0
        content_hash = hashlib.sha256(b64_data.encode()).hexdigest() if b64_data else "empty"

        entry: dict[str, Any] = {
            "type": "image",
            "sequence": sequence,
            "format": ext,
            "size_bytes": estimated_size,
            "hash": content_hash[:16],
        }

        if self._config.extract_base64_images and self._artifact_store and b64_data:
            try:
                image_bytes = base64.b64decode(b64_data)
                ref = self._artifact_store.store_binary(image_bytes, media_type)
                entry["content_ref"] = {
                    "hash": ref.hash,
                    "path": ref.path,
                    "mime_type": ref.mime_type,
                    "encoding": ref.encoding,
                }
                entry["preview"] = f"[{ext.upper()} image, {estimated_size} bytes]"
                entry["extracted"] = True
            except Exception as e:
                entry["extract_error"] = str(e)
                entry["extracted"] = False
        else:
            entry["extracted"] = False

        return entry
```
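Both image paths estimate the decoded size as `len(b64_data) * 3 // 4`: four base64 characters carry three bytes, so the estimate can overshoot the true size by up to two bytes of `=` padding, which is harmless for reporting. For instance:

```python
import base64

payload = b"\x00" * 1000
b64 = base64.b64encode(payload).decode()
assert len(b64) * 3 // 4 == 1002           # the estimate used above (includes padding)
assert len(base64.b64decode(b64)) == 1000  # actual decoded size
```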
```python
    def _structure_documents(self, docs: list[Any]) -> dict[str, Any]:
        """Structure document list."""
        doc_entries: list[dict[str, Any]] = []

        for i, doc in enumerate(docs):
            doc_name = doc.get("name", f"doc_{i}")
            base_type = doc.get("base_type", "unknown")
            content = doc.get("content", "")
            content_encoding = doc.get("content_encoding", "utf-8")

            doc_entry: dict[str, Any] = {
                "index": i,
                "name": doc_name,
                "base_type": base_type,
            }

            if content_encoding == "base64":
                # Binary content
                try:
                    binary_data = base64.b64decode(content)
                    size = len(binary_data)
                    doc_entry["size_bytes"] = size
                    doc_entry["encoding"] = "base64"

                    if size > self._config.max_element_bytes and self._artifact_store:
                        # Externalize binary
                        mime_type = doc.get("mime_type", "application/octet-stream")
                        ref = self._artifact_store.store_binary(binary_data, mime_type)
                        doc_entry["content_ref"] = {
                            "hash": ref.hash,
                            "path": ref.path,
                            "mime_type": ref.mime_type,
                            "encoding": ref.encoding,
                        }
                        doc_entry["preview"] = f"[Binary content, {size} bytes]"
                    else:
                        doc_entry["content"] = content  # Keep base64 inline
                except Exception:
                    doc_entry["content"] = "[binary content - decode failed]"
                    doc_entry["size_bytes"] = 0
            else:
                # Text content
                text = self._redact(str(content))
                text_bytes = len(text.encode("utf-8"))
                doc_entry["size_bytes"] = text_bytes

                if text_bytes > self._config.max_element_bytes and self._artifact_store:
                    ref = self._artifact_store.store_text(text)
                    excerpt_len = self._config.element_excerpt_bytes
                    doc_entry["content_ref"] = {
                        "hash": ref.hash,
                        "path": ref.path,
                        "mime_type": ref.mime_type,
                        "encoding": ref.encoding,
                    }
                    doc_entry["excerpt"] = (
                        text[:excerpt_len] + "\n[TRUNCATED - see artifact for full content]"
                    )
                else:
                    doc_entry["content"] = text

            doc_entries.append(doc_entry)

        return {
            "format_version": 3,
            "type": "document_list",
            "document_count": len(docs),
            "documents": doc_entries,
        }

    def _structure_generic(self, content: Any) -> dict[str, Any]:
        """Structure generic content."""
        converted = self._convert_types(content)
        serialized = json.dumps(converted)
        size = len(serialized.encode("utf-8"))

        return {
            "format_version": 3,
            "type": "generic",
            "size_bytes": size,
            "content": converted,
        }
```
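The document path expects plain dicts shaped like this package's serialized documents (`name`, `base_type`, `content`, `content_encoding`, optionally `mime_type`); the field values below are illustrative only. Continuing the earlier sketch:

```python
doc = {
    "name": "report.md",
    "base_type": "flow",  # illustrative value, not taken from this diff
    "content": "# Quarterly report\n...",
    "content_encoding": "utf-8",
}
structured = writer._structure_documents([doc])
assert structured["type"] == "document_list"
assert structured["documents"][0]["name"] == "report.md"
assert "content" in structured["documents"][0]  # small text stays inline, no content_ref
```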
```python
    def _extract_breakdown(self, structured: dict[str, Any]) -> dict[str, int]:
        """Extract size breakdown from already-structured content.

        Uses metadata computed during structuring (which has access to full
        image data) rather than recalculating from LMNR attributes (where
        base64 image data is stripped).
        """
        if structured.get("type") == "llm_messages":
            metadata = structured.get("metadata", {})
            return {
                "text_bytes": metadata.get("total_text_bytes", 0),
                "image_bytes": metadata.get("total_image_bytes", 0),
                "tool_bytes": metadata.get("total_tool_bytes", 0),
            }
        elif "size_bytes" in structured:
            return {"total_bytes": structured["size_bytes"]}
        else:
            serialized = json.dumps(self._convert_types(structured))
            return {"total_bytes": len(serialized.encode("utf-8"))}

    def _reduce_previews(self, structured: dict[str, Any]) -> dict[str, Any]:
        """Reduce preview/excerpt sizes to fit file under max_file_bytes."""
        if structured.get("type") == "llm_messages":
            # Reduce excerpt sizes in messages
            for msg in structured.get("messages", []):
                for part in msg.get("parts", []):
                    if "excerpt" in part:
                        # Reduce to 500 bytes
                        part["excerpt"] = part["excerpt"][:500] + "\n[TRUNCATED]"
        return structured

    def _redact(self, text: str) -> str:
        """Apply redaction patterns to text."""
        for pattern in self._compiled_patterns:
            text = pattern.sub("[REDACTED]", text)
        return text
```
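Redaction runs twice: once per text element before externalization, so the artifacts on disk are already scrubbed, and again over the serialized YAML. A sketch reusing the imports and `config` from the sketches above:

```python
secret = "token sk-LIVE1234 end " + "pad " * 5_000  # big enough to externalize
with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    w = ContentWriter(config, ArtifactStore(root))
    w.write([{"role": "user", "content": secret}], root, "input")
    artifact = next((root / "artifacts").rglob("*.txt"))
    assert "sk-LIVE1234" not in artifact.read_text()  # scrubbed before storage
```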
```python
    def _convert_types(self, value: Any, seen: set[int] | None = None) -> Any:
        """Convert non-serializable types recursively with cycle detection."""
        # Cycle detection
        if seen is None:
            seen = set()

        obj_id = id(value)
        if obj_id in seen:
            return "[circular reference]"

        match value:
            case None | bool() | int() | float() | str():
                return value
            case SecretStr():
                return "[REDACTED:SecretStr]"
            case bytes():
                if len(value) < 100:
                    return f"[bytes: {len(value)} bytes, preview: {value[:50].hex()}...]"
                return f"[bytes: {len(value)} bytes]"
            case Path():
                return str(value)
            case UUID():
                return str(value)
            case datetime():
                return value.isoformat()
            case Enum():
                return value.value
            case set() | frozenset():
                return sorted(str(x) for x in value)
            case BaseModel():
                try:
                    return value.model_dump(mode="json")
                except Exception:
                    return str(value)
            case dict():
                seen.add(obj_id)
                result = {str(k): self._convert_types(v, seen) for k, v in value.items()}
                seen.discard(obj_id)
                return result
            case list() | tuple():
                seen.add(obj_id)
                result = [self._convert_types(x, seen) for x in value]
                seen.discard(obj_id)
                return result
            case _:
                # Try str() as fallback
                try:
                    return str(value)
                except Exception:
                    return f"<{type(value).__name__}>"
```
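`_convert_types` is a structural match over everything YAML/JSON cannot take natively, with `id()`-based cycle detection on containers. Continuing the sketch:

```python
from datetime import datetime, timezone
from uuid import uuid4

data: dict = {"id": uuid4(), "when": datetime.now(timezone.utc), "tags": {"b", "a"}}
data["self"] = data  # deliberate cycle
out = writer._convert_types(data)
assert out["self"] == "[circular reference]"
assert isinstance(out["id"], str) and out["tags"] == ["a", "b"]  # UUID stringified, set sorted
```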
```python
def reconstruct_span_content(trace_root: Path, span_dir: Path, content_type: str) -> dict[str, Any]:
    """Reconstruct full content from input.yaml/output.yaml + artifacts.

    Args:
        trace_root: Trace root directory
        span_dir: Span directory containing input.yaml or output.yaml
        content_type: "input" or "output"

    Returns:
        Complete reconstructed content with all artifact refs resolved
    """
    content_path = span_dir / f"{content_type}.yaml"
    if not content_path.exists():
        return {}

    content = yaml.safe_load(content_path.read_text(encoding="utf-8"))
    return _rehydrate(content, trace_root)


def _rehydrate(obj: Any, trace_root: Path) -> Any:
    """Recursively replace content_ref entries with actual content."""
    if isinstance(obj, dict):
        if "content_ref" in obj:
            # This is an artifact reference - load the full content
            ref = obj["content_ref"]
            artifact_path = trace_root / ref["path"]

            if ref.get("encoding") == "utf-8":
                full_content = artifact_path.read_text(encoding="utf-8")
            else:
                full_content = artifact_path.read_bytes()

            # Replace ref with full content
            obj = obj.copy()
            obj["content"] = full_content
            del obj["content_ref"]
            if "excerpt" in obj:
                del obj["excerpt"]

        return {k: _rehydrate(v, trace_root) for k, v in obj.items()}

    elif isinstance(obj, list):
        return [_rehydrate(v, trace_root) for v in obj]

    return obj
```