tracellm-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/__init__.py +1 -0
- app/database/__init__.py +1 -0
- app/database/mongodb.py +94 -0
- app/database/project_service.py +97 -0
- app/database/trace_service.py +417 -0
- app/main.py +44 -0
- app/models/__init__.py +14 -0
- app/models/health.py +5 -0
- app/models/project.py +32 -0
- app/models/trace.py +71 -0
- app/models/trace_model.py +62 -0
- app/routes/__init__.py +1 -0
- app/routes/health.py +10 -0
- app/routes/observability.py +60 -0
- app/routes/projects.py +25 -0
- app/websocket/__init__.py +1 -0
- app/websocket/socket.py +64 -0
- sdk/__init__.py +3 -0
- sdk/tracer.py +8 -0
- tracellm/__init__.py +6 -0
- tracellm/banner.py +34 -0
- tracellm/cli.py +124 -0
- tracellm/db.py +75 -0
- tracellm/exporter.py +65 -0
- tracellm/integrations/__init__.py +4 -0
- tracellm/integrations/langchain.py +186 -0
- tracellm/integrations/openai.py +234 -0
- tracellm/integrations/tool_tracer.py +151 -0
- tracellm/mascot.py +49 -0
- tracellm/monitor.py +381 -0
- tracellm/palette.py +186 -0
- tracellm/replay.py +80 -0
- tracellm/startup.py +121 -0
- tracellm/summary.py +53 -0
- tracellm/trace_stream.py +68 -0
- tracellm/tracer.py +598 -0
- tracellm/tree_renderer.py +78 -0
- tracellm/utils.py +390 -0
- tracellm_cli-0.1.0.dist-info/METADATA +30 -0
- tracellm_cli-0.1.0.dist-info/RECORD +43 -0
- tracellm_cli-0.1.0.dist-info/WHEEL +5 -0
- tracellm_cli-0.1.0.dist-info/entry_points.txt +2 -0
- tracellm_cli-0.1.0.dist-info/top_level.txt +3 -0
tracellm/tracer.py
ADDED
|
@@ -0,0 +1,598 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import contextvars
|
|
3
|
+
import functools
|
|
4
|
+
import inspect
|
|
5
|
+
import time
|
|
6
|
+
import uuid
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from typing import Any, Callable
|
|
9
|
+
|
|
10
|
+
from tracellm.db import resolve_api_key, save_trace_payload
|
|
11
|
+
from tracellm.mascot import MascotState, header, message
|
|
12
|
+
from tracellm.summary import print_summary
|
|
13
|
+
from tracellm.trace_stream import TraceStream
|
|
14
|
+
from tracellm.utils import (
|
|
15
|
+
SLOW_TRACE_THRESHOLD_MS,
|
|
16
|
+
coerce_failure_reason,
|
|
17
|
+
coerce_response,
|
|
18
|
+
coerce_retry_count,
|
|
19
|
+
coerce_status,
|
|
20
|
+
coerce_steps,
|
|
21
|
+
console,
|
|
22
|
+
estimate_tokens,
|
|
23
|
+
render_trace_report,
|
|
24
|
+
simulate_step,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Per-execution-context trace state. The wrappers created by `trace` set it to
# a dict (project info plus "collected_steps" and "retry_count") for the
# duration of a call; `build_trace_payload` reads it as a fallback source for
# steps and retry count. It is None when no traced call is active.
_current_trace_context: contextvars.ContextVar[dict[str, Any] | None] = contextvars.ContextVar(
    "_current_trace_context", default=None
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def build_trace_payload(
    prompt: str,
    model_name: str,
    project_id: str,
    project_name: str | None,
    api_key: str | None,
    environment: str,
    result: Any,
    trace_error: Exception | None,
    started_at: datetime,
    latency: float,
) -> dict[str, Any]:
    """Assemble the dict that describes one finished trace.

    Response text, steps, retry count, status and failure reason are first
    derived from ``result`` through the ``coerce_*`` helpers. When a trace
    context is active it supplies the steps and retry count that ``result``
    did not provide. A non-None ``trace_error`` forces the status to
    ``"failed"`` and becomes the failure reason.

    Returns:
        A new dict with a fresh ``trace_id`` plus the prompt, response,
        latency, token estimate, project/environment fields, status, steps,
        retry count, failure reason, slow-request flag and timestamps.
    """
    text = coerce_response(result)
    step_list = coerce_steps(result)
    retries = coerce_retry_count(result)
    # Status is derived from the result's own retry count, before any
    # fallback to the context value below.
    outcome = coerce_status(result, retries)
    reason = coerce_failure_reason(result)

    active = _current_trace_context.get()
    if active:
        if not step_list:
            step_list = active.get("collected_steps", [])
        if not retries:
            retries = active.get("retry_count", 0)

    if trace_error is not None:
        outcome, reason = "failed", str(trace_error)
        text = text or ""

    payload: dict[str, Any] = {
        "trace_id": str(uuid.uuid4()),
        "prompt": prompt,
        "response": text,
        "latency": latency,
        "token_count": estimate_tokens(prompt, text, step_list),
        "model_name": model_name,
        "project_id": project_id,
        "project_name": project_name,
        "api_key": api_key,
        "environment": environment,
        "status": outcome,
        "steps": step_list,
        "retry_count": retries,
        "failure_reason": reason,
        "slow_request": latency >= SLOW_TRACE_THRESHOLD_MS,
        "created_at": started_at.isoformat(),
        "updated_at": datetime.now(timezone.utc).isoformat(),
    }
    return payload
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def persist_trace(trace_data: dict[str, Any]) -> None:
    """Save a trace payload on a best-effort basis.

    Any exception raised by ``save_trace_payload`` is caught and reported on
    the console instead of being propagated to the caller.
    """
    try:
        save_trace_payload(trace_data)
    except Exception as exc:
        notice = f"[yellow]Trace persistence skipped:[/yellow] {exc}"
        console.print(notice)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def finalize_trace(
    prompt: str,
    model_name: str,
    project_id: str,
    project_name: str | None,
    api_key: str | None,
    environment: str,
    result: Any,
    trace_error: Exception | None,
    started_at: datetime,
    latency: float,
    render: bool = True,
) -> dict[str, Any]:
    """Build the payload for a finished trace, persist it, optionally render it.

    Args:
        render: When true, the payload is also passed to
            ``render_trace_report`` after it has been persisted.

    Returns:
        The payload dict produced by ``build_trace_payload``.
    """
    payload = build_trace_payload(
        prompt=prompt,
        model_name=model_name,
        project_id=project_id,
        project_name=project_name,
        api_key=api_key,
        environment=environment,
        result=result,
        trace_error=trace_error,
        started_at=started_at,
        latency=latency,
    )
    persist_trace(payload)
    if not render:
        return payload
    render_trace_report(payload)
    return payload
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _resolve_project_context(
    api_key: str | None,
    project: str | None,
    environment: str | None,
) -> tuple[str, str | None, str, str | None]:
    """Work out ``(project_id, project_name, environment, api_key)`` for a trace.

    With an API key, the key record from ``resolve_api_key`` supplies the
    project id, and it fills in the project name and environment when the
    caller did not pass them. If that lookup raises, or no key is given, the
    result falls back to ``project or "default"`` and
    ``environment or "development"``.
    """
    fallback_project_id = project or "default"
    fallback_environment = environment or "development"

    if not api_key:
        return (fallback_project_id, project, fallback_environment, None)

    try:
        record = resolve_api_key(api_key)
        return (
            record.project_id,
            project or record.project_id,
            environment or record.environment,
            record.key,
        )
    except Exception:
        # Lookup failed: keep the caller-supplied key but use local defaults.
        return (fallback_project_id, project, fallback_environment, api_key)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def trace(
    prompt: str = "",
    model_name: str = "unknown",
    api_key: str | None = None,
    project: str | None = None,
    environment: str = "development",
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Return a decorator that records every call of a function as a trace.

    Works for both plain and ``async def`` functions. Each call resolves the
    project context, installs a fresh trace context, runs the function, and
    then finalizes (persists and renders) a trace whether the call returned
    or raised. Exceptions from the wrapped function are re-raised unchanged.

    Args:
        prompt: Prompt text stored on the trace; defaults to the wrapped
            function's ``__name__`` when empty.
        model_name: Model label stored on the trace.
        api_key: Optional key used to resolve project and environment.
        project: Optional project name.
        environment: Environment label passed to the project resolution.
    """

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        def _begin() -> dict[str, Any]:
            """Capture start times, resolve the project and install the trace context."""
            started_at = datetime.now(timezone.utc)
            start = time.perf_counter()
            project_id, project_name, effective_environment, resolved_key = _resolve_project_context(
                api_key=api_key, project=project, environment=environment,
            )
            ctx_token = _current_trace_context.set({
                "project_id": project_id,
                "project_name": project_name,
                "environment": effective_environment,
                "api_key": resolved_key,
                "collected_steps": [],
                "retry_count": 0,
            })
            return {
                "started_at": started_at,
                "start": start,
                "project_id": project_id,
                "project_name": project_name,
                "environment": effective_environment,
                "api_key": resolved_key,
                "ctx_token": ctx_token,
            }

        def _finish(run: dict[str, Any], result: Any, trace_error: Exception | None) -> None:
            """Finalize the trace, then always restore the previous trace context."""
            try:
                latency = round((time.perf_counter() - run["start"]) * 1000, 2)
                # Must run before the reset: the payload builder reads the
                # active context for collected steps and retry count.
                finalize_trace(
                    prompt=prompt or func.__name__,
                    model_name=model_name,
                    project_id=run["project_id"],
                    project_name=run["project_name"],
                    api_key=run["api_key"],
                    environment=run["environment"],
                    result=result,
                    trace_error=trace_error,
                    started_at=run["started_at"],
                    latency=latency,
                    render=True,
                )
            finally:
                # Reset even when finalization raises, so a failed report
                # cannot leak this call's context into later calls.
                _current_trace_context.reset(run["ctx_token"])

        # NOTE(review): only Exception subclasses are captured as trace_error.
        # asyncio.CancelledError derives from BaseException, so a cancelled
        # coroutine is finalized with trace_error=None - confirm that is the
        # intended status for cancelled calls.
        if inspect.iscoroutinefunction(func):

            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                run = _begin()
                result: Any = None
                trace_error: Exception | None = None
                try:
                    result = await func(*args, **kwargs)
                    return result
                except Exception as error:
                    trace_error = error
                    raise
                finally:
                    _finish(run, result, trace_error)

            return async_wrapper

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            run = _begin()
            result: Any = None
            trace_error: Exception | None = None
            try:
                result = func(*args, **kwargs)
                return result
            except Exception as error:
                trace_error = error
                raise
            finally:
                _finish(run, result, trace_error)

        return wrapper

    return decorator
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def simulate_llm_response(prompt: str = "Explain transformers for a production RAG + agent engineering team.") -> dict[str, Any]:
    """Produce a simulated multi-step RAG/agent run with randomized details.

    Each stage is reported through ``simulate_step`` with the shared ``steps``
    list, and a final "response_generation" step is appended directly. The
    call blocks for roughly 0.95-1.9 seconds in ``time.sleep`` to mimic
    generation latency (``simulate_step`` is also given delay bounds;
    presumably it sleeps too - confirm in ``tracellm.utils``).

    Args:
        prompt: Question recorded as the query of the embedding and
            retrieval steps. The response text itself is fixed.

    Returns:
        A dict with "response", "status" ("warning" when at least one retry
        was simulated, otherwise "success"), "retry_count", "steps" and an
        "observability" summary.
    """
    import random

    question = prompt
    # Short id shared by every step of this simulated run.
    session_id = str(uuid.uuid4())[:8]
    steps: list[dict[str, Any]] = []
    # 0-2 simulated failed tool lookups before the successful attempt.
    retry_count = random.randint(0, 2)
    attempt_count = retry_count + 1
    corpus_options = [
        "attention_is_all_you_need",
        "rag_failure_playbook",
        "agent_latency_benchmarks",
        "toolformer_notes",
        "long_context_eval_report",
        "retrieval_system_design",
    ]

    # Stage 1: query embedding.
    embedding_dims = random.choice([1536, 3072])
    query_vector_checksum = hex(random.getrandbits(24))
    simulate_step(
        steps=steps,
        tool_name="query_embedding",
        input_data={
            "session_id": session_id,
            "query": question,
            "embedding_model": "text-embedding-3-large",
        },
        output_data={
            "vector_dimensions": embedding_dims,
            "embedding_norm": round(random.uniform(0.98, 1.04), 4),
            "checksum": query_vector_checksum,
            "replay": {"stage": "embedding", "seed_hint": session_id},
        },
        min_delay=0.08,
        max_delay=0.22,
        random_module=random,
    )

    # Stage 2: vector retrieval.
    retrieved_docs = random.randint(14, 24)
    top_k = random.randint(6, 9)
    simulate_step(
        steps=steps,
        tool_name="vector_retrieval",
        input_data={
            "session_id": session_id,
            "query": question,
            "index": "research_embeddings_v2",
            "top_k": top_k,
            "filters": {"domain": "llm-systems", "freshness_days": 180},
        },
        output_data={
            "documents_found": retrieved_docs,
            "candidate_chunks": top_k,
            "latency_bucket": random.choice(["p50", "p75", "p95"]),
            # Sample size is capped at the corpus size so random.sample
            # cannot be asked for more items than exist.
            "selected_ids": random.sample(corpus_options, k=min(top_k, len(corpus_options))),
            "replay": {
                "stage": "retrieval",
                "query_hash": query_vector_checksum,
                "cursor": f"retrieval:{session_id}",
            },
        },
        min_delay=0.18,
        max_delay=0.42,
        random_module=random,
    )

    # Stage 3: reranking.
    reranked_chunks = random.randint(4, 6)
    simulate_step(
        steps=steps,
        tool_name="rerank_context",
        input_data={
            "session_id": session_id,
            "strategy": "cross-encoder",
            "candidate_count": top_k,
        },
        output_data={
            "reranked_chunks": reranked_chunks,
            "coverage_score": round(random.uniform(0.82, 0.96), 3),
            "dropped_chunks": max(0, top_k - reranked_chunks),
            "replay": {"stage": "rerank", "selected_chunk_count": reranked_chunks},
        },
        min_delay=0.09,
        max_delay=0.24,
        random_module=random,
    )

    # Stage 4: planning.
    simulate_step(
        steps=steps,
        tool_name="agent_planner",
        input_data={
            "session_id": session_id,
            "mode": "multi-hop-reasoning",
            "objective": "teach architecture and operational tradeoffs",
        },
        output_data={
            "plan": [
                "summarize transformer core concepts",
                "connect self-attention to scaling behavior",
                "map concepts to RAG and tool-using agents",
                "call out failure modes and observability metrics",
            ],
            "planner_confidence": round(random.uniform(0.81, 0.94), 3),
            "requires_tool_validation": retry_count > 0,
            "replay": {"stage": "planning", "plan_id": f"plan-{session_id}"},
        },
        min_delay=0.12,
        max_delay=0.31,
        random_module=random,
    )

    # Stage 5: context budgeting.
    simulate_step(
        steps=steps,
        tool_name="context_window_allocator",
        input_data={
            "session_id": session_id,
            "budget_tokens": random.randint(4800, 7200),
            "response_budget": random.randint(900, 1400),
        },
        output_data={
            "allocated_context_tokens": random.randint(3000, 5200),
            "reserved_for_tools": random.randint(500, 900),
            "compression_applied": random.choice([True, False]),
            "replay": {
                "stage": "budgeting",
                "slot_map": ["system", "retrieval", "tools", "generation"],
            },
        },
        min_delay=0.05,
        max_delay=0.18,
        random_module=random,
    )

    # Stage 6 (optional): one failed lookup plus a retry-guard step per retry.
    if retry_count > 0:
        for attempt in range(1, attempt_count):
            simulate_step(
                steps=steps,
                tool_name="tool_schema_lookup",
                input_data={
                    "session_id": session_id,
                    "attempt": attempt,
                    "requested_tool": "citation_builder",
                },
                output_data={
                    "error": random.choice(
                        [
                            "schema registry timeout",
                            "stale tool contract version",
                            "partial metadata returned",
                        ]
                    ),
                    "retryable": True,
                    "replay": {
                        "stage": "tool_lookup",
                        "attempt": attempt,
                        "decision": "retry",
                    },
                },
                min_delay=0.11,
                max_delay=0.28,
                random_module=random,
                success=False,
            )
            simulate_step(
                steps=steps,
                tool_name="retry_guard",
                input_data={
                    "session_id": session_id,
                    "attempt": attempt,
                    "policy": "exponential_backoff_with_jitter",
                },
                output_data={
                    "status": "retry_scheduled",
                    "backoff_ms": random.randint(180, 650),
                    "guardrail_state": "within_threshold",
                    "replay": {
                        "stage": "retry",
                        "attempt": attempt,
                        "next_attempt": attempt + 1,
                    },
                },
                min_delay=0.07,
                max_delay=0.19,
                random_module=random,
            )

    # Stage 7: the final tool lookup, numbered as the last attempt.
    simulate_step(
        steps=steps,
        tool_name="tool_schema_lookup",
        input_data={
            "session_id": session_id,
            "attempt": attempt_count,
            "requested_tool": "citation_builder",
        },
        output_data={
            "tool_contract_version": f"2026.05.{random.randint(10, 28)}",
            "arguments_validated": True,
            "replay": {
                "stage": "tool_lookup",
                "attempt": attempt_count,
                "decision": "continue",
            },
        },
        min_delay=0.09,
        max_delay=0.21,
        random_module=random,
    )

    # Stage 8: citation building.
    simulate_step(
        steps=steps,
        tool_name="citation_builder",
        input_data={
            "session_id": session_id,
            "source_count": reranked_chunks,
            "format": "inline-bullets",
        },
        output_data={
            "citations_generated": reranked_chunks,
            # reranked_chunks is always >= 4 here, so the range is valid.
            "deduplicated_sources": random.randint(3, reranked_chunks),
            "replay": {"stage": "tool_execution", "artifact_id": f"cite-{session_id}"},
        },
        min_delay=0.1,
        max_delay=0.23,
        random_module=random,
    )

    # Stage 9: generation. The sleep stands in for model latency; the
    # response is fixed text apart from the three interpolated values.
    # len(steps) is evaluated before the generation step itself is appended.
    generation_started = time.perf_counter()
    time.sleep(random.uniform(0.95, 1.9))
    response = f"""
Transformers are neural architectures built around self-attention, which means the model can score how strongly every token should attend to every other token while building the next internal representation. That shift matters because it removes the strictly sequential bottleneck of older recurrent systems and makes training dramatically more parallel, which is why transformers became the default foundation for modern language models, multimodal systems, retrieval-heavy copilots, and agent frameworks.

At a systems level, the important intuition is that each layer repeatedly mixes three things: token identity, token position, and context relevance. Multi-head attention lets the model inspect several interaction patterns at once, so one head can track local syntax, another can follow long-range references, and another can focus on task-specific structure such as citations, code blocks, or tool outputs. Feed-forward blocks then reshape those mixed representations into features the next layer can use. Stack enough of these layers and the model learns abstractions that look like reasoning traces, latent memory lookups, planning heuristics, and style control even though the runtime primitive is still next-token prediction.

For production RAG and agent systems, transformers are only one part of the story. The operational pipeline usually includes query embedding, vector retrieval, reranking, prompt assembly, tool selection, retry handling, and final generation. A good answer is not just a function of the base model weights; it also depends on whether retrieval returned the right evidence, whether the planner selected the right tools, whether context budgeting dropped a critical chunk, and whether retries recovered from transient failures without hiding instability from operators.

That is why observability matters. When a transformer-based application appears to hallucinate, the root cause may actually be upstream: a low-recall vector search, schema drift in a tool contract, latency-induced truncation, or a retry path that silently swapped evidence sets between attempts. High-fidelity traces let teams inspect the exact execution graph, including step durations, retries, tool outputs, retrieval confidence, and token budgets. This makes it possible to distinguish model limitations from systems integration issues.

In practical terms, transformers excel because they scale with data, compute, and context more effectively than earlier sequence models. Self-attention produces rich contextual representations; retrieval extends the model with fresh external knowledge; tools let the system act beyond pure text generation; and planners coordinate these components into multi-step workflows. The resulting stack is powerful, but it is also failure-prone. The healthiest engineering pattern is to treat the LLM as one subsystem inside a larger distributed decision engine and trace every important boundary the same way you would trace a payment pipeline or a search request path.

If you are testing a dashboard, this run is intentionally token-heavy and observability-rich: it includes retrieval, planning, context allocation, tool validation, optional retries, and a long-form answer so latency, token volume, retries, and step timelines are all visible in the resulting trace payload. Session `{session_id}` completed after `{attempt_count}` tool lookup attempt(s), with `{len(steps)}` replayable steps recorded before generation finished.
""".strip()
    generation_duration = round((time.perf_counter() - generation_started) * 1000, 2)
    # The generation step is appended directly rather than via simulate_step,
    # with the duration measured around the sleep above.
    steps.append(
        {
            "step_id": str(uuid.uuid4()),
            "tool_name": "response_generation",
            "input": {
                "session_id": session_id,
                "model": "gpt-4.1-mini",
                "temperature": 0.4,
                "max_output_tokens": 1400,
            },
            "output": {
                "preview": response[:220],
                "output_sections": 6,
                "estimated_completion_tokens": estimate_tokens(response),
                "replay": {
                    "stage": "generation",
                    "response_id": f"resp-{session_id}",
                    "attempt_count": attempt_count,
                },
            },
            "duration": generation_duration,
            "success": True,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
    )

    return {
        "response": response,
        # Any simulated retry downgrades the overall status to "warning".
        "status": "warning" if retry_count > 0 else "success",
        "retry_count": retry_count,
        "steps": steps,
        "observability": {
            "session_id": session_id,
            "retrieval_candidates": retrieved_docs,
            "final_context_chunks": reranked_chunks,
            "attempt_count": attempt_count,
        },
    }
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def run_live_trace(
    prompt: str,
    model_name: str = "gpt-4.1-mini",
    project: str | None = None,
    api_key: str | None = None,
    environment: str = "development",
    render: bool = True,
) -> dict[str, Any]:
    """Run the simulated pipeline while streaming progress, then record the trace.

    Progress events are emitted through a ``TraceStream``; the simulated
    response is produced when the ``llm.generate`` event is reached. The
    finished trace is finalized without the full report, a summary is
    printed, and a closing status message is shown.

    If the run fails and the exception propagates out of the stream, the
    failed trace is finalized before the exception is re-raised, so failures
    are recorded rather than lost.

    Args:
        prompt: Prompt to trace.
        model_name: Model label stored on the trace.
        project: Optional project name.
        api_key: Optional key used to resolve project and environment.
        environment: Environment label passed to the project resolution.
        render: Accepted for interface compatibility; this function does not
            read it (the trace is always finalized with ``render=False``).

    Returns:
        The trace payload dict returned by ``finalize_trace``.
    """
    started_at = datetime.now(timezone.utc)
    start = time.perf_counter()
    result = None
    trace_error: Exception | None = None
    project_id, project_name, effective_environment, resolved_key = _resolve_project_context(
        api_key=api_key,
        project=project,
        environment=environment,
    )

    def _record(error: Exception | None) -> dict[str, Any]:
        """Finalize the trace with the latency measured up to now."""
        latency = round((time.perf_counter() - start) * 1000, 2)
        return finalize_trace(
            prompt=prompt,
            model_name=model_name,
            project_id=project_id,
            project_name=project_name,
            api_key=resolved_key,
            environment=effective_environment,
            result=result,
            trace_error=error,
            started_at=started_at,
            latency=latency,
            render=False,
        )

    console.print()
    console.print(header("Tracing request...", MascotState.LOADING))
    console.print()

    generation_event = "llm.generate"
    step_events: list[tuple[str, str]] = [
        ("query.embed", "Embedding prompt"),
        ("vector.search", "Searching vector index"),
        ("context.rerank", "Reranking context"),
        ("agent.plan", "Planning tool execution"),
        ("context.allocate", "Allocating context window"),
        ("tool.chain", "Running tool chain"),
        (generation_event, "Generating answer"),
    ]

    try:
        with TraceStream(prompt, model_name) as stream:
            for event_name, label in step_events:
                stream.emit(event_name, label)
                # Match on the stable event name, not the display label.
                if event_name == generation_event:
                    try:
                        result = simulate_llm_response(prompt)
                    except Exception as generation_error:
                        trace_error = generation_error
                        raise
    except Exception as run_error:
        # The stream did not suppress the failure: record it, then re-raise.
        _record(trace_error if trace_error is not None else run_error)
        raise

    # Reached on success, or when the stream suppressed a failure; in the
    # latter case trace_error is still set and the trace is marked failed.
    trace_data = _record(trace_error)
    print_summary(trace_data)
    status = str(trace_data.get("status", "success")).lower()
    if status == "success":
        console.print(message("Trace complete", MascotState.SUCCESS))
    elif status in ("warning", "failed"):
        console.print(message("Warning: tool execution failed", MascotState.WARNING))
    console.print()
    return trace_data
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
@trace(
    model_name="gpt-4.1-mini",
    prompt="Explain transformers",
    environment="development",
    project="demo-workspace",
)
def llm_response() -> dict[str, Any]:
    """Traced demo entry point: run the simulation with its default prompt."""
    simulated = simulate_llm_response()
    return simulated
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Execution tree view for replay — Rich Tree with nested steps."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from rich.panel import Panel
|
|
8
|
+
from rich.text import Text
|
|
9
|
+
from rich.tree import Tree
|
|
10
|
+
|
|
11
|
+
from tracellm.utils import console, latency_style
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _step_icon(step: dict[str, Any], active: bool, done: bool) -> str:
|
|
15
|
+
if active:
|
|
16
|
+
return "\u25b6"
|
|
17
|
+
if not step.get("success", True):
|
|
18
|
+
return "\u2717"
|
|
19
|
+
if done:
|
|
20
|
+
return "\u2713"
|
|
21
|
+
return " "
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _step_label(step: dict[str, Any]) -> str:
|
|
25
|
+
parts = []
|
|
26
|
+
tool_name = step.get("tool_name", "unknown")
|
|
27
|
+
duration = float(step.get("duration", 0.0))
|
|
28
|
+
success = bool(step.get("success", True))
|
|
29
|
+
|
|
30
|
+
parts.append(tool_name)
|
|
31
|
+
parts.append(f"[bright_black]{duration:.0f}ms[/bright_black]")
|
|
32
|
+
if not success:
|
|
33
|
+
parts.append("[red]RETRY[/red]")
|
|
34
|
+
|
|
35
|
+
return " ".join(parts)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def build_execution_tree(
    steps: list[dict[str, Any]],
    active_index: int | None = None,
) -> Tree:
    """Turn trace steps into a Rich tree rooted at ``agent:start``.

    Args:
        steps: Step dicts; a step may carry a "children" list whose entries
            are rendered one level below it.
        active_index: 1-based position of the step to highlight. Steps
            before it are marked done and steps after it are dimmed. With
            ``None`` no step is highlighted, marked done, or dimmed.

    Returns:
        The tree, ending in a "done" node that is green when every top-level
        step succeeded and yellow otherwise.
    """
    root = Tree(
        Text("agent:start", style="bold white"),
        guide_style="bright_black",
    )
    replaying = active_index is not None

    for position, step in enumerate(steps, 1):
        current = active_index == position
        finished = replaying and position < active_index
        marker = _step_icon(step, current, finished)

        if current:
            colour = "cyan"
        elif replaying and position > active_index:
            colour = "dim"
        else:
            colour = "white"
        node_label = f"[{colour}]{marker}[/] [{colour}]{_step_label(step)}[/]"

        node = root.add(node_label)
        # Children are always drawn as finished and never as active.
        for child in step.get("children") or ():
            child_marker = _step_icon(child, False, True)
            node.add(f"{child_marker} {_step_label(child)}")

    all_succeeded = all(entry.get("success", True) for entry in steps)
    closing_colour = "green" if all_succeeded else "yellow"
    root.add(f"[{closing_colour}]\u2713[/] [{closing_colour}]done[/]")

    return root
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def render_execution_panel(
    steps: list[dict[str, Any]],
    active_index: int | None = None,
) -> Panel:
    """Wrap the execution tree for ``steps`` in a titled Rich panel."""
    return Panel(
        build_execution_tree(steps, active_index=active_index),
        title="Execution Tree",
        border_style="bright_black",
        padding=(1, 2),
    )
|