synth-ai 0.2.3__py3-none-any.whl → 0.2.4.dev2__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their public registries and is provided for informational purposes only.
- synth_ai/compound/cais.py +0 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +115 -1
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/test_crafter_react_agent_lm_synth.py +3 -3
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/test_crafter_react_agent_lm_synth_v2_backup.py +3 -3
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +4 -4
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/test_crafter_react_agent_openai_v2_backup.py +3 -3
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +1 -1
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/crafter_custom/environment.py +1 -1
- synth_ai/environments/service/core_routes.py +1 -1
- synth_ai/learning/prompts/mipro.py +8 -0
- synth_ai/lm/core/main_v3.py +219 -158
- synth_ai/tracing_v3/__init__.py +2 -2
- synth_ai/tracing_v3/abstractions.py +62 -17
- synth_ai/tracing_v3/hooks.py +1 -1
- synth_ai/tracing_v3/llm_call_record_helpers.py +350 -0
- synth_ai/tracing_v3/lm_call_record_abstractions.py +257 -0
- synth_ai/tracing_v3/session_tracer.py +5 -5
- synth_ai/tracing_v3/tests/test_concurrent_operations.py +1 -1
- synth_ai/tracing_v3/tests/test_llm_call_records.py +672 -0
- synth_ai/tracing_v3/tests/test_session_tracer.py +43 -9
- synth_ai/tracing_v3/tests/test_turso_manager.py +1 -1
- synth_ai/tracing_v3/turso/manager.py +10 -3
- synth_ai/tracing_v3/turso/models.py +1 -0
- {synth_ai-0.2.3.dist-info → synth_ai-0.2.4.dev2.dist-info}/METADATA +3 -2
- {synth_ai-0.2.3.dist-info → synth_ai-0.2.4.dev2.dist-info}/RECORD +30 -26
- {synth_ai-0.2.3.dist-info → synth_ai-0.2.4.dev2.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.3.dist-info → synth_ai-0.2.4.dev2.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.3.dist-info → synth_ai-0.2.4.dev2.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.3.dist-info → synth_ai-0.2.4.dev2.dist-info}/top_level.txt +0 -0
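The bulk of this release adds the `lm_call_record_abstractions` and `llm_call_record_helpers` modules and a `call_records` field on `LMCAISEvent`. The sketch below is distilled from the new test module reproduced further down (`synth_ai/tracing_v3/tests/test_llm_call_records.py`); it uses only names that appear in that file and is an orientation aid, not an authoritative API reference.

```python
# Minimal sketch of the new call-record pattern, assuming the names shown
# in the test module included in this diff.
import time
import uuid

from synth_ai.tracing_v3.abstractions import LMCAISEvent, TimeRecord
from synth_ai.tracing_v3.lm_call_record_abstractions import LLMCallRecord, LLMUsage

# One LLMCallRecord per provider call; usage and latency live on the record.
record = LLMCallRecord(
    call_id=str(uuid.uuid4()),
    api_type="chat_completions",
    provider="openai",
    model_name="gpt-4",
    usage=LLMUsage(input_tokens=100, output_tokens=50, total_tokens=150, cost_usd=0.003),
    latency_ms=500,
)

# The event keeps the aggregates; per-call detail moves into call_records.
event = LMCAISEvent(
    system_instance_id="llm_system",
    time_record=TimeRecord(event_time=time.time()),
    total_tokens=150,
    cost_usd=0.003,
    latency_ms=500,
    call_records=[record],
)
```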
synth_ai/tracing_v3/tests/test_llm_call_records.py
@@ -0,0 +1,672 @@
```python
"""Unit tests for LMCAISEvent with LLMCallRecord integration.

This module tests the new call_records field in LMCAISEvent and demonstrates
proper usage patterns for migrating from legacy fields to the new structure.
"""

import json
import time
import uuid
from datetime import datetime
from typing import List, Dict, Any

import pytest

from synth_ai.tracing_v3.abstractions import (
    LMCAISEvent,
    TimeRecord,
    SessionTimeStep,
    SessionTrace,
)
from synth_ai.tracing_v3.lm_call_record_abstractions import (
    LLMCallRecord,
    LLMUsage,
    LLMRequestParams,
    LLMMessage,
    LLMContentPart,
    ToolCallSpec,
    ToolCallResult,
    compute_latency_ms,
)


class TestLLMCallRecord:
    """Test LLMCallRecord creation and manipulation."""

    def test_create_basic_call_record(self):
        """Test creating a basic LLMCallRecord."""
        call_id = str(uuid.uuid4())
        started_at = datetime.utcnow()

        record = LLMCallRecord(
            call_id=call_id,
            api_type="chat_completions",
            provider="openai",
            model_name="gpt-4",
            started_at=started_at,
            input_messages=[
                LLMMessage(
                    role="user",
                    parts=[LLMContentPart(type="text", text="What is 2+2?")]
                )
            ],
            output_messages=[
                LLMMessage(
                    role="assistant",
                    parts=[LLMContentPart(type="text", text="4")]
                )
            ],
            usage=LLMUsage(
                input_tokens=10,
                output_tokens=5,
                total_tokens=15,
                cost_usd=0.0003
            ),
            finish_reason="stop"
        )

        assert record.call_id == call_id
        assert record.api_type == "chat_completions"
        assert record.model_name == "gpt-4"
        assert record.usage.total_tokens == 15
        assert len(record.input_messages) == 1
        assert len(record.output_messages) == 1

    def test_compute_latency(self):
        """Test latency computation from timestamps."""
        started_at = datetime.utcnow()
        completed_at = datetime.utcnow()

        record = LLMCallRecord(
            call_id=str(uuid.uuid4()),
            api_type="chat_completions",
            started_at=started_at,
            completed_at=completed_at
        )

        latency = compute_latency_ms(record)
        assert latency is not None
        assert latency >= 0
        assert record.latency_ms == latency

    def test_tool_call_record(self):
        """Test LLMCallRecord with tool calls."""
        record = LLMCallRecord(
            call_id=str(uuid.uuid4()),
            api_type="chat_completions",
            provider="openai",
            model_name="gpt-4",
            output_tool_calls=[
                ToolCallSpec(
                    name="get_weather",
                    arguments_json='{"location": "San Francisco"}',
                    arguments={"location": "San Francisco"},
                    call_id="tool_1"
                )
            ],
            tool_results=[
                ToolCallResult(
                    call_id="tool_1",
                    output_text="72°F, sunny",
                    status="ok",
                    duration_ms=150
                )
            ]
        )

        assert len(record.output_tool_calls) == 1
        assert record.output_tool_calls[0].name == "get_weather"
        assert len(record.tool_results) == 1
        assert record.tool_results[0].status == "ok"


class TestLMCAISEventWithCallRecords:
    """Test LMCAISEvent with integrated LLMCallRecord."""

    def test_create_event_with_call_records(self):
        """Test creating an LMCAISEvent with call_records."""
        call_record = LLMCallRecord(
            call_id=str(uuid.uuid4()),
            api_type="chat_completions",
            provider="openai",
            model_name="gpt-4",
            usage=LLMUsage(
                input_tokens=100,
                output_tokens=50,
                total_tokens=150,
                cost_usd=0.003
            ),
            latency_ms=500
        )

        event = LMCAISEvent(
            system_instance_id="llm_system",
            time_record=TimeRecord(event_time=time.time()),
            call_records=[call_record]
        )

        assert len(event.call_records) == 1
        assert event.call_records[0].model_name == "gpt-4"
        assert event.call_records[0].usage.total_tokens == 150

    def test_aggregate_from_call_records(self):
        """Test computing aggregates from multiple call_records."""
        call_records = [
            LLMCallRecord(
                call_id=str(uuid.uuid4()),
                api_type="chat_completions",
                model_name="gpt-4",
                usage=LLMUsage(
                    input_tokens=100,
                    output_tokens=50,
                    total_tokens=150,
                    cost_usd=0.003
                ),
                latency_ms=500
            ),
            LLMCallRecord(
                call_id=str(uuid.uuid4()),
                api_type="chat_completions",
                model_name="gpt-4",
                usage=LLMUsage(
                    input_tokens=200,
                    output_tokens=100,
                    total_tokens=300,
                    cost_usd=0.006
                ),
                latency_ms=700
            )
        ]

        # Create event with call_records
        event = LMCAISEvent(
            system_instance_id="llm_system",
            time_record=TimeRecord(event_time=time.time()),
            call_records=call_records
        )

        # Compute aggregates from call_records
        total_input_tokens = sum(
            r.usage.input_tokens for r in call_records
            if r.usage and r.usage.input_tokens
        )
        total_output_tokens = sum(
            r.usage.output_tokens for r in call_records
            if r.usage and r.usage.output_tokens
        )
        total_tokens = sum(
            r.usage.total_tokens for r in call_records
            if r.usage and r.usage.total_tokens
        )
        total_cost = sum(
            r.usage.cost_usd for r in call_records
            if r.usage and r.usage.cost_usd
        )
        total_latency = sum(
            r.latency_ms for r in call_records
            if r.latency_ms
        )

        # Set aggregates on event
        event.input_tokens = total_input_tokens
        event.output_tokens = total_output_tokens
        event.total_tokens = total_tokens
        event.cost_usd = total_cost
        event.latency_ms = total_latency

        assert event.input_tokens == 300
        assert event.output_tokens == 150
        assert event.total_tokens == 450
        assert abs(event.cost_usd - 0.009) < 0.0001  # Use floating point comparison
        assert event.latency_ms == 1200

    def test_migration_pattern(self):
        """Test migration from legacy fields to call_records."""
        # Legacy pattern (what we're migrating from)
        legacy_event = LMCAISEvent(
            system_instance_id="llm_system",
            time_record=TimeRecord(event_time=time.time()),
            model_name="gpt-4",
            provider="openai",
            input_tokens=100,
            output_tokens=50,
            total_tokens=150,
            cost_usd=0.003,
            latency_ms=500
        )

        # New pattern (what we're migrating to)
        new_event = LMCAISEvent(
            system_instance_id="llm_system",
            time_record=TimeRecord(event_time=time.time()),
            # Aggregates can stay on the event
            total_tokens=150,
            cost_usd=0.003,
            latency_ms=500,
            # Details go in call_records
            call_records=[
                LLMCallRecord(
                    call_id=str(uuid.uuid4()),
                    api_type="chat_completions",
                    provider="openai",
                    model_name="gpt-4",
                    usage=LLMUsage(
                        input_tokens=100,
                        output_tokens=50,
                        total_tokens=150,
                        cost_usd=0.003
                    ),
                    latency_ms=500
                )
            ]
        )

        # Both should represent the same information
        assert legacy_event.total_tokens == new_event.total_tokens
        assert legacy_event.cost_usd == new_event.cost_usd
        assert legacy_event.model_name == new_event.call_records[0].model_name
        assert legacy_event.provider == new_event.call_records[0].provider


class TestComplexScenarios:
    """Test complex scenarios with multiple calls and tool usage."""

    def test_multi_turn_conversation(self):
        """Test a multi-turn conversation with multiple LLM calls."""
        session = SessionTrace(
            session_id=str(uuid.uuid4()),
            created_at=datetime.utcnow()
        )

        # Turn 1: Initial question
        turn1 = SessionTimeStep(
            step_id="turn_1",
            step_index=0,
            turn_number=1
        )

        turn1_event = LMCAISEvent(
            system_instance_id="llm_system",
            time_record=TimeRecord(event_time=time.time()),
            call_records=[
                LLMCallRecord(
                    call_id=str(uuid.uuid4()),
                    api_type="chat_completions",
                    model_name="gpt-4",
                    input_messages=[
                        LLMMessage(
                            role="user",
                            parts=[LLMContentPart(type="text", text="What's the weather?")]
                        )
                    ],
                    output_messages=[
                        LLMMessage(
                            role="assistant",
                            parts=[LLMContentPart(
                                type="text",
                                text="I'll check the weather for you."
                            )]
                        )
                    ],
                    output_tool_calls=[
                        ToolCallSpec(
                            name="get_weather",
                            arguments_json='{"location": "current"}',
                            call_id="weather_1"
                        )
                    ],
                    usage=LLMUsage(input_tokens=10, output_tokens=20, total_tokens=30)
                )
            ]
        )

        turn1.events.append(turn1_event)
        session.session_time_steps.append(turn1)

        # Turn 2: Tool result and response
        turn2 = SessionTimeStep(
            step_id="turn_2",
            step_index=1,
            turn_number=2
        )

        turn2_event = LMCAISEvent(
            system_instance_id="llm_system",
            time_record=TimeRecord(event_time=time.time()),
            call_records=[
                LLMCallRecord(
                    call_id=str(uuid.uuid4()),
                    api_type="chat_completions",
                    model_name="gpt-4",
                    input_messages=[
                        LLMMessage(
                            role="tool",
                            tool_call_id="weather_1",
                            parts=[LLMContentPart(
                                type="text",
                                text="San Francisco: 72°F, sunny"
                            )]
                        )
                    ],
                    output_messages=[
                        LLMMessage(
                            role="assistant",
                            parts=[LLMContentPart(
                                type="text",
                                text="The weather in San Francisco is 72°F and sunny."
                            )]
                        )
                    ],
                    usage=LLMUsage(input_tokens=15, output_tokens=25, total_tokens=40),
                    tool_results=[
                        ToolCallResult(
                            call_id="weather_1",
                            output_text="San Francisco: 72°F, sunny",
                            status="ok"
                        )
                    ]
                )
            ]
        )

        turn2.events.append(turn2_event)
        session.session_time_steps.append(turn2)

        # Verify session structure
        assert len(session.session_time_steps) == 2
        assert len(session.session_time_steps[0].events) == 1
        assert len(session.session_time_steps[1].events) == 1

        # Verify tool call flow
        turn1_call = session.session_time_steps[0].events[0].call_records[0]
        turn2_call = session.session_time_steps[1].events[0].call_records[0]

        assert len(turn1_call.output_tool_calls) == 1
        assert turn1_call.output_tool_calls[0].name == "get_weather"
        assert len(turn2_call.tool_results) == 1
        assert turn2_call.tool_results[0].call_id == "weather_1"

    def test_streaming_response(self):
        """Test LLMCallRecord with streaming chunks."""
        from synth_ai.tracing_v3.lm_call_record_abstractions import LLMChunk

        chunks = [
            LLMChunk(
                sequence_index=0,
                received_at=datetime.utcnow(),
                event_type="content.delta",
                delta_text="The",
                choice_index=0
            ),
            LLMChunk(
                sequence_index=1,
                received_at=datetime.utcnow(),
                event_type="content.delta",
                delta_text=" answer",
                choice_index=0
            ),
            LLMChunk(
                sequence_index=2,
                received_at=datetime.utcnow(),
                event_type="content.delta",
                delta_text=" is",
                choice_index=0
            ),
            LLMChunk(
                sequence_index=3,
                received_at=datetime.utcnow(),
                event_type="content.delta",
                delta_text=" 42",
                choice_index=0
            ),
            LLMChunk(
                sequence_index=4,
                received_at=datetime.utcnow(),
                event_type="message.stop",
                choice_index=0
            )
        ]

        record = LLMCallRecord(
            call_id=str(uuid.uuid4()),
            api_type="responses",  # OpenAI Responses API style
            model_name="gpt-4",
            chunks=chunks,
            output_text="The answer is 42",  # Final collapsed output
            usage=LLMUsage(input_tokens=10, output_tokens=5, total_tokens=15)
        )

        assert len(record.chunks) == 5
        assert record.output_text == "The answer is 42"

        # Reconstruct from chunks
        reconstructed = "".join(
            c.delta_text for c in chunks
            if c.delta_text
        )
        assert reconstructed == "The answer is 42"


class TestProviderMappings:
    """Test mapping different provider formats to LLMCallRecord."""

    def test_openai_chat_completions_mapping(self):
        """Test mapping OpenAI Chat Completions to LLMCallRecord."""
        # Simulate OpenAI response structure
        openai_response = {
            "id": "chatcmpl-123",
            "object": "chat.completion",
            "created": 1677652288,
            "model": "gpt-4",
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": "Hello! How can I help you today?"
                },
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": 10,
                "completion_tokens": 9,
                "total_tokens": 19
            }
        }

        # Map to LLMCallRecord
        record = LLMCallRecord(
            call_id=openai_response["id"],
            api_type="chat_completions",
            provider="openai",
            model_name=openai_response["model"],
            output_messages=[
                LLMMessage(
                    role=openai_response["choices"][0]["message"]["role"],
                    parts=[LLMContentPart(
                        type="text",
                        text=openai_response["choices"][0]["message"]["content"]
                    )]
                )
            ],
            usage=LLMUsage(
                input_tokens=openai_response["usage"]["prompt_tokens"],
                output_tokens=openai_response["usage"]["completion_tokens"],
                total_tokens=openai_response["usage"]["total_tokens"]
            ),
            finish_reason=openai_response["choices"][0]["finish_reason"],
            provider_request_id=openai_response["id"]
        )

        assert record.call_id == "chatcmpl-123"
        assert record.model_name == "gpt-4"
        assert record.usage.total_tokens == 19
        assert record.finish_reason == "stop"

    def test_anthropic_messages_mapping(self):
        """Test mapping Anthropic Messages API to LLMCallRecord."""
        # Simulate Anthropic response structure
        anthropic_response = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "content": [
                {
                    "type": "text",
                    "text": "I'll help you with that."
                }
            ],
            "model": "claude-3-opus-20240229",
            "stop_reason": "end_turn",
            "usage": {
                "input_tokens": 15,
                "output_tokens": 12
            }
        }

        # Map to LLMCallRecord
        record = LLMCallRecord(
            call_id=anthropic_response["id"],
            api_type="messages",  # Anthropic Messages API
            provider="anthropic",
            model_name=anthropic_response["model"],
            output_messages=[
                LLMMessage(
                    role=anthropic_response["role"],
                    parts=[
                        LLMContentPart(
                            type=content["type"],
                            text=content["text"]
                        )
                        for content in anthropic_response["content"]
                    ]
                )
            ],
            usage=LLMUsage(
                input_tokens=anthropic_response["usage"]["input_tokens"],
                output_tokens=anthropic_response["usage"]["output_tokens"],
                total_tokens=(
                    anthropic_response["usage"]["input_tokens"] +
                    anthropic_response["usage"]["output_tokens"]
                )
            ),
            finish_reason=anthropic_response["stop_reason"],
            provider_request_id=anthropic_response["id"]
        )

        assert record.call_id == "msg_123"
        assert record.model_name == "claude-3-opus-20240229"
        assert record.usage.total_tokens == 27
        assert record.finish_reason == "end_turn"


def helper_compute_aggregates_from_records(call_records: List[LLMCallRecord]) -> Dict[str, Any]:
    """Helper function to compute aggregates from call_records.

    This demonstrates the pattern for computing event-level aggregates
    from a list of LLMCallRecord instances.
    """
    aggregates = {
        "input_tokens": 0,
        "output_tokens": 0,
        "total_tokens": 0,
        "cost_usd": 0.0,
        "latency_ms": 0,
        "models_used": set(),
        "providers_used": set(),
        "tool_calls_count": 0,
        "error_count": 0
    }

    for record in call_records:
        if record.usage:
            if record.usage.input_tokens:
                aggregates["input_tokens"] += record.usage.input_tokens
            if record.usage.output_tokens:
                aggregates["output_tokens"] += record.usage.output_tokens
            if record.usage.total_tokens:
                aggregates["total_tokens"] += record.usage.total_tokens
            if record.usage.cost_usd:
                aggregates["cost_usd"] += record.usage.cost_usd

        if record.latency_ms:
            aggregates["latency_ms"] += record.latency_ms

        if record.model_name:
            aggregates["models_used"].add(record.model_name)

        if record.provider:
            aggregates["providers_used"].add(record.provider)

        aggregates["tool_calls_count"] += len(record.output_tool_calls)

        if record.outcome == "error":
            aggregates["error_count"] += 1

    # Convert sets to lists for JSON serialization
    aggregates["models_used"] = list(aggregates["models_used"])
    aggregates["providers_used"] = list(aggregates["providers_used"])

    return aggregates


class TestAggregateHelper:
    """Test the aggregate computation helper."""

    def test_compute_aggregates(self):
        """Test computing aggregates from multiple call records."""
        records = [
            LLMCallRecord(
                call_id="1",
                api_type="chat_completions",
                model_name="gpt-4",
                provider="openai",
                usage=LLMUsage(
                    input_tokens=100,
                    output_tokens=50,
                    total_tokens=150,
                    cost_usd=0.003
                ),
                latency_ms=500,
                output_tool_calls=[
                    ToolCallSpec(name="tool1", arguments_json="{}")
                ]
            ),
            LLMCallRecord(
                call_id="2",
                api_type="messages",
                model_name="claude-3-opus",
                provider="anthropic",
                usage=LLMUsage(
                    input_tokens=200,
                    output_tokens=100,
                    total_tokens=300,
                    cost_usd=0.006
                ),
                latency_ms=700,
                outcome="success"
            ),
            LLMCallRecord(
                call_id="3",
                api_type="chat_completions",
                model_name="gpt-4",
                provider="openai",
                outcome="error",
                error={"code": "rate_limit", "message": "Rate limit exceeded"}
            )
        ]

        aggregates = helper_compute_aggregates_from_records(records)

        assert aggregates["input_tokens"] == 300
        assert aggregates["output_tokens"] == 150
        assert aggregates["total_tokens"] == 450
        assert abs(aggregates["cost_usd"] - 0.009) < 0.0001  # Floating point comparison
        assert aggregates["latency_ms"] == 1200
        assert set(aggregates["models_used"]) == {"gpt-4", "claude-3-opus"}
        assert set(aggregates["providers_used"]) == {"openai", "anthropic"}
        assert aggregates["tool_calls_count"] == 1
        assert aggregates["error_count"] == 1


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
```