jaf-py 2.6.2__py3-none-any.whl → 2.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jaf/__init__.py CHANGED
@@ -201,7 +201,7 @@ def generate_run_id() -> RunId:
  return create_run_id(str(uuid.uuid4()))


- __version__ = "2.6.2"
+ __version__ = "2.6.4"
  __all__ = [
  # Core types and functions
  "TraceId",
jaf/core/engine.py CHANGED
@@ -692,8 +692,19 @@ async def _run_internal(state: RunState[Ctx], config: RunConfig[Ctx]) -> RunResu
  aggregated_text = ""
  # Working array of partial tool calls
  partial_tool_calls: List[Dict[str, Any]] = []
+ # Capture usage and model from streaming chunks
+ stream_usage: Optional[Dict[str, int]] = None
+ stream_model: Optional[str] = None

  async for chunk in get_stream(state, current_agent, config): # type: ignore[arg-type]
+ # Extract usage and model from raw chunk if available
+ raw_chunk = getattr(chunk, "raw", None)
+ if raw_chunk:
+ if not stream_usage and "usage" in raw_chunk and raw_chunk["usage"]:
+ stream_usage = raw_chunk["usage"]
+ if not stream_model and "model" in raw_chunk and raw_chunk["model"]:
+ stream_model = raw_chunk["model"]
+
  # Text deltas
  delta_text = getattr(chunk, "delta", None)
  if delta_text:
@@ -803,6 +814,13 @@ async def _run_internal(state: RunState[Ctx], config: RunConfig[Ctx]) -> RunResu
  llm_response = {
  "message": {"content": aggregated_text or None, "tool_calls": final_tool_calls}
  }
+
+ # Preserve usage and model from streaming if captured
+ if stream_usage:
+ llm_response["usage"] = stream_usage
+ if stream_model:
+ llm_response["model"] = stream_model
+
  except Exception:
  # Fallback to non-streaming on error
  assistant_event_streamed = False
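
Note: before this change the streamed path synthesized a response with only the message, so streamed runs carried no token counts for cost tracking. A minimal sketch of the shape now produced when usage/model were seen in the raw chunks; the helper name and values below are illustrative, not engine code:

    from typing import Any, Dict, List, Optional

    def build_llm_response(
        aggregated_text: str,
        final_tool_calls: Optional[List[Dict[str, Any]]],
        stream_usage: Optional[Dict[str, int]],
        stream_model: Optional[str],
    ) -> Dict[str, Any]:
        llm_response: Dict[str, Any] = {
            "message": {"content": aggregated_text or None, "tool_calls": final_tool_calls}
        }
        if stream_usage:      # preserved so token accounting survives streaming
            llm_response["usage"] = stream_usage
        if stream_model:      # actual model name reported by the provider
            llm_response["model"] = stream_model
        return llm_response

    print(build_llm_response(
        "Hello!", None,
        {"prompt_tokens": 12, "completion_tokens": 3, "total_tokens": 15},
        "gpt-4o",
    ))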
jaf/core/tracing.py CHANGED
@@ -469,7 +469,7 @@ class LangfuseTraceCollector:
  public_key=public_key,
  secret_key=secret_key,
  host=host,
- release="jaf-py-v2.6.2",
+ release="jaf-py-v2.6.4",
  httpx_client=client,
  )
  self._httpx_client = client
@@ -753,7 +753,9 @@ class LangfuseTraceCollector:
  system_prompt = context.system_prompt

  if system_prompt:
- print(f"[LANGFUSE DEBUG] Extracted system_prompt: {system_prompt[:100] if isinstance(system_prompt, str) else system_prompt}...")
+ print(
+ f"[LANGFUSE DEBUG] Extracted system_prompt: {system_prompt[:100] if isinstance(system_prompt, str) else system_prompt}..."
+ )

  print(
  f"[LANGFUSE DEBUG] Final extracted - user_query: {user_query}, user_id: {user_id}"
@@ -911,25 +913,25 @@ class LangfuseTraceCollector:
  print(f"[LANGFUSE] Ending generation for LLM call")
  # End the generation
  generation = self.active_spans[span_id]
- choice = self._get_event_data(event, "choice", {})

- # Extract usage from the event data
+ choice = self._get_event_data(event, "choice", {})
  usage = self._get_event_data(event, "usage", {})
+ model = self._get_event_data(event, "model", "unknown")

- # Extract model information from choice data or event data
- model = choice.get("model", "unknown")
- if model == "unknown":
- # Try to get model from the choice response structure
- if isinstance(choice, dict):
- model = choice.get("model") or choice.get("id", "unknown")
+ # Also try to get model from the choice if not at top level
+ if model == "unknown" and isinstance(choice, dict):
+ model = choice.get("model", "unknown")

- # Convert to Langfuse v2 format - let Langfuse handle cost calculation automatically
+ print(f"[LANGFUSE] Extracted - model: '{model}', usage: {usage}")
+
+ # Convert to Langfuse format with detailed cache information
  langfuse_usage = None
  if usage:
  prompt_tokens = usage.get("prompt_tokens", 0)
  completion_tokens = usage.get("completion_tokens", 0)
  total_tokens = usage.get("total_tokens", 0)

+ # Build detailed usage dict with cache information
  langfuse_usage = {
  "input": prompt_tokens,
  "output": completion_tokens,
@@ -937,9 +939,40 @@
  "unit": "TOKENS",
  }

- print(
- f"[LANGFUSE] Usage data for automatic cost calculation: {langfuse_usage}"
- )
+ # Add cache-related fields if available (for prompt caching support)
+ if (
+ "cache_creation_input_tokens" in usage
+ and usage["cache_creation_input_tokens"]
+ ):
+ langfuse_usage["cache_creation_input_tokens"] = usage[
+ "cache_creation_input_tokens"
+ ]
+ if "cache_read_input_tokens" in usage and usage["cache_read_input_tokens"]:
+ langfuse_usage["cache_read_input_tokens"] = usage[
+ "cache_read_input_tokens"
+ ]
+
+ # Add detailed token breakdowns if available
+ if "prompt_tokens_details" in usage and usage["prompt_tokens_details"]:
+ details = usage["prompt_tokens_details"]
+ if "cached_tokens" in details and details["cached_tokens"]:
+ langfuse_usage["input_cached_tokens"] = details["cached_tokens"]
+ if "audio_tokens" in details and details["audio_tokens"]:
+ langfuse_usage["input_audio_tokens"] = details["audio_tokens"]
+
+ if (
+ "completion_tokens_details" in usage
+ and usage["completion_tokens_details"]
+ ):
+ details = usage["completion_tokens_details"]
+ if "reasoning_tokens" in details and details["reasoning_tokens"]:
+ langfuse_usage["output_reasoning_tokens"] = details[
+ "reasoning_tokens"
+ ]
+ if "audio_tokens" in details and details["audio_tokens"]:
+ langfuse_usage["output_audio_tokens"] = details["audio_tokens"]
+
+ print(f"[LANGFUSE] Usage data with cache details: {langfuse_usage}")

  # Include model information in the generation end - Langfuse will calculate costs automatically
  # Use compatibility wrapper for ending spans/generations
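
Note: the generation-end handler now takes `model` and `usage` straight from the event data and forwards cache and per-token-type details to Langfuse. An illustrative sketch of the enriched payload; the token counts are hypothetical and only keys added above are shown:

    langfuse_usage = {
        "input": 1200,                   # prompt_tokens
        "output": 350,                   # completion_tokens
        "unit": "TOKENS",
        "cache_read_input_tokens": 800,  # passed through when the provider reports prompt caching
        "input_cached_tokens": 800,      # from prompt_tokens_details["cached_tokens"]
        "output_reasoning_tokens": 120,  # from completion_tokens_details["reasoning_tokens"]
    }
    print(langfuse_usage)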
@@ -1260,7 +1293,10 @@ def create_composite_trace_collector(
  # Automatically add Langfuse collector if keys are configured
  if os.getenv("LANGFUSE_PUBLIC_KEY") and os.getenv("LANGFUSE_SECRET_KEY"):
  langfuse_collector = LangfuseTraceCollector(
- httpx_client=httpx_client, proxy=proxy, timeout=timeout, include_system_prompt=include_system_prompt
+ httpx_client=httpx_client,
+ proxy=proxy,
+ timeout=timeout,
+ include_system_prompt=include_system_prompt,
  )
  collector_list.append(langfuse_collector)

jaf/core/types.py CHANGED
@@ -1009,6 +1009,38 @@ class RetryEvent:
  )


+ @dataclass(frozen=True)
+ class FallbackEventData:
+ """Data for model fallback events."""
+
+ from_model: str # Model that failed
+ to_model: str # Fallback model being tried
+ reason: str # Reason for fallback (e.g., "Content Policy Violation", "Context Window Exceeded", "Rate Limit")
+ fallback_type: Literal["general", "content_policy", "context_window"] # Type of fallback
+ attempt: int # Which fallback attempt this is (1-indexed)
+ trace_id: TraceId
+ run_id: RunId
+ error_details: Optional[Dict[str, Any]] = None # Additional error context
+
+
+ @dataclass(frozen=True)
+ class FallbackEvent:
+ """Event emitted when a model fallback occurs."""
+
+ type: Literal["fallback"] = "fallback"
+ data: FallbackEventData = field(
+ default_factory=lambda: FallbackEventData(
+ from_model="",
+ to_model="",
+ reason="",
+ fallback_type="general",
+ attempt=1,
+ trace_id=TraceId(""),
+ run_id=RunId(""),
+ )
+ )
+
+
  # Union type for all trace events
  TraceEvent = Union[
  RunStartEvent,
@@ -1024,6 +1056,7 @@ TraceEvent = Union[
  HandoffEvent,
  RunEndEvent,
  RetryEvent,
+ FallbackEvent,
  ]


@@ -1096,7 +1129,9 @@ class RunConfig(Generic[Ctx]):
  agent_registry: Dict[str, Agent[Ctx, Any]]
  model_provider: ModelProvider[Ctx]
  max_turns: Optional[int] = 50
- max_tokens: Optional[int] = None # Default max_tokens for all agents (can be overridden per agent)
+ max_tokens: Optional[int] = (
+ None # Default max_tokens for all agents (can be overridden per agent)
+ )
  model_override: Optional[str] = None
  initial_input_guardrails: Optional[List[Guardrail]] = None
  final_output_guardrails: Optional[List[Guardrail]] = None
@@ -1120,7 +1155,7 @@ class RunConfig(Generic[Ctx]):
  [List[Message], RunState[Ctx]],
  Union[List[Message], Awaitable[List[Message]]],
  ]
- ] = None
+ ] = None
  max_empty_response_retries: int = 3 # Maximum retries when LLM returns empty response
  empty_response_retry_delay: float = (
  1.0 # Initial delay in seconds before retrying empty response (uses exponential backoff)
@@ -1129,6 +1164,14 @@ class RunConfig(Generic[Ctx]):
  prefer_streaming: Optional[bool] = (
  None # Whether to prefer streaming responses. None (default) = use streaming if available, True = prefer streaming, False = disable streaming
  )
+ # Model fallback configuration
+ fallbacks: Optional[List[str]] = None # List of fallback models to try if primary model fails
+ content_policy_fallbacks: Optional[List[str]] = (
+ None # Fallback models for content policy violations
+ )
+ context_window_fallbacks: Optional[List[str]] = (
+ None # Fallback models for context window exceeded errors
+ )


  # Regeneration types for conversation management
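
Note: the three new RunConfig fields drive the fallback logic added in jaf/providers/model.py below. A minimal sketch of how a caller might set them; `my_agents` and `my_provider` are placeholders for an application's agent registry and ModelProvider, and the model names are only examples:

    from jaf.core.types import RunConfig

    my_agents = {}     # placeholder: Dict[str, Agent] built by the application
    my_provider = ...  # placeholder: a ModelProvider instance

    config = RunConfig(
        agent_registry=my_agents,
        model_provider=my_provider,
        fallbacks=["gpt-4o-mini"],                       # generic failures (rate limits, 5xx, ...)
        content_policy_fallbacks=["claude-3-5-sonnet"],  # content-policy / safety blocks
        context_window_fallbacks=["gemini-1.5-pro"],     # context window exceeded
    )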
jaf/providers/model.py CHANGED
@@ -30,6 +30,8 @@ from ..core.types import (
  get_text_content,
  RetryEvent,
  RetryEventData,
+ FallbackEvent,
+ FallbackEventData,
  )
  from ..core.proxy import ProxyConfig
  from ..utils.document_processor import (
@@ -113,6 +115,55 @@ async def _is_vision_model(model: str, base_url: str) -> bool:
  return is_known_vision_model


+ def _classify_error_for_fallback(e: Exception) -> tuple[str, str]:
+ """
+ Classify an error to determine the fallback type and reason.
+
+ Args:
+ e: Exception from model call
+
+ Returns:
+ Tuple of (fallback_type, reason)
+ """
+ error_message = str(e).lower()
+ error_type = type(e).__name__
+
+ # Check for content policy violations
+ if (
+ "content" in error_message
+ and ("policy" in error_message or "filter" in error_message)
+ or "contentpolicyviolation" in error_type.lower()
+ or "content_filter" in error_message
+ or "safety" in error_message
+ ):
+ return ("content_policy", "Content Policy Violation")
+
+ # Check for context window exceeded
+ if (
+ "context" in error_message
+ and "window" in error_message
+ or "too long" in error_message
+ or "maximum context" in error_message
+ or "contextwindowexceeded" in error_type.lower()
+ or "prompt is too long" in error_message
+ or "tokens" in error_message
+ and "limit" in error_message
+ ):
+ return ("context_window", "Context Window Exceeded")
+
+ # Default to general fallback
+ if hasattr(e, "status_code"):
+ status_code = e.status_code
+ if status_code == 429:
+ return ("general", f"HTTP {status_code} - Rate Limit")
+ elif 500 <= status_code < 600:
+ return ("general", f"HTTP {status_code} - Server Error")
+ else:
+ return ("general", f"HTTP {status_code}")
+
+ return ("general", error_type)
+
+
  async def _retry_with_events(
  operation_func,
  state: RunState,
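
Note: `_classify_error_for_fallback` is a private module-level helper and its checks are substring matches on the error text and type name, so results depend on how the provider phrases its errors. Illustrative expectations under that assumption; the exception classes here are stand-ins:

    from jaf.providers.model import _classify_error_for_fallback

    class FakeRateLimitError(Exception):  # stand-in for a provider error exposing status_code
        status_code = 429

    print(_classify_error_for_fallback(ValueError("request blocked: content policy violation")))
    # ('content_policy', 'Content Policy Violation')
    print(_classify_error_for_fallback(RuntimeError("prompt is too long for this model")))
    # ('context_window', 'Context Window Exceeded')
    print(_classify_error_for_fallback(FakeRateLimitError("slow down")))
    # ('general', 'HTTP 429 - Rate Limit')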
@@ -259,10 +310,10 @@ def make_litellm_provider(
  async def get_completion(
  self, state: RunState[Ctx], agent: Agent[Ctx, Any], config: RunConfig[Ctx]
  ) -> Dict[str, Any]:
- """Get completion from the model."""
+ """Get completion from the model with fallback support."""

- # Determine model to use
- model = config.model_override or (
+ # Determine initial model to use
+ primary_model = config.model_override or (
  agent.model_config.name if agent.model_config else "gpt-4o"
  )

@@ -277,10 +328,10 @@
  )

  if has_image_content:
- supports_vision = await _is_vision_model(model, base_url)
+ supports_vision = await _is_vision_model(primary_model, base_url)
  if not supports_vision:
  raise ValueError(
- f"Model {model} does not support vision capabilities. "
+ f"Model {primary_model} does not support vision capabilities. "
  f"Please use a vision-capable model like gpt-4o, claude-3-5-sonnet, or gemini-1.5-pro."
  )

@@ -322,39 +373,123 @@ def make_litellm_provider(
  last_message.role == ContentRole.TOOL or last_message.role == "tool"
  )

- # Prepare request parameters
- request_params = {"model": model, "messages": messages, "stream": False}
-
- # Add optional parameters
- if agent.model_config:
- if agent.model_config.temperature is not None:
- request_params["temperature"] = agent.model_config.temperature
- # Use agent's max_tokens if set, otherwise fall back to config's max_tokens
- max_tokens = agent.model_config.max_tokens
- if max_tokens is None:
- max_tokens = config.max_tokens
- if max_tokens is not None:
- request_params["max_tokens"] = max_tokens
- elif config.max_tokens is not None:
- # No model_config but config has max_tokens
- request_params["max_tokens"] = config.max_tokens
-
- if tools:
- request_params["tools"] = tools
- # Always set tool_choice to auto when tools are available
- request_params["tool_choice"] = "auto"
-
- if agent.output_codec:
- request_params["response_format"] = {"type": "json_object"}
+ # Helper function to make API call with a specific model
+ async def _make_completion_call(model_name: str) -> Dict[str, Any]:
+ # Prepare request parameters
+ request_params = {"model": model_name, "messages": messages, "stream": False}
+
+ # Add optional parameters
+ if agent.model_config:
+ if agent.model_config.temperature is not None:
+ request_params["temperature"] = agent.model_config.temperature
+ # Use agent's max_tokens if set, otherwise fall back to config's max_tokens
+ max_tokens = agent.model_config.max_tokens
+ if max_tokens is None:
+ max_tokens = config.max_tokens
+ if max_tokens is not None:
+ request_params["max_tokens"] = max_tokens
+ elif config.max_tokens is not None:
+ # No model_config but config has max_tokens
+ request_params["max_tokens"] = config.max_tokens
+
+ if tools:
+ request_params["tools"] = tools
+ # Always set tool_choice to auto when tools are available
+ request_params["tool_choice"] = "auto"
+
+ if agent.output_codec:
+ request_params["response_format"] = {"type": "json_object"}
+
+ # Make the API call with retry handling
+ async def _api_call():
+ return await self.client.chat.completions.create(**request_params)
+
+ # Use retry wrapper to track retries in Langfuse
+ return await _retry_with_events(
+ _api_call,
+ state,
+ config,
+ operation_name="llm_call",
+ max_retries=3,
+ backoff_factor=1.0,
+ )

- # Make the API call with retry handling
- async def _api_call():
- return await self.client.chat.completions.create(**request_params)
+ # Try primary model first
+ last_exception = None
+ current_model = primary_model
+
+ try:
+ response = await _make_completion_call(current_model)
+ except Exception as e:
+ last_exception = e
+
+ # Classify the error to determine which fallback list to use
+ fallback_type, reason = _classify_error_for_fallback(e)
+
+ # Determine which fallback list to use
+ fallback_models = []
+ if fallback_type == "content_policy" and config.content_policy_fallbacks:
+ fallback_models = config.content_policy_fallbacks
+ elif fallback_type == "context_window" and config.context_window_fallbacks:
+ fallback_models = config.context_window_fallbacks
+ elif config.fallbacks:
+ fallback_models = config.fallbacks
+
+ # Try fallback models
+ if fallback_models:
+ print(
+ f"[JAF:FALLBACK] Primary model '{current_model}' failed with {reason}. "
+ f"Trying {len(fallback_models)} fallback model(s)..."
+ )

- # Use retry wrapper to track retries in Langfuse
- response = await _retry_with_events(
- _api_call, state, config, operation_name="llm_call", max_retries=3, backoff_factor=1.0
- )
+ for i, fallback_model in enumerate(fallback_models, 1):
+ try:
+ # Emit fallback event
+ if config.on_event:
+ fallback_event = FallbackEvent(
+ data=FallbackEventData(
+ from_model=current_model,
+ to_model=fallback_model,
+ reason=reason,
+ fallback_type=fallback_type,
+ attempt=i,
+ trace_id=state.trace_id,
+ run_id=state.run_id,
+ error_details={
+ "error_type": type(last_exception).__name__,
+ "error_message": str(last_exception),
+ },
+ )
+ )
+ config.on_event(fallback_event)
+
+ print(
+ f"[JAF:FALLBACK] Attempting fallback {i}/{len(fallback_models)}: {fallback_model}"
+ )
+
+ # Try the fallback model
+ response = await _make_completion_call(fallback_model)
+ current_model = fallback_model
+ print(
+ f"[JAF:FALLBACK] Successfully used fallback model: {fallback_model}"
+ )
+ break # Success - exit the fallback loop
+
+ except Exception as fallback_error:
+ last_exception = fallback_error
+ print(
+ f"[JAF:FALLBACK] Fallback model '{fallback_model}' also failed: {fallback_error}"
+ )
+
+ # If this was the last fallback, re-raise
+ if i == len(fallback_models):
+ print(
+ f"[JAF:FALLBACK] All fallback models exhausted. Raising last exception."
+ )
+ raise
+ else:
+ # No fallbacks configured, re-raise original exception
+ raise

  # Return in the expected format that the engine expects
  choice = response.choices[0]
@@ -371,7 +506,7 @@ def make_litellm_provider(
  for tc in choice.message.tool_calls
  ]

- # Extract usage data
+ # Extract usage data with detailed cache information
  usage_data = None
  if response.usage:
  usage_data = {
@@ -380,6 +515,45 @@ def make_litellm_provider(
  "total_tokens": response.usage.total_tokens,
  }

+ # Extract cache-related fields if available (for prompt caching support)
+ if hasattr(response.usage, "cache_creation_input_tokens"):
+ usage_data["cache_creation_input_tokens"] = (
+ response.usage.cache_creation_input_tokens
+ )
+ if hasattr(response.usage, "cache_read_input_tokens"):
+ usage_data["cache_read_input_tokens"] = response.usage.cache_read_input_tokens
+
+ # Extract detailed token breakdowns
+ if (
+ hasattr(response.usage, "prompt_tokens_details")
+ and response.usage.prompt_tokens_details
+ ):
+ details = {}
+ if hasattr(response.usage.prompt_tokens_details, "cached_tokens"):
+ details["cached_tokens"] = (
+ response.usage.prompt_tokens_details.cached_tokens
+ )
+ if hasattr(response.usage.prompt_tokens_details, "audio_tokens"):
+ details["audio_tokens"] = response.usage.prompt_tokens_details.audio_tokens
+ if details:
+ usage_data["prompt_tokens_details"] = details
+
+ if (
+ hasattr(response.usage, "completion_tokens_details")
+ and response.usage.completion_tokens_details
+ ):
+ details = {}
+ if hasattr(response.usage.completion_tokens_details, "reasoning_tokens"):
+ details["reasoning_tokens"] = (
+ response.usage.completion_tokens_details.reasoning_tokens
+ )
+ if hasattr(response.usage.completion_tokens_details, "audio_tokens"):
+ details["audio_tokens"] = (
+ response.usage.completion_tokens_details.audio_tokens
+ )
+ if details:
+ usage_data["completion_tokens_details"] = details
+
  return {
  "id": response.id,
@@ -688,7 +862,12 @@ def make_litellm_sdk_provider(

  # Use retry wrapper to track retries in Langfuse
  response = await _retry_with_events(
- _api_call, state, config, operation_name="llm_call", max_retries=3, backoff_factor=1.0
+ _api_call,
+ state,
+ config,
+ operation_name="llm_call",
+ max_retries=3,
+ backoff_factor=1.0,
  )

  # Return in the expected format that the engine expects
@@ -706,8 +885,16 @@ def make_litellm_sdk_provider(
  for tc in choice.message.tool_calls
  ]

- # Extract usage data
- usage_data = None
+ # Extract usage data with detailed cache information - ALWAYS return a dict with defaults for Langfuse cost tracking
+ # Initialize with zeros as defensive default (matches AzureDirectProvider pattern)
+ usage_data = {
+ "prompt_tokens": 0,
+ "completion_tokens": 0,
+ "total_tokens": 0,
+ }
+
+ actual_model = getattr(response, "model", model_name)
+
  if response.usage:
  usage_data = {
  "prompt_tokens": response.usage.prompt_tokens,
@@ -715,12 +902,59 @@ def make_litellm_sdk_provider(
  "total_tokens": response.usage.total_tokens,
  }

+ # Extract cache-related fields if available (for prompt caching support)
+ if hasattr(response.usage, "cache_creation_input_tokens"):
+ usage_data["cache_creation_input_tokens"] = (
+ response.usage.cache_creation_input_tokens
+ )
+ if hasattr(response.usage, "cache_read_input_tokens"):
+ usage_data["cache_read_input_tokens"] = response.usage.cache_read_input_tokens
+
+ # Extract detailed token breakdowns
+ if (
+ hasattr(response.usage, "prompt_tokens_details")
+ and response.usage.prompt_tokens_details
+ ):
+ details = {}
+ if hasattr(response.usage.prompt_tokens_details, "cached_tokens"):
+ details["cached_tokens"] = (
+ response.usage.prompt_tokens_details.cached_tokens
+ )
+ if hasattr(response.usage.prompt_tokens_details, "audio_tokens"):
+ details["audio_tokens"] = response.usage.prompt_tokens_details.audio_tokens
+ if details:
+ usage_data["prompt_tokens_details"] = details
+
+ if (
+ hasattr(response.usage, "completion_tokens_details")
+ and response.usage.completion_tokens_details
+ ):
+ details = {}
+ if hasattr(response.usage.completion_tokens_details, "reasoning_tokens"):
+ details["reasoning_tokens"] = (
+ response.usage.completion_tokens_details.reasoning_tokens
+ )
+ if hasattr(response.usage.completion_tokens_details, "audio_tokens"):
+ details["audio_tokens"] = (
+ response.usage.completion_tokens_details.audio_tokens
+ )
+ if details:
+ usage_data["completion_tokens_details"] = details
+
+ message_content = {
+ "content": choice.message.content,
+ "tool_calls": tool_calls,
+ # CRITICAL: Embed usage and model here so trace collector can find them
+ "_usage": usage_data,
+ "_model": actual_model,
+ }
+
  return {
  "id": response.id,
  "created": response.created,
- "model": response.model,
+ "model": actual_model,
  "system_fingerprint": getattr(response, "system_fingerprint", None),
- "message": {"content": choice.message.content, "tool_calls": tool_calls},
+ "message": message_content,
  "usage": usage_data,
  "prompt": messages,
  }
@@ -769,6 +1003,7 @@ def make_litellm_sdk_provider(
  "model": model_name,
  "messages": messages,
  "stream": True,
+ "stream_options": {"include_usage": True}, # Request usage data in streaming
  **self.litellm_kwargs,
  }

@@ -804,14 +1039,30 @@ def make_litellm_sdk_provider(
  # Stream using litellm
  stream = await litellm.acompletion(**request_params)

+ accumulated_usage: Optional[Dict[str, int]] = None
+ response_model: Optional[str] = None
+
  async for chunk in stream:
  try:
  # Best-effort extraction of raw for debugging
  try:
  raw_obj = chunk.model_dump() if hasattr(chunk, "model_dump") else None
- except Exception:
+
+ # Capture usage from chunk if present
+ if raw_obj and "usage" in raw_obj and raw_obj["usage"]:
+ accumulated_usage = raw_obj["usage"]
+
+ # Capture model from chunk if present
+ if raw_obj and "model" in raw_obj and raw_obj["model"]:
+ response_model = raw_obj["model"]
+
+ except Exception as e:
  raw_obj = None

+ if raw_obj and "usage" in raw_obj and raw_obj["usage"]:
+ # Yield this chunk so engine.py can capture usage from raw
+ yield CompletionStreamChunk(delta="", raw=raw_obj)
+
  choice = None
  if getattr(chunk, "choices", None):
  choice = chunk.choices[0]
@@ -826,6 +1077,12 @@ def make_litellm_sdk_provider(
  if delta is not None:
  content_delta = getattr(delta, "content", None)
  if content_delta:
+ # Include accumulated usage and model in raw_obj for engine
+ if raw_obj and (accumulated_usage or response_model):
+ if accumulated_usage:
+ raw_obj["usage"] = accumulated_usage
+ if response_model:
+ raw_obj["model"] = response_model
  yield CompletionStreamChunk(delta=content_delta, raw=raw_obj)

  # Tool call deltas
@@ -841,6 +1098,13 @@ def make_litellm_sdk_provider(
  getattr(fn, "arguments", None) if fn is not None else None
  )

+ # Include accumulated usage and model in raw_obj
+ if raw_obj and (accumulated_usage or response_model):
+ if accumulated_usage:
+ raw_obj["usage"] = accumulated_usage
+ if response_model:
+ raw_obj["model"] = response_model
+
  yield CompletionStreamChunk(
  tool_call_delta=ToolCallDelta(
  index=idx,
@@ -857,6 +1121,12 @@ def make_litellm_sdk_provider(

  # Completion ended
  if finish_reason:
+ # Include accumulated usage and model in final chunk
+ if raw_obj and (accumulated_usage or response_model):
+ if accumulated_usage:
+ raw_obj["usage"] = accumulated_usage
+ if response_model:
+ raw_obj["model"] = response_model
  yield CompletionStreamChunk(
  is_done=True, finish_reason=finish_reason, raw=raw_obj
  )
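
Note: each fallback attempt is also surfaced through `RunConfig.on_event` as the new `FallbackEvent`, so callers can observe model switches without parsing the [JAF:FALLBACK] prints. A minimal observer sketch; the log format is illustrative:

    from jaf.core.types import FallbackEvent

    def on_event(event) -> None:
        # RunConfig.on_event receives every trace event; filter for fallbacks.
        if isinstance(event, FallbackEvent):
            d = event.data
            print(f"fallback attempt {d.attempt}: {d.from_model} -> {d.to_model} "
                  f"({d.fallback_type}: {d.reason})")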
jaf/server/server.py CHANGED
@@ -220,7 +220,7 @@ def _convert_core_message_to_http(core_msg: Message) -> HttpMessage:
  content=content,
  attachments=attachments,
  tool_call_id=core_msg.tool_call_id,
- tool_calls=core_msg.tool_calls,
+ tool_calls=[asdict(tc) for tc in core_msg.tool_calls] if core_msg.tool_calls else None,
  )
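
Note: the core message's tool calls are dataclass instances, which json cannot serialize directly; converting them with `asdict` keeps the HTTP payload serializable. A small illustration with a stand-in dataclass (not the package's own tool-call type):

    import json
    from dataclasses import asdict, dataclass

    @dataclass(frozen=True)
    class ToolCall:  # hypothetical stand-in
        id: str
        type: str
        function: dict

    tc = ToolCall(id="call_1", type="function", function={"name": "lookup", "arguments": "{}"})
    print(json.dumps([asdict(tc)]))  # works; json.dumps([tc]) would raise TypeError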
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: jaf-py
- Version: 2.6.2
+ Version: 2.6.4
  Summary: A purely functional agent framework with immutable state and composable tools - Python implementation
  Author: JAF Contributors
  Maintainer: JAF Contributors
@@ -82,7 +82,7 @@ Dynamic: license-file

  <!-- ![JAF Banner](docs/cover.png) -->

- [![Version](https://img.shields.io/badge/version-2.6.2-blue.svg)](https://github.com/xynehq/jaf-py)
+ [![Version](https://img.shields.io/badge/version-2.6.4-blue.svg)](https://github.com/xynehq/jaf-py)
  [![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/)
  [![Docs](https://img.shields.io/badge/Docs-Live-brightgreen)](https://xynehq.github.io/jaf-py/)

@@ -1,4 +1,4 @@
- jaf/__init__.py,sha256=Yc0wSawKKU2cvECCRJeJ_8mL6XwCPkbTHe74WmjVKRY,8652
+ jaf/__init__.py,sha256=ieEZNHk68b5MjZ39t9BhFzK19GqLV6w2TnWH8cwKPG0,8652
  jaf/cli.py,sha256=EDMMA5uX0e3TUIedLdyP3p4Qy-aXADvpht3VgJPJagU,8299
  jaf/exceptions.py,sha256=FdLIw7bdCNtBYfqRyJBkRT4Z1vWuvkzrMqFiMAzjL8Y,9158
  jaf/a2a/__init__.py,sha256=r4W-WHZNjoxR8EQ0x41_rY3fl12OH5qcSn0KycXaKKU,7752
@@ -43,7 +43,7 @@ jaf/core/agent_tool.py,sha256=gZje8_gZSaWCecySg2ZBK07RcD8bc2hxHsR4z87oKJE,12075
  jaf/core/analytics.py,sha256=ypdhllyOThXZB-TY_eR1t1n2qrnAVN7Ljb8PaOtJft0,23267
  jaf/core/checkpoint.py,sha256=O7mfi7gFOAUgJ3zHzgJsr11uzn-BU-Vj1iKyKjcirMk,8398
  jaf/core/composition.py,sha256=Tj0-FRTVWygmAfsBLld7pnZK4nrGMMBx2YYJW_KQPoo,25393
- jaf/core/engine.py,sha256=1jY8gBeNy00LgUKolAQRfF33C2L_xZ0j5nyI5OTAPyk,70271
+ jaf/core/engine.py,sha256=D_RtMWI43oSm7gK_J2kFRsJ2EJkHX4hMj0soUNXC92k,71179
  jaf/core/errors.py,sha256=iDw00o3WH0gHcenRcTj3QEbbloZVpgwnPij6mtaJJk4,5710
  jaf/core/guardrails.py,sha256=oPB7MpD3xWiCWoyaS-xQQp-glaPON7GNVrIL0h1Jefs,26931
  jaf/core/handoff.py,sha256=M7TQfd7BXuer1ZeRJ51nLsI55KifbM6faNtmA2Nsj3I,6196
@@ -56,8 +56,8 @@ jaf/core/state.py,sha256=fdWDc2DQ-o_g_8E4ibg2QM0Vad_XUique3a5iYBwGZo,9516
  jaf/core/streaming.py,sha256=5ntOtJrZVCHuGsygquyCLG2J5yuSxE6DN5OM-BrQiGw,16818
  jaf/core/tool_results.py,sha256=L9U3JDQAjAH5YR7iMpSxfVky2Nxo6FYQs4WE05RATaQ,11283
  jaf/core/tools.py,sha256=rHxzAfGVGpYk3YJKmrq3AQLW0oE3ACkiJBOwle2bLdc,15146
- jaf/core/tracing.py,sha256=-ZlIsfDRoFktiJgoY5R2d9lVjSASctKGjdUBWEuw-EE,57320
- jaf/core/types.py,sha256=MwHSXSamOz3QDjTEaOQzNqOMU1JxwFbHg8Fd9Xzw33Y,35576
+ jaf/core/tracing.py,sha256=gh_oAm8T7ENv7oV6-IRt9GnW-rsmWXMlLDFwr8NfeAI,59360
+ jaf/core/types.py,sha256=lJXlkL55devvzbc5efT5FdQ_LX3JcsMWA10Hy8Cd5Qs,37015
  jaf/core/workflows.py,sha256=0825AoD1QwEiGAs5IRlWHmaKrjurx6xF7oDJR6POBsg,25651
  jaf/memory/__init__.py,sha256=YfANOg5vUFSPVG7gpBE4_lYkV5X3_U6Yj9v1_QexfN0,1396
  jaf/memory/approval_storage.py,sha256=DcwtERcoIMH7B-abK9hqND3Moz4zSETsPlgJNkvqcaM,10573
@@ -75,10 +75,10 @@ jaf/policies/handoff.py,sha256=3lPegkSV_2LUf6jEZnj68_g3XUGFB_Fsj1C_6Svr2Kg,8128
  jaf/policies/validation.py,sha256=-zhB5ysH0Y4JnstHzo3I8tt-PFB9FSHBwSUALITBxw4,11016
  jaf/providers/__init__.py,sha256=PfIQkCtXb_yiTEjqs5msGv5-a6De2ujFCEaDGJEe_TQ,2100
  jaf/providers/mcp.py,sha256=fGfrlYx5g7ZX1fBUkPmAYSePKrCc4pG_HKngV_QCdRU,13148
- jaf/providers/model.py,sha256=MiPWEZl8MYAXLD010oX_qMCT7AkpGXIHLr9sTK4-xJM,45728
+ jaf/providers/model.py,sha256=FCnenKOLwh5JJ8hcXy7pemJb32EO0uvoww5ZTqd4mlE,58619
  jaf/server/__init__.py,sha256=cYqdruJCJ3W1AMmmxMjAnDlj9gh3XbHhtegjq4nYRNY,391
  jaf/server/main.py,sha256=usdCRZfDP3GWQchh1o2tHd4KqTTFyQQCD9w4khd9rSo,2113
- jaf/server/server.py,sha256=eVxc4w7XHwLFid_3X8lLp9EugUqeLLtVxS6Ikh485Io,51476
+ jaf/server/server.py,sha256=ZhZ2gmY10eQNaKUlE7ecMkrwMkYkAh-QgKdUJ2q7ktM,51532
  jaf/server/types.py,sha256=MsbADzpxVLlaVh0-VfgwbDybk1ZSavN5KSpPEamDEwE,14174
  jaf/utils/__init__.py,sha256=s3rsFFqSjsgRfnXrQFhcXXUc99HVFYizlfVbbkOYQDo,1229
  jaf/utils/attachments.py,sha256=SvZxEO7aCwl97bIJH3YtEYiuhBB6YcaBCp4UkXrWc4w,13179
@@ -89,9 +89,9 @@ jaf/visualization/functional_core.py,sha256=0Xs2R8ELADKNIgokcbjuxmWwxEyCH1yXIEdG
  jaf/visualization/graphviz.py,sha256=EwWVIRv8Z7gTiO5Spvcm-z_UUQ1oWNPRgdE33ZzFwx8,11569
  jaf/visualization/imperative_shell.py,sha256=N5lWzOLMIU_iCoy3n5WCg49eec8VxV8f7JIG6_wNtVw,2506
  jaf/visualization/types.py,sha256=90G8oClsFa_APqTuMrTW6KjD0oG9I4kVur773dXNW0E,1393
- jaf_py-2.6.2.dist-info/licenses/LICENSE,sha256=LXUQBJxdyr-7C4bk9cQBwvsF_xwA-UVstDTKabpcjlI,1063
- jaf_py-2.6.2.dist-info/METADATA,sha256=IHIsXU-V5KVPanz4Obos8xlgylgNlg2Q8vgvKToDg7Y,27743
- jaf_py-2.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- jaf_py-2.6.2.dist-info/entry_points.txt,sha256=OtIJeNJpb24kgGrqRx9szGgDx1vL9ayq8uHErmu7U5w,41
- jaf_py-2.6.2.dist-info/top_level.txt,sha256=Xu1RZbGaM4_yQX7bpalo881hg7N_dybaOW282F15ruE,4
- jaf_py-2.6.2.dist-info/RECORD,,
+ jaf_py-2.6.4.dist-info/licenses/LICENSE,sha256=LXUQBJxdyr-7C4bk9cQBwvsF_xwA-UVstDTKabpcjlI,1063
+ jaf_py-2.6.4.dist-info/METADATA,sha256=By3r8jZ5EwcA_-CetPgaeG2XY28pTqNBPl54uttx-a0,27743
+ jaf_py-2.6.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ jaf_py-2.6.4.dist-info/entry_points.txt,sha256=OtIJeNJpb24kgGrqRx9szGgDx1vL9ayq8uHErmu7U5w,41
+ jaf_py-2.6.4.dist-info/top_level.txt,sha256=Xu1RZbGaM4_yQX7bpalo881hg7N_dybaOW282F15ruE,4
+ jaf_py-2.6.4.dist-info/RECORD,,