jaf-py 2.6.2__py3-none-any.whl → 2.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jaf/__init__.py CHANGED
@@ -201,7 +201,7 @@ def generate_run_id() -> RunId:
  return create_run_id(str(uuid.uuid4()))


- __version__ = "2.6.2"
+ __version__ = "2.6.4"
  __all__ = [
  # Core types and functions
  "TraceId",
jaf/core/engine.py CHANGED
@@ -692,8 +692,19 @@ async def _run_internal(state: RunState[Ctx], config: RunConfig[Ctx]) -> RunResu
  aggregated_text = ""
  # Working array of partial tool calls
  partial_tool_calls: List[Dict[str, Any]] = []
+ # Capture usage and model from streaming chunks
+ stream_usage: Optional[Dict[str, int]] = None
+ stream_model: Optional[str] = None

  async for chunk in get_stream(state, current_agent, config): # type: ignore[arg-type]
+ # Extract usage and model from raw chunk if available
+ raw_chunk = getattr(chunk, "raw", None)
+ if raw_chunk:
+ if not stream_usage and "usage" in raw_chunk and raw_chunk["usage"]:
+ stream_usage = raw_chunk["usage"]
+ if not stream_model and "model" in raw_chunk and raw_chunk["model"]:
+ stream_model = raw_chunk["model"]
+
  # Text deltas
  delta_text = getattr(chunk, "delta", None)
  if delta_text:
@@ -803,6 +814,13 @@ async def _run_internal(state: RunState[Ctx], config: RunConfig[Ctx]) -> RunResu
  llm_response = {
  "message": {"content": aggregated_text or None, "tool_calls": final_tool_calls}
  }
+
+ # Preserve usage and model from streaming if captured
+ if stream_usage:
+ llm_response["usage"] = stream_usage
+ if stream_model:
+ llm_response["model"] = stream_model
+
  except Exception:
  # Fallback to non-streaming on error
  assistant_event_streamed = False
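
Note: before this change the streamed path synthesized a response with only the message, so streamed runs carried no token counts for cost tracking. A minimal sketch of the shape now produced when usage/model were seen in the raw chunks; the helper name and values below are illustrative, not engine code:

    from typing import Any, Dict, List, Optional

    def build_llm_response(
        aggregated_text: str,
        final_tool_calls: Optional[List[Dict[str, Any]]],
        stream_usage: Optional[Dict[str, int]],
        stream_model: Optional[str],
    ) -> Dict[str, Any]:
        llm_response: Dict[str, Any] = {
            "message": {"content": aggregated_text or None, "tool_calls": final_tool_calls}
        }
        if stream_usage:      # preserved so token accounting survives streaming
            llm_response["usage"] = stream_usage
        if stream_model:      # actual model name reported by the provider
            llm_response["model"] = stream_model
        return llm_response

    print(build_llm_response(
        "Hello!", None,
        {"prompt_tokens": 12, "completion_tokens": 3, "total_tokens": 15},
        "gpt-4o",
    ))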
jaf/core/tracing.py CHANGED
@@ -469,7 +469,7 @@ class LangfuseTraceCollector:
  public_key=public_key,
  secret_key=secret_key,
  host=host,
- release="jaf-py-v2.6.2",
+ release="jaf-py-v2.6.4",
  httpx_client=client,
  )
  self._httpx_client = client
@@ -753,7 +753,9 @@ class LangfuseTraceCollector:
  system_prompt = context.system_prompt

  if system_prompt:
- print(f"[LANGFUSE DEBUG] Extracted system_prompt: {system_prompt[:100] if isinstance(system_prompt, str) else system_prompt}...")
+ print(
+ f"[LANGFUSE DEBUG] Extracted system_prompt: {system_prompt[:100] if isinstance(system_prompt, str) else system_prompt}..."
+ )

  print(
  f"[LANGFUSE DEBUG] Final extracted - user_query: {user_query}, user_id: {user_id}"
@@ -911,25 +913,25 @@ class LangfuseTraceCollector:
  print(f"[LANGFUSE] Ending generation for LLM call")
  # End the generation
  generation = self.active_spans[span_id]
- choice = self._get_event_data(event, "choice", {})

- # Extract usage from the event data
+ choice = self._get_event_data(event, "choice", {})
  usage = self._get_event_data(event, "usage", {})
+ model = self._get_event_data(event, "model", "unknown")

- # Extract model information from choice data or event data
- model = choice.get("model", "unknown")
- if model == "unknown":
- # Try to get model from the choice response structure
- if isinstance(choice, dict):
- model = choice.get("model") or choice.get("id", "unknown")
+ # Also try to get model from the choice if not at top level
+ if model == "unknown" and isinstance(choice, dict):
+ model = choice.get("model", "unknown")

- # Convert to Langfuse v2 format - let Langfuse handle cost calculation automatically
+ print(f"[LANGFUSE] Extracted - model: '{model}', usage: {usage}")
+
+ # Convert to Langfuse format with detailed cache information
  langfuse_usage = None
  if usage:
  prompt_tokens = usage.get("prompt_tokens", 0)
  completion_tokens = usage.get("completion_tokens", 0)
  total_tokens = usage.get("total_tokens", 0)

+ # Build detailed usage dict with cache information
  langfuse_usage = {
  "input": prompt_tokens,
  "output": completion_tokens,
@@ -937,9 +939,40 @@
  "unit": "TOKENS",
  }

- print(
- f"[LANGFUSE] Usage data for automatic cost calculation: {langfuse_usage}"
- )
+ # Add cache-related fields if available (for prompt caching support)
+ if (
+ "cache_creation_input_tokens" in usage
+ and usage["cache_creation_input_tokens"]
+ ):
+ langfuse_usage["cache_creation_input_tokens"] = usage[
+ "cache_creation_input_tokens"
+ ]
+ if "cache_read_input_tokens" in usage and usage["cache_read_input_tokens"]:
+ langfuse_usage["cache_read_input_tokens"] = usage[
+ "cache_read_input_tokens"
+ ]
+
+ # Add detailed token breakdowns if available
+ if "prompt_tokens_details" in usage and usage["prompt_tokens_details"]:
+ details = usage["prompt_tokens_details"]
+ if "cached_tokens" in details and details["cached_tokens"]:
+ langfuse_usage["input_cached_tokens"] = details["cached_tokens"]
+ if "audio_tokens" in details and details["audio_tokens"]:
+ langfuse_usage["input_audio_tokens"] = details["audio_tokens"]
+
+ if (
+ "completion_tokens_details" in usage
+ and usage["completion_tokens_details"]
+ ):
+ details = usage["completion_tokens_details"]
+ if "reasoning_tokens" in details and details["reasoning_tokens"]:
+ langfuse_usage["output_reasoning_tokens"] = details[
+ "reasoning_tokens"
+ ]
+ if "audio_tokens" in details and details["audio_tokens"]:
+ langfuse_usage["output_audio_tokens"] = details["audio_tokens"]
+
+ print(f"[LANGFUSE] Usage data with cache details: {langfuse_usage}")

  # Include model information in the generation end - Langfuse will calculate costs automatically
  # Use compatibility wrapper for ending spans/generations
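
Note: the generation-end handler now takes `model` and `usage` straight from the event data and forwards cache and per-token-type details to Langfuse. An illustrative sketch of the enriched payload; the token counts are hypothetical and only keys added above are shown:

    langfuse_usage = {
        "input": 1200,                   # prompt_tokens
        "output": 350,                   # completion_tokens
        "unit": "TOKENS",
        "cache_read_input_tokens": 800,  # passed through when the provider reports prompt caching
        "input_cached_tokens": 800,      # from prompt_tokens_details["cached_tokens"]
        "output_reasoning_tokens": 120,  # from completion_tokens_details["reasoning_tokens"]
    }
    print(langfuse_usage)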
@@ -1260,7 +1293,10 @@ def create_composite_trace_collector(
  # Automatically add Langfuse collector if keys are configured
  if os.getenv("LANGFUSE_PUBLIC_KEY") and os.getenv("LANGFUSE_SECRET_KEY"):
  langfuse_collector = LangfuseTraceCollector(
- httpx_client=httpx_client, proxy=proxy, timeout=timeout, include_system_prompt=include_system_prompt
+ httpx_client=httpx_client,
+ proxy=proxy,
+ timeout=timeout,
+ include_system_prompt=include_system_prompt,
  )
  collector_list.append(langfuse_collector)

jaf/core/types.py CHANGED
@@ -1009,6 +1009,38 @@ class RetryEvent:
  )


+ @dataclass(frozen=True)
+ class FallbackEventData:
+ """Data for model fallback events."""
+
+ from_model: str # Model that failed
+ to_model: str # Fallback model being tried
+ reason: str # Reason for fallback (e.g., "Content Policy Violation", "Context Window Exceeded", "Rate Limit")
+ fallback_type: Literal["general", "content_policy", "context_window"] # Type of fallback
+ attempt: int # Which fallback attempt this is (1-indexed)
+ trace_id: TraceId
+ run_id: RunId
+ error_details: Optional[Dict[str, Any]] = None # Additional error context
+
+
+ @dataclass(frozen=True)
+ class FallbackEvent:
+ """Event emitted when a model fallback occurs."""
+
+ type: Literal["fallback"] = "fallback"
+ data: FallbackEventData = field(
+ default_factory=lambda: FallbackEventData(
+ from_model="",
+ to_model="",
+ reason="",
+ fallback_type="general",
+ attempt=1,
+ trace_id=TraceId(""),
+ run_id=RunId(""),
+ )
+ )
+
+
  # Union type for all trace events
  TraceEvent = Union[
  RunStartEvent,
@@ -1024,6 +1056,7 @@ TraceEvent = Union[
  HandoffEvent,
  RunEndEvent,
  RetryEvent,
+ FallbackEvent,
  ]


@@ -1096,7 +1129,9 @@ class RunConfig(Generic[Ctx]):
  agent_registry: Dict[str, Agent[Ctx, Any]]
  model_provider: ModelProvider[Ctx]
  max_turns: Optional[int] = 50
- max_tokens: Optional[int] = None # Default max_tokens for all agents (can be overridden per agent)
+ max_tokens: Optional[int] = (
+ None # Default max_tokens for all agents (can be overridden per agent)
+ )
  model_override: Optional[str] = None
  initial_input_guardrails: Optional[List[Guardrail]] = None
  final_output_guardrails: Optional[List[Guardrail]] = None
@@ -1120,7 +1155,7 @@ class RunConfig(Generic[Ctx]):
  [List[Message], RunState[Ctx]],
  Union[List[Message], Awaitable[List[Message]]],
  ]
- ] = None
+ ] = None
  max_empty_response_retries: int = 3 # Maximum retries when LLM returns empty response
  empty_response_retry_delay: float = (
  1.0 # Initial delay in seconds before retrying empty response (uses exponential backoff)
@@ -1129,6 +1164,14 @@ class RunConfig(Generic[Ctx]):
  prefer_streaming: Optional[bool] = (
  None # Whether to prefer streaming responses. None (default) = use streaming if available, True = prefer streaming, False = disable streaming
  )
+ # Model fallback configuration
+ fallbacks: Optional[List[str]] = None # List of fallback models to try if primary model fails
+ content_policy_fallbacks: Optional[List[str]] = (
+ None # Fallback models for content policy violations
+ )
+ context_window_fallbacks: Optional[List[str]] = (
+ None # Fallback models for context window exceeded errors
+ )


  # Regeneration types for conversation management
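
Note: the three new RunConfig fields drive the fallback logic added in jaf/providers/model.py below. A minimal sketch of how a caller might set them; `my_agents` and `my_provider` are placeholders for an application's agent registry and ModelProvider, and the model names are only examples:

    from jaf.core.types import RunConfig

    my_agents = {}     # placeholder: Dict[str, Agent] built by the application
    my_provider = ...  # placeholder: a ModelProvider instance

    config = RunConfig(
        agent_registry=my_agents,
        model_provider=my_provider,
        fallbacks=["gpt-4o-mini"],                       # generic failures (rate limits, 5xx, ...)
        content_policy_fallbacks=["claude-3-5-sonnet"],  # content-policy / safety blocks
        context_window_fallbacks=["gemini-1.5-pro"],     # context window exceeded
    )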
jaf/providers/model.py CHANGED
@@ -30,6 +30,8 @@ from ..core.types import (
  get_text_content,
  RetryEvent,
  RetryEventData,
+ FallbackEvent,
+ FallbackEventData,
  )
  from ..core.proxy import ProxyConfig
  from ..utils.document_processor import (
@@ -113,6 +115,55 @@ async def _is_vision_model(model: str, base_url: str) -> bool:
  return is_known_vision_model


+ def _classify_error_for_fallback(e: Exception) -> tuple[str, str]:
+ """
+ Classify an error to determine the fallback type and reason.
+
+ Args:
+ e: Exception from model call
+
+ Returns:
+ Tuple of (fallback_type, reason)
+ """
+ error_message = str(e).lower()
+ error_type = type(e).__name__
+
+ # Check for content policy violations
+ if (
+ "content" in error_message
+ and ("policy" in error_message or "filter" in error_message)
+ or "contentpolicyviolation" in error_type.lower()
+ or "content_filter" in error_message
+ or "safety" in error_message
+ ):
+ return ("content_policy", "Content Policy Violation")
+
+ # Check for context window exceeded
+ if (
+ "context" in error_message
+ and "window" in error_message
+ or "too long" in error_message
+ or "maximum context" in error_message
+ or "contextwindowexceeded" in error_type.lower()
+ or "prompt is too long" in error_message
+ or "tokens" in error_message
+ and "limit" in error_message
+ ):
+ return ("context_window", "Context Window Exceeded")
+
+ # Default to general fallback
+ if hasattr(e, "status_code"):
+ status_code = e.status_code
+ if status_code == 429:
+ return ("general", f"HTTP {status_code} - Rate Limit")
+ elif 500 <= status_code < 600:
+ return ("general", f"HTTP {status_code} - Server Error")
+ else:
+ return ("general", f"HTTP {status_code}")
+
+ return ("general", error_type)
+
+
  async def _retry_with_events(
  operation_func,
  state: RunState,
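
Note: `_classify_error_for_fallback` is a private module-level helper and its checks are substring matches on the error text and type name, so results depend on how the provider phrases its errors. Illustrative expectations under that assumption; the exception classes here are stand-ins:

    from jaf.providers.model import _classify_error_for_fallback

    class FakeRateLimitError(Exception):  # stand-in for a provider error exposing status_code
        status_code = 429

    print(_classify_error_for_fallback(ValueError("request blocked: content policy violation")))
    # ('content_policy', 'Content Policy Violation')
    print(_classify_error_for_fallback(RuntimeError("prompt is too long for this model")))
    # ('context_window', 'Context Window Exceeded')
    print(_classify_error_for_fallback(FakeRateLimitError("slow down")))
    # ('general', 'HTTP 429 - Rate Limit')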
@@ -259,10 +310,10 @@ def make_litellm_provider(
  async def get_completion(
  self, state: RunState[Ctx], agent: Agent[Ctx, Any], config: RunConfig[Ctx]
  ) -> Dict[str, Any]:
- """Get completion from the model."""
+ """Get completion from the model with fallback support."""

- # Determine model to use
- model = config.model_override or (
+ # Determine initial model to use
+ primary_model = config.model_override or (
  agent.model_config.name if agent.model_config else "gpt-4o"
  )

@@ -277,10 +328,10 @@
  )

  if has_image_content:
- supports_vision = await _is_vision_model(model, base_url)
+ supports_vision = await _is_vision_model(primary_model, base_url)
  if not supports_vision:
  raise ValueError(
- f"Model {model} does not support vision capabilities. "
+ f"Model {primary_model} does not support vision capabilities. "
  f"Please use a vision-capable model like gpt-4o, claude-3-5-sonnet, or gemini-1.5-pro."
  )

@@ -322,39 +373,123 @@ def make_litellm_provider(
  last_message.role == ContentRole.TOOL or last_message.role == "tool"
  )

- # Prepare request parameters
- request_params = {"model": model, "messages": messages, "stream": False}
-
- # Add optional parameters
- if agent.model_config:
- if agent.model_config.temperature is not None:
- request_params["temperature"] = agent.model_config.temperature
- # Use agent's max_tokens if set, otherwise fall back to config's max_tokens
- max_tokens = agent.model_config.max_tokens
- if max_tokens is None:
- max_tokens = config.max_tokens
- if max_tokens is not None:
- request_params["max_tokens"] = max_tokens
- elif config.max_tokens is not None:
- # No model_config but config has max_tokens
- request_params["max_tokens"] = config.max_tokens
-
- if tools:
- request_params["tools"] = tools
- # Always set tool_choice to auto when tools are available
- request_params["tool_choice"] = "auto"
-
- if agent.output_codec:
- request_params["response_format"] = {"type": "json_object"}
+ # Helper function to make API call with a specific model
+ async def _make_completion_call(model_name: str) -> Dict[str, Any]:
+ # Prepare request parameters
+ request_params = {"model": model_name, "messages": messages, "stream": False}
+
+ # Add optional parameters
+ if agent.model_config:
+ if agent.model_config.temperature is not None:
+ request_params["temperature"] = agent.model_config.temperature
+ # Use agent's max_tokens if set, otherwise fall back to config's max_tokens
+ max_tokens = agent.model_config.max_tokens
+ if max_tokens is None:
+ max_tokens = config.max_tokens
+ if max_tokens is not None:
+ request_params["max_tokens"] = max_tokens
+ elif config.max_tokens is not None:
+ # No model_config but config has max_tokens
+ request_params["max_tokens"] = config.max_tokens
+
+ if tools:
+ request_params["tools"] = tools
+ # Always set tool_choice to auto when tools are available
+ request_params["tool_choice"] = "auto"
+
+ if agent.output_codec:
+ request_params["response_format"] = {"type": "json_object"}
+
+ # Make the API call with retry handling
+ async def _api_call():
+ return await self.client.chat.completions.create(**request_params)
+
+ # Use retry wrapper to track retries in Langfuse
+ return await _retry_with_events(
+ _api_call,
+ state,
+ config,
+ operation_name="llm_call",
+ max_retries=3,
+ backoff_factor=1.0,
+ )

- # Make the API call with retry handling
- async def _api_call():
- return await self.client.chat.completions.create(**request_params)
+ # Try primary model first
+ last_exception = None
+ current_model = primary_model
+
+ try:
+ response = await _make_completion_call(current_model)
+ except Exception as e:
+ last_exception = e
+
+ # Classify the error to determine which fallback list to use
+ fallback_type, reason = _classify_error_for_fallback(e)
+
+ # Determine which fallback list to use
+ fallback_models = []
+ if fallback_type == "content_policy" and config.content_policy_fallbacks:
+ fallback_models = config.content_policy_fallbacks
+ elif fallback_type == "context_window" and config.context_window_fallbacks:
+ fallback_models = config.context_window_fallbacks
+ elif config.fallbacks:
+ fallback_models = config.fallbacks
+
+ # Try fallback models
+ if fallback_models:
+ print(
+ f"[JAF:FALLBACK] Primary model '{current_model}' failed with {reason}. "
+ f"Trying {len(fallback_models)} fallback model(s)..."
+ )

- # Use retry wrapper to track retries in Langfuse
- response = await _retry_with_events(
- _api_call, state, config, operation_name="llm_call", max_retries=3, backoff_factor=1.0
- )
+ for i, fallback_model in enumerate(fallback_models, 1):
+ try:
+ # Emit fallback event
+ if config.on_event:
+ fallback_event = FallbackEvent(
+ data=FallbackEventData(
+ from_model=current_model,
+ to_model=fallback_model,
+ reason=reason,
+ fallback_type=fallback_type,
+ attempt=i,
+ trace_id=state.trace_id,
+ run_id=state.run_id,
+ error_details={
+ "error_type": type(last_exception).__name__,
+ "error_message": str(last_exception),
+ },
+ )
+ )
+ config.on_event(fallback_event)
+
+ print(
+ f"[JAF:FALLBACK] Attempting fallback {i}/{len(fallback_models)}: {fallback_model}"
+ )
+
+ # Try the fallback model
+ response = await _make_completion_call(fallback_model)
+ current_model = fallback_model
+ print(
+ f"[JAF:FALLBACK] Successfully used fallback model: {fallback_model}"
+ )
+ break # Success - exit the fallback loop
+
+ except Exception as fallback_error:
+ last_exception = fallback_error
+ print(
+ f"[JAF:FALLBACK] Fallback model '{fallback_model}' also failed: {fallback_error}"
+ )
+
+ # If this was the last fallback, re-raise
+ if i == len(fallback_models):
+ print(
+ f"[JAF:FALLBACK] All fallback models exhausted. Raising last exception."
+ )
+ raise
+ else:
+ # No fallbacks configured, re-raise original exception
+ raise

  # Return in the expected format that the engine expects
  choice = response.choices[0]
@@ -371,7 +506,7 @@ def make_litellm_provider(
  for tc in choice.message.tool_calls
  ]

- # Extract usage data
+ # Extract usage data with detailed cache information
  usage_data = None
  if response.usage:
  usage_data = {
@@ -380,6 +515,45 @@ def make_litellm_provider(
  "total_tokens": response.usage.total_tokens,
  }

+ # Extract cache-related fields if available (for prompt caching support)
+ if hasattr(response.usage, "cache_creation_input_tokens"):
+ usage_data["cache_creation_input_tokens"] = (
+ response.usage.cache_creation_input_tokens
+ )
+ if hasattr(response.usage, "cache_read_input_tokens"):
+ usage_data["cache_read_input_tokens"] = response.usage.cache_read_input_tokens
+
+ # Extract detailed token breakdowns
+ if (
+ hasattr(response.usage, "prompt_tokens_details")
+ and response.usage.prompt_tokens_details
+ ):
+ details = {}
+ if hasattr(response.usage.prompt_tokens_details, "cached_tokens"):
+ details["cached_tokens"] = (
+ response.usage.prompt_tokens_details.cached_tokens
+ )
+ if hasattr(response.usage.prompt_tokens_details, "audio_tokens"):
+ details["audio_tokens"] = response.usage.prompt_tokens_details.audio_tokens
+ if details:
+ usage_data["prompt_tokens_details"] = details
+
+ if (
+ hasattr(response.usage, "completion_tokens_details")
+ and response.usage.completion_tokens_details
+ ):
+ details = {}
+ if hasattr(response.usage.completion_tokens_details, "reasoning_tokens"):
+ details["reasoning_tokens"] = (
+ response.usage.completion_tokens_details.reasoning_tokens
+ )
+ if hasattr(response.usage.completion_tokens_details, "audio_tokens"):
+ details["audio_tokens"] = (
+ response.usage.completion_tokens_details.audio_tokens
+ )
+ if details:
+ usage_data["completion_tokens_details"] = details
+
  return {
  "id": response.id,
@@ -688,7 +862,12 @@ def make_litellm_sdk_provider(

  # Use retry wrapper to track retries in Langfuse
  response = await _retry_with_events(
- _api_call, state, config, operation_name="llm_call", max_retries=3, backoff_factor=1.0
+ _api_call,
+ state,
+ config,
+ operation_name="llm_call",
+ max_retries=3,
+ backoff_factor=1.0,
  )

  # Return in the expected format that the engine expects
@@ -706,8 +885,16 @@ def make_litellm_sdk_provider(
  for tc in choice.message.tool_calls
  ]

- # Extract usage data
- usage_data = None
+ # Extract usage data with detailed cache information - ALWAYS return a dict with defaults for Langfuse cost tracking
+ # Initialize with zeros as defensive default (matches AzureDirectProvider pattern)
+ usage_data = {
+ "prompt_tokens": 0,
+ "completion_tokens": 0,
+ "total_tokens": 0,
+ }
+
+ actual_model = getattr(response, "model", model_name)
+
  if response.usage:
  usage_data = {
  "prompt_tokens": response.usage.prompt_tokens,
@@ -715,12 +902,59 @@ def make_litellm_sdk_provider(
  "total_tokens": response.usage.total_tokens,
  }

+ # Extract cache-related fields if available (for prompt caching support)
+ if hasattr(response.usage, "cache_creation_input_tokens"):
+ usage_data["cache_creation_input_tokens"] = (
+ response.usage.cache_creation_input_tokens
+ )
+ if hasattr(response.usage, "cache_read_input_tokens"):
+ usage_data["cache_read_input_tokens"] = response.usage.cache_read_input_tokens
+
+ # Extract detailed token breakdowns
+ if (
+ hasattr(response.usage, "prompt_tokens_details")
+ and response.usage.prompt_tokens_details
+ ):
+ details = {}
+ if hasattr(response.usage.prompt_tokens_details, "cached_tokens"):
+ details["cached_tokens"] = (
+ response.usage.prompt_tokens_details.cached_tokens
+ )
+ if hasattr(response.usage.prompt_tokens_details, "audio_tokens"):
+ details["audio_tokens"] = response.usage.prompt_tokens_details.audio_tokens
+ if details:
+ usage_data["prompt_tokens_details"] = details
+
+ if (
+ hasattr(response.usage, "completion_tokens_details")
+ and response.usage.completion_tokens_details
+ ):
+ details = {}
+ if hasattr(response.usage.completion_tokens_details, "reasoning_tokens"):
+ details["reasoning_tokens"] = (
+ response.usage.completion_tokens_details.reasoning_tokens
+ )
+ if hasattr(response.usage.completion_tokens_details, "audio_tokens"):
+ details["audio_tokens"] = (
+ response.usage.completion_tokens_details.audio_tokens
+ )
+ if details:
+ usage_data["completion_tokens_details"] = details
+
+ message_content = {
+ "content": choice.message.content,
+ "tool_calls": tool_calls,
+ # CRITICAL: Embed usage and model here so trace collector can find them
+ "_usage": usage_data,
+ "_model": actual_model,
+ }
+
  return {
  "id": response.id,
  "created": response.created,
- "model": response.model,
+ "model": actual_model,
  "system_fingerprint": getattr(response, "system_fingerprint", None),
- "message": {"content": choice.message.content, "tool_calls": tool_calls},
+ "message": message_content,
  "usage": usage_data,
  "prompt": messages,
  }
@@ -769,6 +1003,7 @@ def make_litellm_sdk_provider(
  "model": model_name,
  "messages": messages,
  "stream": True,
+ "stream_options": {"include_usage": True}, # Request usage data in streaming
  **self.litellm_kwargs,
  }

@@ -804,14 +1039,30 @@ def make_litellm_sdk_provider(
  # Stream using litellm
  stream = await litellm.acompletion(**request_params)

+ accumulated_usage: Optional[Dict[str, int]] = None
+ response_model: Optional[str] = None
+
  async for chunk in stream:
  try:
  # Best-effort extraction of raw for debugging
  try:
  raw_obj = chunk.model_dump() if hasattr(chunk, "model_dump") else None
- except Exception:
+
+ # Capture usage from chunk if present
+ if raw_obj and "usage" in raw_obj and raw_obj["usage"]:
+ accumulated_usage = raw_obj["usage"]
+
+ # Capture model from chunk if present
+ if raw_obj and "model" in raw_obj and raw_obj["model"]:
+ response_model = raw_obj["model"]
+
+ except Exception as e:
  raw_obj = None

+ if raw_obj and "usage" in raw_obj and raw_obj["usage"]:
+ # Yield this chunk so engine.py can capture usage from raw
+ yield CompletionStreamChunk(delta="", raw=raw_obj)
+
  choice = None
  if getattr(chunk, "choices", None):
  choice = chunk.choices[0]
@@ -826,6 +1077,12 @@ def make_litellm_sdk_provider(
  if delta is not None:
  content_delta = getattr(delta, "content", None)
  if content_delta:
+ # Include accumulated usage and model in raw_obj for engine
+ if raw_obj and (accumulated_usage or response_model):
+ if accumulated_usage:
+ raw_obj["usage"] = accumulated_usage
+ if response_model:
+ raw_obj["model"] = response_model
  yield CompletionStreamChunk(delta=content_delta, raw=raw_obj)

  # Tool call deltas
@@ -841,6 +1098,13 @@ def make_litellm_sdk_provider(
  getattr(fn, "arguments", None) if fn is not None else None
  )

+ # Include accumulated usage and model in raw_obj
+ if raw_obj and (accumulated_usage or response_model):
+ if accumulated_usage:
+ raw_obj["usage"] = accumulated_usage
+ if response_model:
+ raw_obj["model"] = response_model
+
  yield CompletionStreamChunk(
  tool_call_delta=ToolCallDelta(
  index=idx,
@@ -857,6 +1121,12 @@ def make_litellm_sdk_provider(

  # Completion ended
  if finish_reason:
+ # Include accumulated usage and model in final chunk
+ if raw_obj and (accumulated_usage or response_model):
+ if accumulated_usage:
+ raw_obj["usage"] = accumulated_usage
+ if response_model:
+ raw_obj["model"] = response_model
  yield CompletionStreamChunk(
  is_done=True, finish_reason=finish_reason, raw=raw_obj
  )
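
Note: each fallback attempt is also surfaced through `RunConfig.on_event` as the new `FallbackEvent`, so callers can observe model switches without parsing the [JAF:FALLBACK] prints. A minimal observer sketch; the log format is illustrative:

    from jaf.core.types import FallbackEvent

    def on_event(event) -> None:
        # RunConfig.on_event receives every trace event; filter for fallbacks.
        if isinstance(event, FallbackEvent):
            d = event.data
            print(f"fallback attempt {d.attempt}: {d.from_model} -> {d.to_model} "
                  f"({d.fallback_type}: {d.reason})")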
jaf/server/server.py CHANGED
@@ -220,7 +220,7 @@ def _convert_core_message_to_http(core_msg: Message) -> HttpMessage:
  content=content,
  attachments=attachments,
  tool_call_id=core_msg.tool_call_id,
- tool_calls=core_msg.tool_calls,
+ tool_calls=[asdict(tc) for tc in core_msg.tool_calls] if core_msg.tool_calls else None,
  )
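
Note: the core message's tool calls are dataclass instances, which json cannot serialize directly; converting them with `asdict` keeps the HTTP payload serializable. A small illustration with a stand-in dataclass (not the package's own tool-call type):

    import json
    from dataclasses import asdict, dataclass

    @dataclass(frozen=True)
    class ToolCall:  # hypothetical stand-in
        id: str
        type: str
        function: dict

    tc = ToolCall(id="call_1", type="function", function={"name": "lookup", "arguments": "{}"})
    print(json.dumps([asdict(tc)]))  # works; json.dumps([tc]) would raise TypeError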
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: jaf-py
- Version: 2.6.2
+ Version: 2.6.4
  Summary: A purely functional agent framework with immutable state and composable tools - Python implementation
  Author: JAF Contributors
  Maintainer: JAF Contributors
@@ -82,7 +82,7 @@ Dynamic: license-file

  <!-- ![JAF Banner](docs/cover.png) -->

- [![Version](https://img.shields.io/badge/version-2.6.2-blue.svg)](https://github.com/xynehq/jaf-py)
+ [![Version](https://img.shields.io/badge/version-2.6.4-blue.svg)](https://github.com/xynehq/jaf-py)
  [![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/)
  [![Docs](https://img.shields.io/badge/Docs-Live-brightgreen)](https://xynehq.github.io/jaf-py/)

@@ -1,4 +1,4 @@
- jaf/__init__.py,sha256=Yc0wSawKKU2cvECCRJeJ_8mL6XwCPkbTHe74WmjVKRY,8652
+ jaf/__init__.py,sha256=ieEZNHk68b5MjZ39t9BhFzK19GqLV6w2TnWH8cwKPG0,8652
  jaf/cli.py,sha256=EDMMA5uX0e3TUIedLdyP3p4Qy-aXADvpht3VgJPJagU,8299
  jaf/exceptions.py,sha256=FdLIw7bdCNtBYfqRyJBkRT4Z1vWuvkzrMqFiMAzjL8Y,9158
  jaf/a2a/__init__.py,sha256=r4W-WHZNjoxR8EQ0x41_rY3fl12OH5qcSn0KycXaKKU,7752
@@ -43,7 +43,7 @@ jaf/core/agent_tool.py,sha256=gZje8_gZSaWCecySg2ZBK07RcD8bc2hxHsR4z87oKJE,12075
  jaf/core/analytics.py,sha256=ypdhllyOThXZB-TY_eR1t1n2qrnAVN7Ljb8PaOtJft0,23267
  jaf/core/checkpoint.py,sha256=O7mfi7gFOAUgJ3zHzgJsr11uzn-BU-Vj1iKyKjcirMk,8398
  jaf/core/composition.py,sha256=Tj0-FRTVWygmAfsBLld7pnZK4nrGMMBx2YYJW_KQPoo,25393
- jaf/core/engine.py,sha256=1jY8gBeNy00LgUKolAQRfF33C2L_xZ0j5nyI5OTAPyk,70271
+ jaf/core/engine.py,sha256=D_RtMWI43oSm7gK_J2kFRsJ2EJkHX4hMj0soUNXC92k,71179
  jaf/core/errors.py,sha256=iDw00o3WH0gHcenRcTj3QEbbloZVpgwnPij6mtaJJk4,5710
  jaf/core/guardrails.py,sha256=oPB7MpD3xWiCWoyaS-xQQp-glaPON7GNVrIL0h1Jefs,26931
  jaf/core/handoff.py,sha256=M7TQfd7BXuer1ZeRJ51nLsI55KifbM6faNtmA2Nsj3I,6196
@@ -56,8 +56,8 @@ jaf/core/state.py,sha256=fdWDc2DQ-o_g_8E4ibg2QM0Vad_XUique3a5iYBwGZo,9516
  jaf/core/streaming.py,sha256=5ntOtJrZVCHuGsygquyCLG2J5yuSxE6DN5OM-BrQiGw,16818
  jaf/core/tool_results.py,sha256=L9U3JDQAjAH5YR7iMpSxfVky2Nxo6FYQs4WE05RATaQ,11283
  jaf/core/tools.py,sha256=rHxzAfGVGpYk3YJKmrq3AQLW0oE3ACkiJBOwle2bLdc,15146
- jaf/core/tracing.py,sha256=-ZlIsfDRoFktiJgoY5R2d9lVjSASctKGjdUBWEuw-EE,57320
- jaf/core/types.py,sha256=MwHSXSamOz3QDjTEaOQzNqOMU1JxwFbHg8Fd9Xzw33Y,35576
+ jaf/core/tracing.py,sha256=gh_oAm8T7ENv7oV6-IRt9GnW-rsmWXMlLDFwr8NfeAI,59360
+ jaf/core/types.py,sha256=lJXlkL55devvzbc5efT5FdQ_LX3JcsMWA10Hy8Cd5Qs,37015
  jaf/core/workflows.py,sha256=0825AoD1QwEiGAs5IRlWHmaKrjurx6xF7oDJR6POBsg,25651
  jaf/memory/__init__.py,sha256=YfANOg5vUFSPVG7gpBE4_lYkV5X3_U6Yj9v1_QexfN0,1396
  jaf/memory/approval_storage.py,sha256=DcwtERcoIMH7B-abK9hqND3Moz4zSETsPlgJNkvqcaM,10573
@@ -75,10 +75,10 @@ jaf/policies/handoff.py,sha256=3lPegkSV_2LUf6jEZnj68_g3XUGFB_Fsj1C_6Svr2Kg,8128
  jaf/policies/validation.py,sha256=-zhB5ysH0Y4JnstHzo3I8tt-PFB9FSHBwSUALITBxw4,11016
  jaf/providers/__init__.py,sha256=PfIQkCtXb_yiTEjqs5msGv5-a6De2ujFCEaDGJEe_TQ,2100
  jaf/providers/mcp.py,sha256=fGfrlYx5g7ZX1fBUkPmAYSePKrCc4pG_HKngV_QCdRU,13148
- jaf/providers/model.py,sha256=MiPWEZl8MYAXLD010oX_qMCT7AkpGXIHLr9sTK4-xJM,45728
+ jaf/providers/model.py,sha256=FCnenKOLwh5JJ8hcXy7pemJb32EO0uvoww5ZTqd4mlE,58619
  jaf/server/__init__.py,sha256=cYqdruJCJ3W1AMmmxMjAnDlj9gh3XbHhtegjq4nYRNY,391
  jaf/server/main.py,sha256=usdCRZfDP3GWQchh1o2tHd4KqTTFyQQCD9w4khd9rSo,2113
- jaf/server/server.py,sha256=eVxc4w7XHwLFid_3X8lLp9EugUqeLLtVxS6Ikh485Io,51476
+ jaf/server/server.py,sha256=ZhZ2gmY10eQNaKUlE7ecMkrwMkYkAh-QgKdUJ2q7ktM,51532
  jaf/server/types.py,sha256=MsbADzpxVLlaVh0-VfgwbDybk1ZSavN5KSpPEamDEwE,14174
  jaf/utils/__init__.py,sha256=s3rsFFqSjsgRfnXrQFhcXXUc99HVFYizlfVbbkOYQDo,1229
  jaf/utils/attachments.py,sha256=SvZxEO7aCwl97bIJH3YtEYiuhBB6YcaBCp4UkXrWc4w,13179
@@ -89,9 +89,9 @@ jaf/visualization/functional_core.py,sha256=0Xs2R8ELADKNIgokcbjuxmWwxEyCH1yXIEdG
  jaf/visualization/graphviz.py,sha256=EwWVIRv8Z7gTiO5Spvcm-z_UUQ1oWNPRgdE33ZzFwx8,11569
  jaf/visualization/imperative_shell.py,sha256=N5lWzOLMIU_iCoy3n5WCg49eec8VxV8f7JIG6_wNtVw,2506
  jaf/visualization/types.py,sha256=90G8oClsFa_APqTuMrTW6KjD0oG9I4kVur773dXNW0E,1393
- jaf_py-2.6.2.dist-info/licenses/LICENSE,sha256=LXUQBJxdyr-7C4bk9cQBwvsF_xwA-UVstDTKabpcjlI,1063
- jaf_py-2.6.2.dist-info/METADATA,sha256=IHIsXU-V5KVPanz4Obos8xlgylgNlg2Q8vgvKToDg7Y,27743
- jaf_py-2.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- jaf_py-2.6.2.dist-info/entry_points.txt,sha256=OtIJeNJpb24kgGrqRx9szGgDx1vL9ayq8uHErmu7U5w,41
- jaf_py-2.6.2.dist-info/top_level.txt,sha256=Xu1RZbGaM4_yQX7bpalo881hg7N_dybaOW282F15ruE,4
- jaf_py-2.6.2.dist-info/RECORD,,
+ jaf_py-2.6.4.dist-info/licenses/LICENSE,sha256=LXUQBJxdyr-7C4bk9cQBwvsF_xwA-UVstDTKabpcjlI,1063
+ jaf_py-2.6.4.dist-info/METADATA,sha256=By3r8jZ5EwcA_-CetPgaeG2XY28pTqNBPl54uttx-a0,27743
+ jaf_py-2.6.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ jaf_py-2.6.4.dist-info/entry_points.txt,sha256=OtIJeNJpb24kgGrqRx9szGgDx1vL9ayq8uHErmu7U5w,41
+ jaf_py-2.6.4.dist-info/top_level.txt,sha256=Xu1RZbGaM4_yQX7bpalo881hg7N_dybaOW282F15ruE,4
+ jaf_py-2.6.4.dist-info/RECORD,,