jaf-py 2.6.2__py3-none-any.whl → 2.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jaf/__init__.py +1 -1
- jaf/core/engine.py +18 -0
- jaf/core/tracing.py +51 -15
- jaf/core/types.py +45 -2
- jaf/providers/model.py +313 -43
- jaf/server/server.py +1 -1
- {jaf_py-2.6.2.dist-info → jaf_py-2.6.4.dist-info}/METADATA +2 -2
- {jaf_py-2.6.2.dist-info → jaf_py-2.6.4.dist-info}/RECORD +12 -12
- {jaf_py-2.6.2.dist-info → jaf_py-2.6.4.dist-info}/WHEEL +0 -0
- {jaf_py-2.6.2.dist-info → jaf_py-2.6.4.dist-info}/entry_points.txt +0 -0
- {jaf_py-2.6.2.dist-info → jaf_py-2.6.4.dist-info}/licenses/LICENSE +0 -0
- {jaf_py-2.6.2.dist-info → jaf_py-2.6.4.dist-info}/top_level.txt +0 -0
jaf/__init__.py
CHANGED
jaf/core/engine.py
CHANGED
@@ -692,8 +692,19 @@ async def _run_internal(state: RunState[Ctx], config: RunConfig[Ctx]) -> RunResu
         aggregated_text = ""
         # Working array of partial tool calls
         partial_tool_calls: List[Dict[str, Any]] = []
+        # Capture usage and model from streaming chunks
+        stream_usage: Optional[Dict[str, int]] = None
+        stream_model: Optional[str] = None

         async for chunk in get_stream(state, current_agent, config):  # type: ignore[arg-type]
+            # Extract usage and model from raw chunk if available
+            raw_chunk = getattr(chunk, "raw", None)
+            if raw_chunk:
+                if not stream_usage and "usage" in raw_chunk and raw_chunk["usage"]:
+                    stream_usage = raw_chunk["usage"]
+                if not stream_model and "model" in raw_chunk and raw_chunk["model"]:
+                    stream_model = raw_chunk["model"]
+
             # Text deltas
             delta_text = getattr(chunk, "delta", None)
             if delta_text:
@@ -803,6 +814,13 @@ async def _run_internal(state: RunState[Ctx], config: RunConfig[Ctx]) -> RunResu
         llm_response = {
             "message": {"content": aggregated_text or None, "tool_calls": final_tool_calls}
         }
+
+        # Preserve usage and model from streaming if captured
+        if stream_usage:
+            llm_response["usage"] = stream_usage
+        if stream_model:
+            llm_response["model"] = stream_model
+
     except Exception:
         # Fallback to non-streaming on error
         assistant_event_streamed = False
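For context, the capture pattern above can be exercised in isolation. This is a minimal standalone sketch (FakeChunk and fake_stream are illustrative stand-ins, not jaf classes): the provider attaches usage and model to a chunk's raw payload, and the consumer keeps the first non-empty values it sees, like the stream_usage/stream_model guards added to the engine.

import asyncio
from dataclasses import dataclass
from typing import Any, Dict, Optional


@dataclass
class FakeChunk:
    # Illustrative stand-in for a streaming chunk that carries a raw provider payload.
    delta: str = ""
    raw: Optional[Dict[str, Any]] = None


async def fake_stream():
    yield FakeChunk(delta="Hel")
    yield FakeChunk(delta="lo", raw={"model": "gpt-4o-2024-08-06"})
    yield FakeChunk(raw={"usage": {"prompt_tokens": 12, "completion_tokens": 2, "total_tokens": 14}})


async def consume() -> Dict[str, Any]:
    aggregated_text = ""
    stream_usage: Optional[Dict[str, int]] = None
    stream_model: Optional[str] = None

    async for chunk in fake_stream():
        raw_chunk = getattr(chunk, "raw", None)
        if raw_chunk:
            # First non-empty value wins, mirroring the guards in the hunk above.
            if not stream_usage and raw_chunk.get("usage"):
                stream_usage = raw_chunk["usage"]
            if not stream_model and raw_chunk.get("model"):
                stream_model = raw_chunk["model"]
        aggregated_text += chunk.delta or ""

    # Attach the captured metadata to the final response, as the engine now does.
    llm_response: Dict[str, Any] = {"message": {"content": aggregated_text or None}}
    if stream_usage:
        llm_response["usage"] = stream_usage
    if stream_model:
        llm_response["model"] = stream_model
    return llm_response


print(asyncio.run(consume()))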
jaf/core/tracing.py
CHANGED
@@ -469,7 +469,7 @@ class LangfuseTraceCollector:
             public_key=public_key,
             secret_key=secret_key,
             host=host,
-            release="jaf-py-v2.6.
+            release="jaf-py-v2.6.4",
             httpx_client=client,
         )
         self._httpx_client = client
@@ -753,7 +753,9 @@ class LangfuseTraceCollector:
         system_prompt = context.system_prompt

         if system_prompt:
-            print(
+            print(
+                f"[LANGFUSE DEBUG] Extracted system_prompt: {system_prompt[:100] if isinstance(system_prompt, str) else system_prompt}..."
+            )

         print(
             f"[LANGFUSE DEBUG] Final extracted - user_query: {user_query}, user_id: {user_id}"
@@ -911,25 +913,25 @@ class LangfuseTraceCollector:
         print(f"[LANGFUSE] Ending generation for LLM call")
         # End the generation
         generation = self.active_spans[span_id]
-        choice = self._get_event_data(event, "choice", {})

-
+        choice = self._get_event_data(event, "choice", {})
         usage = self._get_event_data(event, "usage", {})
+        model = self._get_event_data(event, "model", "unknown")

-        #
-        model
-
-        # Try to get model from the choice response structure
-        if isinstance(choice, dict):
-            model = choice.get("model") or choice.get("id", "unknown")
+        # Also try to get model from the choice if not at top level
+        if model == "unknown" and isinstance(choice, dict):
+            model = choice.get("model", "unknown")

-
+        print(f"[LANGFUSE] Extracted - model: '{model}', usage: {usage}")
+
+        # Convert to Langfuse format with detailed cache information
         langfuse_usage = None
         if usage:
             prompt_tokens = usage.get("prompt_tokens", 0)
             completion_tokens = usage.get("completion_tokens", 0)
             total_tokens = usage.get("total_tokens", 0)

+            # Build detailed usage dict with cache information
             langfuse_usage = {
                 "input": prompt_tokens,
                 "output": completion_tokens,
@@ -937,9 +939,40 @@ class LangfuseTraceCollector:
                 "unit": "TOKENS",
             }

-
-
-
+            # Add cache-related fields if available (for prompt caching support)
+            if (
+                "cache_creation_input_tokens" in usage
+                and usage["cache_creation_input_tokens"]
+            ):
+                langfuse_usage["cache_creation_input_tokens"] = usage[
+                    "cache_creation_input_tokens"
+                ]
+            if "cache_read_input_tokens" in usage and usage["cache_read_input_tokens"]:
+                langfuse_usage["cache_read_input_tokens"] = usage[
+                    "cache_read_input_tokens"
+                ]
+
+            # Add detailed token breakdowns if available
+            if "prompt_tokens_details" in usage and usage["prompt_tokens_details"]:
+                details = usage["prompt_tokens_details"]
+                if "cached_tokens" in details and details["cached_tokens"]:
+                    langfuse_usage["input_cached_tokens"] = details["cached_tokens"]
+                if "audio_tokens" in details and details["audio_tokens"]:
+                    langfuse_usage["input_audio_tokens"] = details["audio_tokens"]
+
+            if (
+                "completion_tokens_details" in usage
+                and usage["completion_tokens_details"]
+            ):
+                details = usage["completion_tokens_details"]
+                if "reasoning_tokens" in details and details["reasoning_tokens"]:
+                    langfuse_usage["output_reasoning_tokens"] = details[
+                        "reasoning_tokens"
+                    ]
+                if "audio_tokens" in details and details["audio_tokens"]:
+                    langfuse_usage["output_audio_tokens"] = details["audio_tokens"]
+
+            print(f"[LANGFUSE] Usage data with cache details: {langfuse_usage}")

         # Include model information in the generation end - Langfuse will calculate costs automatically
         # Use compatibility wrapper for ending spans/generations
@@ -1260,7 +1293,10 @@ def create_composite_trace_collector(
     # Automatically add Langfuse collector if keys are configured
    if os.getenv("LANGFUSE_PUBLIC_KEY") and os.getenv("LANGFUSE_SECRET_KEY"):
        langfuse_collector = LangfuseTraceCollector(
-            httpx_client=httpx_client,
+            httpx_client=httpx_client,
+            proxy=proxy,
+            timeout=timeout,
+            include_system_prompt=include_system_prompt,
        )
        collector_list.append(langfuse_collector)

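The Langfuse-side mapping added above (plain token counters plus cache and token-detail fields) can be summarized as a standalone function. A condensed sketch; the helper name and the "total" key are illustrative, the other field names follow the hunk.

from typing import Any, Dict, Optional


def to_langfuse_usage(usage: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    if not usage:
        return None
    langfuse_usage: Dict[str, Any] = {
        "input": usage.get("prompt_tokens", 0),
        "output": usage.get("completion_tokens", 0),
        "total": usage.get("total_tokens", 0),
        "unit": "TOKENS",
    }
    # Prompt-caching counters pass through under their original names.
    for key in ("cache_creation_input_tokens", "cache_read_input_tokens"):
        if usage.get(key):
            langfuse_usage[key] = usage[key]
    # Detailed breakdowns are flattened into input_*/output_* fields.
    prompt_details = usage.get("prompt_tokens_details") or {}
    if prompt_details.get("cached_tokens"):
        langfuse_usage["input_cached_tokens"] = prompt_details["cached_tokens"]
    if prompt_details.get("audio_tokens"):
        langfuse_usage["input_audio_tokens"] = prompt_details["audio_tokens"]
    completion_details = usage.get("completion_tokens_details") or {}
    if completion_details.get("reasoning_tokens"):
        langfuse_usage["output_reasoning_tokens"] = completion_details["reasoning_tokens"]
    if completion_details.get("audio_tokens"):
        langfuse_usage["output_audio_tokens"] = completion_details["audio_tokens"]
    return langfuse_usage


print(to_langfuse_usage({
    "prompt_tokens": 1200,
    "completion_tokens": 80,
    "total_tokens": 1280,
    "cache_read_input_tokens": 900,
    "prompt_tokens_details": {"cached_tokens": 900},
}))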
jaf/core/types.py
CHANGED
@@ -1009,6 +1009,38 @@ class RetryEvent:
     )


+@dataclass(frozen=True)
+class FallbackEventData:
+    """Data for model fallback events."""
+
+    from_model: str  # Model that failed
+    to_model: str  # Fallback model being tried
+    reason: str  # Reason for fallback (e.g., "Content Policy Violation", "Context Window Exceeded", "Rate Limit")
+    fallback_type: Literal["general", "content_policy", "context_window"]  # Type of fallback
+    attempt: int  # Which fallback attempt this is (1-indexed)
+    trace_id: TraceId
+    run_id: RunId
+    error_details: Optional[Dict[str, Any]] = None  # Additional error context
+
+
+@dataclass(frozen=True)
+class FallbackEvent:
+    """Event emitted when a model fallback occurs."""
+
+    type: Literal["fallback"] = "fallback"
+    data: FallbackEventData = field(
+        default_factory=lambda: FallbackEventData(
+            from_model="",
+            to_model="",
+            reason="",
+            fallback_type="general",
+            attempt=1,
+            trace_id=TraceId(""),
+            run_id=RunId(""),
+        )
+    )
+
+
 # Union type for all trace events
 TraceEvent = Union[
     RunStartEvent,
@@ -1024,6 +1056,7 @@ TraceEvent = Union[
     HandoffEvent,
     RunEndEvent,
     RetryEvent,
+    FallbackEvent,
 ]


@@ -1096,7 +1129,9 @@ class RunConfig(Generic[Ctx]):
     agent_registry: Dict[str, Agent[Ctx, Any]]
     model_provider: ModelProvider[Ctx]
     max_turns: Optional[int] = 50
-    max_tokens: Optional[int] =
+    max_tokens: Optional[int] = (
+        None  # Default max_tokens for all agents (can be overridden per agent)
+    )
     model_override: Optional[str] = None
     initial_input_guardrails: Optional[List[Guardrail]] = None
     final_output_guardrails: Optional[List[Guardrail]] = None
@@ -1120,7 +1155,7 @@ class RunConfig(Generic[Ctx]):
             [List[Message], RunState[Ctx]],
             Union[List[Message], Awaitable[List[Message]]],
         ]
-    ] = None
+    ] = None
     max_empty_response_retries: int = 3  # Maximum retries when LLM returns empty response
     empty_response_retry_delay: float = (
         1.0  # Initial delay in seconds before retrying empty response (uses exponential backoff)
@@ -1129,6 +1164,14 @@ class RunConfig(Generic[Ctx]):
     prefer_streaming: Optional[bool] = (
         None  # Whether to prefer streaming responses. None (default) = use streaming if available, True = prefer streaming, False = disable streaming
     )
+    # Model fallback configuration
+    fallbacks: Optional[List[str]] = None  # List of fallback models to try if primary model fails
+    content_policy_fallbacks: Optional[List[str]] = (
+        None  # Fallback models for content policy violations
+    )
+    context_window_fallbacks: Optional[List[str]] = (
+        None  # Fallback models for context window exceeded errors
+    )


 # Regeneration types for conversation management
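The new fields are consumed through the existing RunConfig.on_event callback: the provider emits a FallbackEvent before each attempt against a fallback model (see jaf/providers/model.py below). A small duck-typed handler sketch that does not import jaf; the field names mirror FallbackEventData above, and the demo event is a stand-in built from SimpleNamespace.

from typing import Any


def on_event(event: Any) -> None:
    # React only to the new "fallback" events; ignore everything else.
    if getattr(event, "type", None) != "fallback":
        return
    data = event.data
    print(
        f"fallback #{data.attempt} ({data.fallback_type}): "
        f"{data.from_model} -> {data.to_model} because {data.reason}"
    )
    if data.error_details:
        print(f"  underlying error: {data.error_details.get('error_type')}")


if __name__ == "__main__":
    from types import SimpleNamespace

    demo = SimpleNamespace(
        type="fallback",
        data=SimpleNamespace(
            attempt=1,
            fallback_type="context_window",
            from_model="gpt-4o",
            to_model="gemini-1.5-pro",
            reason="Context Window Exceeded",
            error_details={"error_type": "ContextWindowExceededError"},
        ),
    )
    on_event(demo)

Such a handler would be passed as RunConfig(on_event=..., fallbacks=[...], content_policy_fallbacks=[...], context_window_fallbacks=[...]) alongside the usual agent registry and model provider fields.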
jaf/providers/model.py
CHANGED
@@ -30,6 +30,8 @@ from ..core.types import (
     get_text_content,
     RetryEvent,
     RetryEventData,
+    FallbackEvent,
+    FallbackEventData,
 )
 from ..core.proxy import ProxyConfig
 from ..utils.document_processor import (
@@ -113,6 +115,55 @@ async def _is_vision_model(model: str, base_url: str) -> bool:
     return is_known_vision_model


+def _classify_error_for_fallback(e: Exception) -> tuple[str, str]:
+    """
+    Classify an error to determine the fallback type and reason.
+
+    Args:
+        e: Exception from model call
+
+    Returns:
+        Tuple of (fallback_type, reason)
+    """
+    error_message = str(e).lower()
+    error_type = type(e).__name__
+
+    # Check for content policy violations
+    if (
+        "content" in error_message
+        and ("policy" in error_message or "filter" in error_message)
+        or "contentpolicyviolation" in error_type.lower()
+        or "content_filter" in error_message
+        or "safety" in error_message
+    ):
+        return ("content_policy", "Content Policy Violation")
+
+    # Check for context window exceeded
+    if (
+        "context" in error_message
+        and "window" in error_message
+        or "too long" in error_message
+        or "maximum context" in error_message
+        or "contextwindowexceeded" in error_type.lower()
+        or "prompt is too long" in error_message
+        or "tokens" in error_message
+        and "limit" in error_message
+    ):
+        return ("context_window", "Context Window Exceeded")
+
+    # Default to general fallback
+    if hasattr(e, "status_code"):
+        status_code = e.status_code
+        if status_code == 429:
+            return ("general", f"HTTP {status_code} - Rate Limit")
+        elif 500 <= status_code < 600:
+            return ("general", f"HTTP {status_code} - Server Error")
+        else:
+            return ("general", f"HTTP {status_code}")
+
+    return ("general", error_type)
+
+
 async def _retry_with_events(
     operation_func,
     state: RunState,
@@ -259,10 +310,10 @@ def make_litellm_provider(
         async def get_completion(
             self, state: RunState[Ctx], agent: Agent[Ctx, Any], config: RunConfig[Ctx]
         ) -> Dict[str, Any]:
-            """Get completion from the model."""
+            """Get completion from the model with fallback support."""

-            # Determine model to use
-
+            # Determine initial model to use
+            primary_model = config.model_override or (
                 agent.model_config.name if agent.model_config else "gpt-4o"
             )

@@ -277,10 +328,10 @@ def make_litellm_provider(
             )

             if has_image_content:
-                supports_vision = await _is_vision_model(
+                supports_vision = await _is_vision_model(primary_model, base_url)
                 if not supports_vision:
                     raise ValueError(
-                        f"Model {
+                        f"Model {primary_model} does not support vision capabilities. "
                         f"Please use a vision-capable model like gpt-4o, claude-3-5-sonnet, or gemini-1.5-pro."
                     )

@@ -322,39 +373,123 @@ def make_litellm_provider(
                 last_message.role == ContentRole.TOOL or last_message.role == "tool"
             )

-            #
-
-
-
-
-
-
-
-
-
-            max_tokens =
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Helper function to make API call with a specific model
+            async def _make_completion_call(model_name: str) -> Dict[str, Any]:
+                # Prepare request parameters
+                request_params = {"model": model_name, "messages": messages, "stream": False}
+
+                # Add optional parameters
+                if agent.model_config:
+                    if agent.model_config.temperature is not None:
+                        request_params["temperature"] = agent.model_config.temperature
+                    # Use agent's max_tokens if set, otherwise fall back to config's max_tokens
+                    max_tokens = agent.model_config.max_tokens
+                    if max_tokens is None:
+                        max_tokens = config.max_tokens
+                    if max_tokens is not None:
+                        request_params["max_tokens"] = max_tokens
+                elif config.max_tokens is not None:
+                    # No model_config but config has max_tokens
+                    request_params["max_tokens"] = config.max_tokens
+
+                if tools:
+                    request_params["tools"] = tools
+                    # Always set tool_choice to auto when tools are available
+                    request_params["tool_choice"] = "auto"
+
+                if agent.output_codec:
+                    request_params["response_format"] = {"type": "json_object"}
+
+                # Make the API call with retry handling
+                async def _api_call():
+                    return await self.client.chat.completions.create(**request_params)
+
+                # Use retry wrapper to track retries in Langfuse
+                return await _retry_with_events(
+                    _api_call,
+                    state,
+                    config,
+                    operation_name="llm_call",
+                    max_retries=3,
+                    backoff_factor=1.0,
+                )

-            #
-
-
+            # Try primary model first
+            last_exception = None
+            current_model = primary_model
+
+            try:
+                response = await _make_completion_call(current_model)
+            except Exception as e:
+                last_exception = e
+
+                # Classify the error to determine which fallback list to use
+                fallback_type, reason = _classify_error_for_fallback(e)
+
+                # Determine which fallback list to use
+                fallback_models = []
+                if fallback_type == "content_policy" and config.content_policy_fallbacks:
+                    fallback_models = config.content_policy_fallbacks
+                elif fallback_type == "context_window" and config.context_window_fallbacks:
+                    fallback_models = config.context_window_fallbacks
+                elif config.fallbacks:
+                    fallback_models = config.fallbacks
+
+                # Try fallback models
+                if fallback_models:
+                    print(
+                        f"[JAF:FALLBACK] Primary model '{current_model}' failed with {reason}. "
+                        f"Trying {len(fallback_models)} fallback model(s)..."
+                    )

-
-
-
-
+                    for i, fallback_model in enumerate(fallback_models, 1):
+                        try:
+                            # Emit fallback event
+                            if config.on_event:
+                                fallback_event = FallbackEvent(
+                                    data=FallbackEventData(
+                                        from_model=current_model,
+                                        to_model=fallback_model,
+                                        reason=reason,
+                                        fallback_type=fallback_type,
+                                        attempt=i,
+                                        trace_id=state.trace_id,
+                                        run_id=state.run_id,
+                                        error_details={
+                                            "error_type": type(last_exception).__name__,
+                                            "error_message": str(last_exception),
+                                        },
+                                    )
+                                )
+                                config.on_event(fallback_event)
+
+                            print(
+                                f"[JAF:FALLBACK] Attempting fallback {i}/{len(fallback_models)}: {fallback_model}"
+                            )
+
+                            # Try the fallback model
+                            response = await _make_completion_call(fallback_model)
+                            current_model = fallback_model
+                            print(
+                                f"[JAF:FALLBACK] Successfully used fallback model: {fallback_model}"
+                            )
+                            break  # Success - exit the fallback loop
+
+                        except Exception as fallback_error:
+                            last_exception = fallback_error
+                            print(
+                                f"[JAF:FALLBACK] Fallback model '{fallback_model}' also failed: {fallback_error}"
+                            )
+
+                            # If this was the last fallback, re-raise
+                            if i == len(fallback_models):
+                                print(
+                                    f"[JAF:FALLBACK] All fallback models exhausted. Raising last exception."
+                                )
+                                raise
+                else:
+                    # No fallbacks configured, re-raise original exception
+                    raise

             # Return in the expected format that the engine expects
             choice = response.choices[0]
@@ -371,7 +506,7 @@
                 for tc in choice.message.tool_calls
             ]

-            # Extract usage data
+            # Extract usage data with detailed cache information
             usage_data = None
             if response.usage:
                 usage_data = {
@@ -380,6 +515,45 @@
                     "total_tokens": response.usage.total_tokens,
                 }

+                # Extract cache-related fields if available (for prompt caching support)
+                if hasattr(response.usage, "cache_creation_input_tokens"):
+                    usage_data["cache_creation_input_tokens"] = (
+                        response.usage.cache_creation_input_tokens
+                    )
+                if hasattr(response.usage, "cache_read_input_tokens"):
+                    usage_data["cache_read_input_tokens"] = response.usage.cache_read_input_tokens
+
+                # Extract detailed token breakdowns
+                if (
+                    hasattr(response.usage, "prompt_tokens_details")
+                    and response.usage.prompt_tokens_details
+                ):
+                    details = {}
+                    if hasattr(response.usage.prompt_tokens_details, "cached_tokens"):
+                        details["cached_tokens"] = (
+                            response.usage.prompt_tokens_details.cached_tokens
+                        )
+                    if hasattr(response.usage.prompt_tokens_details, "audio_tokens"):
+                        details["audio_tokens"] = response.usage.prompt_tokens_details.audio_tokens
+                    if details:
+                        usage_data["prompt_tokens_details"] = details
+
+                if (
+                    hasattr(response.usage, "completion_tokens_details")
+                    and response.usage.completion_tokens_details
+                ):
+                    details = {}
+                    if hasattr(response.usage.completion_tokens_details, "reasoning_tokens"):
+                        details["reasoning_tokens"] = (
+                            response.usage.completion_tokens_details.reasoning_tokens
+                        )
+                    if hasattr(response.usage.completion_tokens_details, "audio_tokens"):
+                        details["audio_tokens"] = (
+                            response.usage.completion_tokens_details.audio_tokens
+                        )
+                    if details:
+                        usage_data["completion_tokens_details"] = details
+
             return {
                 "id": response.id,
                 "created": response.created,
@@ -688,7 +862,12 @@ def make_litellm_sdk_provider(

             # Use retry wrapper to track retries in Langfuse
             response = await _retry_with_events(
-                _api_call,
+                _api_call,
+                state,
+                config,
+                operation_name="llm_call",
+                max_retries=3,
+                backoff_factor=1.0,
             )

             # Return in the expected format that the engine expects
@@ -706,8 +885,16 @@
                 for tc in choice.message.tool_calls
             ]

-            # Extract usage data
-
+            # Extract usage data with detailed cache information - ALWAYS return a dict with defaults for Langfuse cost tracking
+            # Initialize with zeros as defensive default (matches AzureDirectProvider pattern)
+            usage_data = {
+                "prompt_tokens": 0,
+                "completion_tokens": 0,
+                "total_tokens": 0,
+            }
+
+            actual_model = getattr(response, "model", model_name)
+
             if response.usage:
                 usage_data = {
                     "prompt_tokens": response.usage.prompt_tokens,
@@ -715,12 +902,59 @@
                     "total_tokens": response.usage.total_tokens,
                 }

+                # Extract cache-related fields if available (for prompt caching support)
+                if hasattr(response.usage, "cache_creation_input_tokens"):
+                    usage_data["cache_creation_input_tokens"] = (
+                        response.usage.cache_creation_input_tokens
+                    )
+                if hasattr(response.usage, "cache_read_input_tokens"):
+                    usage_data["cache_read_input_tokens"] = response.usage.cache_read_input_tokens
+
+                # Extract detailed token breakdowns
+                if (
+                    hasattr(response.usage, "prompt_tokens_details")
+                    and response.usage.prompt_tokens_details
+                ):
+                    details = {}
+                    if hasattr(response.usage.prompt_tokens_details, "cached_tokens"):
+                        details["cached_tokens"] = (
+                            response.usage.prompt_tokens_details.cached_tokens
+                        )
+                    if hasattr(response.usage.prompt_tokens_details, "audio_tokens"):
+                        details["audio_tokens"] = response.usage.prompt_tokens_details.audio_tokens
+                    if details:
+                        usage_data["prompt_tokens_details"] = details
+
+                if (
+                    hasattr(response.usage, "completion_tokens_details")
+                    and response.usage.completion_tokens_details
+                ):
+                    details = {}
+                    if hasattr(response.usage.completion_tokens_details, "reasoning_tokens"):
+                        details["reasoning_tokens"] = (
+                            response.usage.completion_tokens_details.reasoning_tokens
+                        )
+                    if hasattr(response.usage.completion_tokens_details, "audio_tokens"):
+                        details["audio_tokens"] = (
+                            response.usage.completion_tokens_details.audio_tokens
+                        )
+                    if details:
+                        usage_data["completion_tokens_details"] = details
+
+            message_content = {
+                "content": choice.message.content,
+                "tool_calls": tool_calls,
+                # CRITICAL: Embed usage and model here so trace collector can find them
+                "_usage": usage_data,
+                "_model": actual_model,
+            }
+
             return {
                 "id": response.id,
                 "created": response.created,
-                "model":
+                "model": actual_model,
                 "system_fingerprint": getattr(response, "system_fingerprint", None),
-                "message":
+                "message": message_content,
                 "usage": usage_data,
                 "prompt": messages,
             }
@@ -769,6 +1003,7 @@ def make_litellm_sdk_provider(
                 "model": model_name,
                 "messages": messages,
                 "stream": True,
+                "stream_options": {"include_usage": True},  # Request usage data in streaming
                 **self.litellm_kwargs,
             }

@@ -804,14 +1039,30 @@
             # Stream using litellm
             stream = await litellm.acompletion(**request_params)

+            accumulated_usage: Optional[Dict[str, int]] = None
+            response_model: Optional[str] = None
+
             async for chunk in stream:
                 try:
                     # Best-effort extraction of raw for debugging
                     try:
                         raw_obj = chunk.model_dump() if hasattr(chunk, "model_dump") else None
-
+
+                        # Capture usage from chunk if present
+                        if raw_obj and "usage" in raw_obj and raw_obj["usage"]:
+                            accumulated_usage = raw_obj["usage"]
+
+                        # Capture model from chunk if present
+                        if raw_obj and "model" in raw_obj and raw_obj["model"]:
+                            response_model = raw_obj["model"]
+
+                    except Exception as e:
                         raw_obj = None

+                    if raw_obj and "usage" in raw_obj and raw_obj["usage"]:
+                        # Yield this chunk so engine.py can capture usage from raw
+                        yield CompletionStreamChunk(delta="", raw=raw_obj)
+
                     choice = None
                     if getattr(chunk, "choices", None):
                         choice = chunk.choices[0]
@@ -826,6 +1077,12 @@
                     if delta is not None:
                         content_delta = getattr(delta, "content", None)
                         if content_delta:
+                            # Include accumulated usage and model in raw_obj for engine
+                            if raw_obj and (accumulated_usage or response_model):
+                                if accumulated_usage:
+                                    raw_obj["usage"] = accumulated_usage
+                                if response_model:
+                                    raw_obj["model"] = response_model
                             yield CompletionStreamChunk(delta=content_delta, raw=raw_obj)

                     # Tool call deltas
@@ -841,6 +1098,13 @@
                             getattr(fn, "arguments", None) if fn is not None else None
                         )

+                        # Include accumulated usage and model in raw_obj
+                        if raw_obj and (accumulated_usage or response_model):
+                            if accumulated_usage:
+                                raw_obj["usage"] = accumulated_usage
+                            if response_model:
+                                raw_obj["model"] = response_model
+
                         yield CompletionStreamChunk(
                             tool_call_delta=ToolCallDelta(
                                 index=idx,
@@ -857,6 +1121,12 @@

                     # Completion ended
                     if finish_reason:
+                        # Include accumulated usage and model in final chunk
+                        if raw_obj and (accumulated_usage or response_model):
+                            if accumulated_usage:
+                                raw_obj["usage"] = accumulated_usage
+                            if response_model:
+                                raw_obj["model"] = response_model
                         yield CompletionStreamChunk(
                             is_done=True, finish_reason=finish_reason, raw=raw_obj
                         )
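Condensed, the control flow this file now implements is: classify the failure, pick the matching fallback list (error-specific first, then the general one), walk it in order, and re-raise once exhausted. Below is a standalone sketch of that flow under those assumptions; the names and the toy classifier are illustrative, not jaf's API.

import asyncio
from typing import Awaitable, Callable, Dict, List, Optional


def classify(e: Exception) -> str:
    # Toy stand-in for the heuristic string matching in _classify_error_for_fallback.
    msg = str(e).lower()
    if "policy" in msg or "content_filter" in msg:
        return "content_policy"
    if "context" in msg or "too long" in msg:
        return "context_window"
    return "general"


async def complete_with_fallbacks(
    call: Callable[[str], Awaitable[str]],
    primary: str,
    fallback_lists: Dict[str, Optional[List[str]]],
) -> str:
    try:
        return await call(primary)
    except Exception as e:
        kind = classify(e)
        # Error-specific list first, then the general list, mirroring the diff.
        models = fallback_lists.get(kind) or fallback_lists.get("general") or []
        if not models:
            raise
        last = e
        for candidate in models:
            try:
                return await call(candidate)
            except Exception as err:
                last = err
        # All fallbacks exhausted: surface the last failure.
        raise last


async def main() -> None:
    async def flaky_call(model: str) -> str:
        if model == "gpt-4o":
            raise RuntimeError("prompt is too long for context window")
        return f"ok from {model}"

    result = await complete_with_fallbacks(
        flaky_call,
        "gpt-4o",
        {"context_window": ["claude-3-5-sonnet"], "general": ["gpt-4o-mini"]},
    )
    print(result)


asyncio.run(main())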
jaf/server/server.py
CHANGED
@@ -220,7 +220,7 @@ def _convert_core_message_to_http(core_msg: Message) -> HttpMessage:
         content=content,
         attachments=attachments,
         tool_call_id=core_msg.tool_call_id,
-        tool_calls=core_msg.tool_calls,
+        tool_calls=[asdict(tc) for tc in core_msg.tool_calls] if core_msg.tool_calls else None,
     )

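The server change converts dataclass tool calls into plain dicts before they reach the HTTP response model, which keeps them JSON-serializable. A tiny illustration of dataclasses.asdict on nested frozen dataclasses; this ToolCall shape is illustrative, not jaf's exact definition.

from dataclasses import asdict, dataclass


@dataclass(frozen=True)
class FunctionCall:
    name: str
    arguments: str


@dataclass(frozen=True)
class ToolCall:
    id: str
    type: str
    function: FunctionCall


tool_calls = [
    ToolCall(id="call_1", type="function", function=FunctionCall("get_weather", '{"city": "Paris"}'))
]
# asdict() recurses into nested dataclasses, yielding JSON-serializable dicts.
as_dicts = [asdict(tc) for tc in tool_calls] if tool_calls else None
print(as_dicts)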
{jaf_py-2.6.2.dist-info → jaf_py-2.6.4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: jaf-py
-Version: 2.6.
+Version: 2.6.4
 Summary: A purely functional agent framework with immutable state and composable tools - Python implementation
 Author: JAF Contributors
 Maintainer: JAF Contributors
@@ -82,7 +82,7 @@ Dynamic: license-file

 <!--  -->

-[](https://github.com/xynehq/jaf-py)
 [](https://www.python.org/)
 [](https://xynehq.github.io/jaf-py/)
{jaf_py-2.6.2.dist-info → jaf_py-2.6.4.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-jaf/__init__.py,sha256=
+jaf/__init__.py,sha256=ieEZNHk68b5MjZ39t9BhFzK19GqLV6w2TnWH8cwKPG0,8652
 jaf/cli.py,sha256=EDMMA5uX0e3TUIedLdyP3p4Qy-aXADvpht3VgJPJagU,8299
 jaf/exceptions.py,sha256=FdLIw7bdCNtBYfqRyJBkRT4Z1vWuvkzrMqFiMAzjL8Y,9158
 jaf/a2a/__init__.py,sha256=r4W-WHZNjoxR8EQ0x41_rY3fl12OH5qcSn0KycXaKKU,7752
@@ -43,7 +43,7 @@ jaf/core/agent_tool.py,sha256=gZje8_gZSaWCecySg2ZBK07RcD8bc2hxHsR4z87oKJE,12075
 jaf/core/analytics.py,sha256=ypdhllyOThXZB-TY_eR1t1n2qrnAVN7Ljb8PaOtJft0,23267
 jaf/core/checkpoint.py,sha256=O7mfi7gFOAUgJ3zHzgJsr11uzn-BU-Vj1iKyKjcirMk,8398
 jaf/core/composition.py,sha256=Tj0-FRTVWygmAfsBLld7pnZK4nrGMMBx2YYJW_KQPoo,25393
-jaf/core/engine.py,sha256=
+jaf/core/engine.py,sha256=D_RtMWI43oSm7gK_J2kFRsJ2EJkHX4hMj0soUNXC92k,71179
 jaf/core/errors.py,sha256=iDw00o3WH0gHcenRcTj3QEbbloZVpgwnPij6mtaJJk4,5710
 jaf/core/guardrails.py,sha256=oPB7MpD3xWiCWoyaS-xQQp-glaPON7GNVrIL0h1Jefs,26931
 jaf/core/handoff.py,sha256=M7TQfd7BXuer1ZeRJ51nLsI55KifbM6faNtmA2Nsj3I,6196
@@ -56,8 +56,8 @@ jaf/core/state.py,sha256=fdWDc2DQ-o_g_8E4ibg2QM0Vad_XUique3a5iYBwGZo,9516
 jaf/core/streaming.py,sha256=5ntOtJrZVCHuGsygquyCLG2J5yuSxE6DN5OM-BrQiGw,16818
 jaf/core/tool_results.py,sha256=L9U3JDQAjAH5YR7iMpSxfVky2Nxo6FYQs4WE05RATaQ,11283
 jaf/core/tools.py,sha256=rHxzAfGVGpYk3YJKmrq3AQLW0oE3ACkiJBOwle2bLdc,15146
-jaf/core/tracing.py,sha256
-jaf/core/types.py,sha256=
+jaf/core/tracing.py,sha256=gh_oAm8T7ENv7oV6-IRt9GnW-rsmWXMlLDFwr8NfeAI,59360
+jaf/core/types.py,sha256=lJXlkL55devvzbc5efT5FdQ_LX3JcsMWA10Hy8Cd5Qs,37015
 jaf/core/workflows.py,sha256=0825AoD1QwEiGAs5IRlWHmaKrjurx6xF7oDJR6POBsg,25651
 jaf/memory/__init__.py,sha256=YfANOg5vUFSPVG7gpBE4_lYkV5X3_U6Yj9v1_QexfN0,1396
 jaf/memory/approval_storage.py,sha256=DcwtERcoIMH7B-abK9hqND3Moz4zSETsPlgJNkvqcaM,10573
@@ -75,10 +75,10 @@ jaf/policies/handoff.py,sha256=3lPegkSV_2LUf6jEZnj68_g3XUGFB_Fsj1C_6Svr2Kg,8128
 jaf/policies/validation.py,sha256=-zhB5ysH0Y4JnstHzo3I8tt-PFB9FSHBwSUALITBxw4,11016
 jaf/providers/__init__.py,sha256=PfIQkCtXb_yiTEjqs5msGv5-a6De2ujFCEaDGJEe_TQ,2100
 jaf/providers/mcp.py,sha256=fGfrlYx5g7ZX1fBUkPmAYSePKrCc4pG_HKngV_QCdRU,13148
-jaf/providers/model.py,sha256=
+jaf/providers/model.py,sha256=FCnenKOLwh5JJ8hcXy7pemJb32EO0uvoww5ZTqd4mlE,58619
 jaf/server/__init__.py,sha256=cYqdruJCJ3W1AMmmxMjAnDlj9gh3XbHhtegjq4nYRNY,391
 jaf/server/main.py,sha256=usdCRZfDP3GWQchh1o2tHd4KqTTFyQQCD9w4khd9rSo,2113
-jaf/server/server.py,sha256=
+jaf/server/server.py,sha256=ZhZ2gmY10eQNaKUlE7ecMkrwMkYkAh-QgKdUJ2q7ktM,51532
 jaf/server/types.py,sha256=MsbADzpxVLlaVh0-VfgwbDybk1ZSavN5KSpPEamDEwE,14174
 jaf/utils/__init__.py,sha256=s3rsFFqSjsgRfnXrQFhcXXUc99HVFYizlfVbbkOYQDo,1229
 jaf/utils/attachments.py,sha256=SvZxEO7aCwl97bIJH3YtEYiuhBB6YcaBCp4UkXrWc4w,13179
@@ -89,9 +89,9 @@ jaf/visualization/functional_core.py,sha256=0Xs2R8ELADKNIgokcbjuxmWwxEyCH1yXIEdG
 jaf/visualization/graphviz.py,sha256=EwWVIRv8Z7gTiO5Spvcm-z_UUQ1oWNPRgdE33ZzFwx8,11569
 jaf/visualization/imperative_shell.py,sha256=N5lWzOLMIU_iCoy3n5WCg49eec8VxV8f7JIG6_wNtVw,2506
 jaf/visualization/types.py,sha256=90G8oClsFa_APqTuMrTW6KjD0oG9I4kVur773dXNW0E,1393
-jaf_py-2.6.
-jaf_py-2.6.
-jaf_py-2.6.
-jaf_py-2.6.
-jaf_py-2.6.
-jaf_py-2.6.
+jaf_py-2.6.4.dist-info/licenses/LICENSE,sha256=LXUQBJxdyr-7C4bk9cQBwvsF_xwA-UVstDTKabpcjlI,1063
+jaf_py-2.6.4.dist-info/METADATA,sha256=By3r8jZ5EwcA_-CetPgaeG2XY28pTqNBPl54uttx-a0,27743
+jaf_py-2.6.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+jaf_py-2.6.4.dist-info/entry_points.txt,sha256=OtIJeNJpb24kgGrqRx9szGgDx1vL9ayq8uHErmu7U5w,41
+jaf_py-2.6.4.dist-info/top_level.txt,sha256=Xu1RZbGaM4_yQX7bpalo881hg7N_dybaOW282F15ruE,4
+jaf_py-2.6.4.dist-info/RECORD,,

Files without changes: {jaf_py-2.6.2.dist-info → jaf_py-2.6.4.dist-info}/WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt