ffai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. ffai/Clients/AsyncFFLiteLLMClient.py +141 -0
  2. ffai/Clients/BaseLiteLLMClient.py +345 -0
  3. ffai/Clients/FFLiteLLMClient.py +174 -0
  4. ffai/Clients/FFMistralSmall.py +385 -0
  5. ffai/Clients/__init__.py +13 -0
  6. ffai/Clients/model_defaults.py +74 -0
  7. ffai/ConversationHistory.py +4 -0
  8. ffai/FFAI.py +868 -0
  9. ffai/FFAIClientBase.py +4 -0
  10. ffai/OrderedPromptHistory.py +4 -0
  11. ffai/__init__.py +57 -0
  12. ffai/agent/__init__.py +16 -0
  13. ffai/agent/agent_loop.py +266 -0
  14. ffai/agent/agent_result.py +121 -0
  15. ffai/agent/response_validator.py +201 -0
  16. ffai/config.py +293 -0
  17. ffai/core/__init__.py +71 -0
  18. ffai/core/async_client_base.py +52 -0
  19. ffai/core/async_executor.py +257 -0
  20. ffai/core/client_base.py +245 -0
  21. ffai/core/condition_evaluator.py +763 -0
  22. ffai/core/conversation_manager.py +134 -0
  23. ffai/core/execution_result.py +43 -0
  24. ffai/core/execution_state.py +47 -0
  25. ffai/core/graph.py +273 -0
  26. ffai/core/graph_execution_helpers.py +207 -0
  27. ffai/core/history/__init__.py +17 -0
  28. ffai/core/history/conversation.py +64 -0
  29. ffai/core/history/ordered.py +347 -0
  30. ffai/core/history/permanent.py +81 -0
  31. ffai/core/history/recorder.py +92 -0
  32. ffai/core/history_exporter.py +315 -0
  33. ffai/core/prompt_builder.py +154 -0
  34. ffai/core/prompt_node.py +66 -0
  35. ffai/core/prompt_utils.py +104 -0
  36. ffai/core/response_context.py +124 -0
  37. ffai/core/response_executor.py +309 -0
  38. ffai/core/response_options.py +82 -0
  39. ffai/core/response_result.py +47 -0
  40. ffai/core/response_utils.py +93 -0
  41. ffai/core/structured_output.py +242 -0
  42. ffai/core/types.py +62 -0
  43. ffai/core/usage.py +32 -0
  44. ffai/observability/__init__.py +11 -0
  45. ffai/observability/log_context.py +78 -0
  46. ffai/observability/telemetry.py +190 -0
  47. ffai/py.typed +0 -0
  48. ffai/rag/__init__.py +74 -0
  49. ffai/rag/_async.py +34 -0
  50. ffai/rag/client_adapter.py +50 -0
  51. ffai/rag/embed.py +272 -0
  52. ffai/rag/format.py +49 -0
  53. ffai/rag/indexing/__init__.py +5 -0
  54. ffai/rag/indexing/bm25.py +283 -0
  55. ffai/rag/indexing/contextual.py +169 -0
  56. ffai/rag/indexing/deduplication.py +154 -0
  57. ffai/rag/indexing/hierarchical.py +272 -0
  58. ffai/rag/prompts.py +11 -0
  59. ffai/rag/rag.py +546 -0
  60. ffai/rag/search/__init__.py +21 -0
  61. ffai/rag/search/hybrid.py +265 -0
  62. ffai/rag/search/query_expansion.py +165 -0
  63. ffai/rag/search/rerankers.py +255 -0
  64. ffai/rag/splitters/__init__.py +21 -0
  65. ffai/rag/splitters/base.py +114 -0
  66. ffai/rag/splitters/character.py +103 -0
  67. ffai/rag/splitters/code.py +351 -0
  68. ffai/rag/splitters/factory.py +129 -0
  69. ffai/rag/splitters/hierarchical.py +221 -0
  70. ffai/rag/splitters/markdown.py +320 -0
  71. ffai/rag/splitters/recursive.py +223 -0
  72. ffai/rag/store.py +218 -0
  73. ffai/rag/types.py +68 -0
  74. ffai/retry_utils.py +245 -0
  75. ffai/tools/__init__.py +12 -0
  76. ffai/tools/tool_registry.py +294 -0
  77. ffai-0.1.0.dist-info/METADATA +742 -0
  78. ffai-0.1.0.dist-info/RECORD +81 -0
  79. ffai-0.1.0.dist-info/WHEEL +5 -0
  80. ffai-0.1.0.dist-info/licenses/LICENSE +21 -0
  81. ffai-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,141 @@
1
+ # Copyright (c) 2025 Antonio Quinonez / Far Finer LLC
2
+ # SPDX-License-Identifier: MIT
3
+ # Contact: antquinonez@farfiner.com
4
+
5
+ """Async LiteLLM-backed AI client implementing AsyncFFAIClientBase contract.
6
+
7
+ Mirrors ``FFLiteLLMClient`` but uses ``litellm.acompletion()`` for async
8
+ I/O. Shares all non-I/O logic via ``BaseLiteLLMClient``.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import copy
14
+ import logging
15
+ from typing import Any
16
+
17
+ from litellm import acompletion
18
+
19
+ from ..core.async_client_base import AsyncFFAIClientBase
20
+ from ..retry_utils import get_configured_retry_decorator
21
+ from .BaseLiteLLMClient import BaseLiteLLMClient
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class AsyncFFLiteLLMClient(BaseLiteLLMClient, AsyncFFAIClientBase):
    """Async LiteLLM-backed AI client implementing AsyncFFAIClientBase.

    Key features:
    - Internal conversation history management
    - Clone pattern for parallel execution
    - Model string routing (e.g., "azure/mistral-small-2503")
    - Retry and fallback support

    Args:
        model_string: LiteLLM model identifier.
        config: Optional configuration dictionary.
        api_key: API key (overrides env var).
        api_base: API base URL (overrides env var).
        api_version: API version (overrides env var).
        system_instructions: System prompt.
        temperature: Sampling temperature (0-2).
        max_tokens: Maximum tokens to generate.
        fallbacks: List of fallback model strings.
        retry_config: Retry configuration.

    """

    async def generate_response(
        self,
        prompt: str,
        model: str | None = None,
        system_instructions: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        **kwargs: Any,
    ) -> str:
        """Generate a response from the model asynchronously.

        Falls back to configured fallback models when the primary call fails.
        Whichever model answers, both the user prompt and the assistant reply
        are appended to the conversation history.

        Args:
            prompt: User's input text.
            model: Model identifier override (preserves provider prefix if set).
            system_instructions: System prompt override.
            temperature: Sampling temperature override.
            max_tokens: Maximum tokens to generate override.
            **kwargs: Additional parameters forwarded to ``litellm.acompletion()``.

        Returns:
            The model's response text.

        Raises:
            ValueError: If the prompt is empty.
            RuntimeError: If all models (primary + fallbacks) fail.

        """
        # Parameter validation happens outside the ``try`` so that an empty
        # prompt is never routed to the fallback models.
        api_params, model_string = self._prepare_generate_params(
            prompt, model, system_instructions, temperature, max_tokens, **kwargs
        )

        logger.debug(
            f"Calling LiteLLM async with model={model_string}, temperature={api_params.get('temperature')}"
        )

        try:
            with self._trace_llm_call(model_string):
                return await self._call_primary(api_params, model_string, prompt)
        except Exception as e:
            if self._fallbacks:
                logger.warning(f"Primary model {model_string} failed, trying fallbacks")
                # Hand the prompt over so the fallback path can record the
                # user turn; the failed primary call recorded nothing.
                return await self._try_fallbacks(api_params, str(e), prompt)
            raise

    @get_configured_retry_decorator()
    async def _call_primary(
        self, api_params: dict[str, Any], model_string: str, prompt: str
    ) -> str:
        """Run one ``litellm.acompletion()`` call and record its result.

        The method is wrapped by the decorator returned from
        ``get_configured_retry_decorator()``.

        Args:
            api_params: Keyword arguments for ``litellm.acompletion()``.
            model_string: Model identifier used for usage logging.
            prompt: Original user prompt, stored in the history on success.

        Returns:
            The assistant response text.

        """
        response = await acompletion(**api_params)
        return self._record_response(prompt, response, model_string)

    async def _try_fallbacks(
        self,
        original_params: dict[str, Any],
        original_error: str,
        prompt: str | None = None,
    ) -> str:
        """Try each configured fallback model once, in order.

        Args:
            original_params: Parameters of the failed primary call; reused for
                every fallback with only ``model`` replaced.
            original_error: Text of the primary failure, included in the final
                error message.
            prompt: User prompt to store in the history together with the
                reply. When ``None`` only the assistant reply is stored
                (the behaviour of the original two-argument call).

        Returns:
            The response text of the first fallback model that succeeds.

        Raises:
            RuntimeError: If every fallback model fails.

        """
        for fallback_model in self._fallbacks:
            try:
                logger.info(f"Trying fallback model: {fallback_model}")
                # The shallow copy keeps messages and any api_key / api_base /
                # api_version entries of the primary call.
                # NOTE(review): confirm reusing those credentials is intended
                # when a fallback targets a different provider.
                params = original_params.copy()
                params["model"] = fallback_model
                response = await acompletion(**params)
                if prompt is None:
                    return self._record_fallback_response(response, fallback_model)
                # Record user + assistant turns (and tool calls) exactly as the
                # primary path does, so the history stays well-formed.
                assistant_response = self._record_response(prompt, response, fallback_model)
                logger.info(f"Fallback model {fallback_model} succeeded")
                return assistant_response
            except Exception as e:
                logger.warning(f"Fallback model {fallback_model} failed: {e}")
                continue

        raise RuntimeError(f"All models failed. Primary error: {original_error}")

    async def clone(self) -> AsyncFFLiteLLMClient:
        """Create a new client with the same configuration and reset usage.

        The configuration dict and extra keyword arguments are deep-copied;
        the fallback list and retry configuration are shallow-copied. The
        clone is built through the constructor, which starts with an empty
        conversation history.

        Returns:
            A new ``AsyncFFLiteLLMClient`` with identical configuration.

        """
        logger.debug(f"Cloning async client with model_string={self._model_string}")
        cloned = AsyncFFLiteLLMClient(
            model_string=self._model_string,
            config=copy.deepcopy(self._config),
            api_key=self.api_key,
            api_base=self.api_base,
            api_version=self.api_version,
            system_instructions=self.system_instructions,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            fallbacks=copy.copy(self._fallbacks) if self._fallbacks else None,
            retry_config=copy.copy(self._retry_config),
            **copy.deepcopy(self._extra_kwargs),
        )
        cloned._reset_usage()
        return cloned
@@ -0,0 +1,345 @@
1
+ # Copyright (c) 2025 Antonio Quinonez / Far Finer LLC
2
+ # SPDX-License-Identifier: MIT
3
+ # Contact: antquinonez@farfiner.com
4
+
5
+ """Shared base for sync and async LiteLLM-backed AI clients.
6
+
7
+ Contains all non-I/O logic: settings resolution, env var lookup, message
8
+ building, usage extraction, tool call serialization, and conversation
9
+ history management. Subclasses provide sync/async ``completion()`` calls.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import os
16
+ from typing import Any
17
+
18
+ import litellm
19
+
20
+ from ..core.usage import TokenUsage
21
+ from .model_defaults import get_model_defaults
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class BaseLiteLLMClient:
    """Mixin providing shared logic for LiteLLM-backed clients.

    Subclasses must:
    - Inherit from this class *and* ``FFAIClientBase`` (or its async variant)
    - Implement ``generate_response()``, ``_call_primary()``,
      ``_try_fallbacks()``, and ``clone()``

    """

    model: str
    system_instructions: str
    conversation_history: list[dict[str, Any]]
    _model_string: str
    _config: dict[str, Any]
    _fallbacks: list[str]
    _retry_config: dict[str, Any]
    _extra_kwargs: dict[str, Any]
    api_key: str | None
    api_base: str | None
    api_version: str | None
    temperature: float
    max_tokens: int

    def __init__(
        self,
        model_string: str,
        config: dict[str, Any] | None = None,
        *,
        api_key: str | None = None,
        api_base: str | None = None,
        api_version: str | None = None,
        system_instructions: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        fallbacks: list[str] | None = None,
        retry_config: dict[str, Any] | None = None,
        **kwargs: Any,
    ):
        """Store the configuration and resolve the effective settings.

        Args:
            model_string: LiteLLM model identifier, optionally prefixed with
                ``provider/``.
            config: Optional configuration dictionary consulted when an
                explicit argument is not given.
            api_key: API key; falls back to ``config`` and then env vars.
            api_base: API base URL; falls back to ``config`` and then env vars.
            api_version: API version; falls back to ``config`` and then env vars.
            system_instructions: System prompt.
            temperature: Sampling temperature.
            max_tokens: Maximum tokens to generate.
            fallbacks: Fallback model strings.
            retry_config: Retry configuration; when omitted it is read from the
                application config, defaulting to ``{"max_attempts": 3}``.
            **kwargs: Extra parameters merged into every completion call.

        """
        self._model_string = model_string
        self._config = config or {}
        self._fallbacks = fallbacks or []

        # Bare model name: everything after the first "/" (the provider prefix).
        self.model = model_string.split("/", 1)[-1] if "/" in model_string else model_string

        if retry_config is None:
            # Best effort: a missing or broken application config must not
            # prevent the client from being constructed.
            try:
                from ..config import get_config

                app_config = get_config()
                retry_settings = getattr(app_config, "retry", None)
                if retry_settings:
                    retry_config = {
                        "max_attempts": getattr(retry_settings, "max_attempts", 3),
                    }
            except Exception as e:
                logger.debug(f"Could not load retry config: {e}")

        self._retry_config = retry_config or {"max_attempts": 3}

        self._resolve_settings(
            api_key=api_key,
            api_base=api_base,
            api_version=api_version,
            system_instructions=system_instructions,
            temperature=temperature,
            max_tokens=max_tokens,
            **kwargs,
        )

        self._configure_litellm_retry()

        self.conversation_history: list[dict[str, Any]] = []
        logger.info(f"Initialized {self.__class__.__name__} with model_string={model_string}")

        # Continue the cooperative __init__ chain for the sibling base class.
        super().__init__()

    def _resolve_settings(
        self,
        api_key: str | None,
        api_base: str | None,
        api_version: str | None,
        system_instructions: str | None,
        temperature: float | None,
        max_tokens: int | None,
        **kwargs: Any,
    ) -> None:
        """Resolve each setting from argument, config dict, env var or default.

        Credentials and ``system_instructions`` use the first truthy source;
        ``temperature`` and ``max_tokens`` only fall through when the argument
        is ``None``, so an explicit ``0`` is respected. Remaining keyword
        arguments are stored in ``_extra_kwargs``.
        """
        defaults = get_model_defaults(self._model_string)

        self.api_key = api_key or self._config.get("api_key") or self._get_env("API_KEY")
        self.api_base = api_base or self._config.get("api_base") or self._get_env("API_BASE")
        self.api_version = (
            api_version or self._config.get("api_version") or self._get_env("API_VERSION")
        )
        self.system_instructions = (
            system_instructions
            or self._config.get("system_instructions")
            or defaults.get("system_instructions", "You are a helpful assistant.")
        )
        self.temperature = (
            temperature
            if temperature is not None
            else self._config.get("temperature", defaults.get("temperature", 0.7))
        )
        self.max_tokens = (
            max_tokens
            if max_tokens is not None
            else self._config.get("max_tokens", defaults.get("max_tokens", 4096))
        )

        self._extra_kwargs = kwargs

    def _configure_litellm_retry(self) -> None:
        """Set module-level LiteLLM options used by this client.

        Sets ``litellm.num_retries`` to 0, suppresses LiteLLM debug info and
        raises the ``LiteLLM`` logger level to WARNING. These are process-wide
        settings, not per-instance ones.
        """
        litellm.num_retries = 0
        litellm.suppress_debug_info = True
        logging.getLogger("LiteLLM").setLevel(logging.WARNING)

    def _get_env(self, suffix: str) -> str | None:
        """Look up a provider-specific environment variable.

        Tries ``<PREFIX>_<suffix>`` first and ``LITELLM_<suffix>`` second,
        where the prefix is derived from the provider part of the model
        string (``openai`` when there is none).

        Args:
            suffix: Variable suffix such as ``"API_KEY"`` or ``"API_BASE"``.

        Returns:
            The first non-empty value found, or ``None``.

        """
        provider = self._model_string.split("/")[0] if "/" in self._model_string else "openai"

        prefixes = {
            # Azure variables are named per deployment, e.g. AZURE_MY_MODEL_API_KEY.
            "azure": f"AZURE_{self.model.upper().replace('-', '_')}",
            "anthropic": "ANTHROPIC",
            "mistral": "MISTRAL",
            "openai": "OPENAI",
            "gemini": "GEMINI",
            "perplexity": "PERPLEXITY",
            "nvidia_nim": "NVIDIA",
        }

        prefix = prefixes.get(provider, provider.upper())

        # One lookup per distinct name; probing "<PREFIX>_API_KEY" separately
        # would only repeat the first lookup.
        for pattern in (f"{prefix}_{suffix}", f"LITELLM_{suffix}"):
            if value := os.getenv(pattern):
                return value

        return None

    def _build_messages(self, system_instructions: str | None = None) -> list[dict[str, Any]]:
        """Build the message list: optional system prompt plus stored history.

        Args:
            system_instructions: Override for the instance system prompt; a
                falsy value selects the instance default.

        Returns:
            A new list; the stored history list itself is not modified.

        """
        messages: list[dict[str, Any]] = []

        system = system_instructions or self.system_instructions
        if system:
            messages.append({"role": "system", "content": system})

        messages.extend(self.conversation_history)

        return messages

    def _prepare_generate_params(
        self,
        prompt: str,
        model: str | None,
        system_instructions: str | None,
        temperature: float | None,
        max_tokens: int | None,
        **kwargs: Any,
    ) -> tuple[dict[str, Any], str]:
        """Validate the prompt and assemble the completion parameters.

        Args:
            prompt: User prompt; must contain non-whitespace text.
            model: Optional model override. A bare name inherits the provider
                prefix of the configured model string.
            system_instructions: Optional system prompt override.
            temperature: Optional temperature override.
            max_tokens: Optional max-token override.
            **kwargs: Per-call parameters; they take precedence over the
                constructor's extra keyword arguments.

        Returns:
            ``(api_params, model_string)`` for the completion call.

        Raises:
            ValueError: If the prompt is ``None``, empty or whitespace only.

        """
        # ``not prompt`` guards against None, which previously surfaced as an
        # AttributeError from ``.strip()`` instead of the documented ValueError.
        if not prompt or not prompt.strip():
            raise ValueError("Empty prompt provided")

        self._reset_usage()  # type: ignore[attr-defined]

        # The prompt is added to the outgoing messages only; it is stored in
        # the history once a response has been received.
        messages = self._build_messages(system_instructions)
        messages.append({"role": "user", "content": prompt})

        model_string = self._model_string
        if model:
            if "/" not in model and "/" in self._model_string:
                provider = self._model_string.split("/")[0]
                model_string = f"{provider}/{model}"
            else:
                model_string = model

        api_params: dict[str, Any] = {
            "model": model_string,
            "messages": messages,
            "temperature": (temperature if temperature is not None else self.temperature),
            "max_tokens": max_tokens or self.max_tokens,
        }

        if self.api_key:
            api_params["api_key"] = self.api_key
        if self.api_base:
            api_params["api_base"] = self.api_base
        if self.api_version:
            api_params["api_version"] = self.api_version

        api_params.update(self._extra_kwargs)
        api_params.update(kwargs)

        return api_params, model_string

    def _record_response(self, prompt: str, response: Any, model_string: str) -> str:
        """Record usage and append the user/assistant turn to the history.

        Args:
            prompt: User prompt that produced ``response``.
            response: Completion response object.
            model_string: Model identifier used for usage logging.

        Returns:
            The assistant text (``""`` when the message has no content).

        """
        self._extract_usage(response, model_string)
        message = response.choices[0].message  # type: ignore[reportAttributeAccessIssue]
        tool_calls = getattr(message, "tool_calls", None)
        assistant_response = message.content or ""

        assistant_message: dict[str, Any] = {
            "role": "assistant",
            "content": assistant_response,
        }
        if tool_calls:
            assistant_message["tool_calls"] = self._serialize_tool_calls(tool_calls)

        self.conversation_history.append({"role": "user", "content": prompt})
        self.conversation_history.append(assistant_message)

        if tool_calls:
            logger.debug("Response received with %s tool call(s)", len(tool_calls))
        else:
            logger.debug(f"Response received: {assistant_response[:100]}...")

        return assistant_response

    def _record_fallback_response(self, response: Any, model_string: str) -> str:
        """Record usage and append only the assistant reply to the history.

        Unlike ``_record_response`` this does not store the user prompt or
        any tool calls.

        Args:
            response: Completion response object from a fallback model.
            model_string: Fallback model identifier used for logging.

        Returns:
            The assistant text (``""`` when the message has no content).

        """
        self._extract_usage(response, model_string)
        assistant_response: str = response.choices[0].message.content or ""  # type: ignore[reportAttributeAccessIssue]
        self.conversation_history.append(
            {"role": "assistant", "content": assistant_response}
        )
        logger.info(f"Fallback model {model_string} succeeded")
        return assistant_response

    def _extract_usage(self, response: Any, model_string: str) -> None:
        """Store token usage and cost of ``response`` on the instance.

        Nothing is stored when the response carries no ``usage`` attribute.
        A failing cost computation is recorded as ``0.0``.
        """
        usage = getattr(response, "usage", None)
        if usage:
            raw_input = getattr(usage, "prompt_tokens", 0)
            raw_output = getattr(usage, "completion_tokens", 0)
            raw_total = getattr(usage, "total_tokens", 0)
            self._last_usage = TokenUsage(
                input_tokens=int(raw_input) if raw_input else 0,
                output_tokens=int(raw_output) if raw_output else 0,
                total_tokens=int(raw_total) if raw_total else 0,
            )
            try:
                self._last_cost_usd = litellm.completion_cost(response)
            except Exception:
                # Cost lookup is best effort; fall back to zero on any failure.
                self._last_cost_usd = 0.0
            logger.debug(
                f"Usage for {model_string}: "
                f"input={self._last_usage.input_tokens if self._last_usage else 0}, "
                f"output={self._last_usage.output_tokens if self._last_usage else 0}, "
                f"cost=${self._last_cost_usd:.6f}"
            )

    def _serialize_tool_calls(self, tool_calls: list[Any]) -> list[dict[str, Any]]:
        """Convert tool calls (dicts or objects) into plain dictionaries.

        Args:
            tool_calls: Tool calls given either as dicts or as objects with
                ``id`` and ``function`` attributes.

        Returns:
            One ``{"id", "function": {"name", "arguments"}}`` dict per call;
            missing fields default to ``""`` (``"{}"`` for arguments).

        """
        serialized: list[dict[str, Any]] = []

        for tool_call in tool_calls:
            if isinstance(tool_call, dict):
                tool_id = tool_call.get("id", "")
                function = tool_call.get("function", {})
                function_name = function.get("name", "")
                function_arguments = function.get("arguments", "{}")
            else:
                tool_id = getattr(tool_call, "id", "")
                function = getattr(tool_call, "function", None)
                function_name = getattr(function, "name", "") if function else ""
                function_arguments = getattr(function, "arguments", "{}") if function else "{}"

            serialized.append(
                {
                    "id": tool_id,
                    "function": {
                        "name": function_name,
                        "arguments": function_arguments,
                    },
                }
            )

        return serialized

    def add_tool_result(self, tool_call_id: str, content: str) -> None:
        """Append a tool result message to the conversation history.

        Args:
            tool_call_id: Provider-specific ID of the tool call being answered.
            content: The tool's return value as a string.

        """
        self.conversation_history.append(
            {"role": "tool", "tool_call_id": tool_call_id, "content": content}
        )

    def clear_conversation(self) -> None:
        """Remove all messages from the conversation history."""
        logger.debug("Clearing conversation history")
        self.conversation_history = []

    def get_conversation_history(self) -> list[dict[str, Any]]:
        """Return a shallow copy of the conversation history.

        Returns:
            List of message dictionaries.

        """
        return self.conversation_history.copy()

    def set_conversation_history(self, history: list[dict[str, Any]]) -> None:
        """Replace the conversation history with a new list of messages.

        Args:
            history: List of message dictionaries to set.

        """
        self.conversation_history = list(history)
        logger.debug(f"Set conversation history with {len(history)} messages")

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(model_string={self._model_string!r}, model={self.model!r})"
@@ -0,0 +1,174 @@
1
+ # Copyright (c) 2025 Antonio Quinonez / Far Finer LLC
2
+ # SPDX-License-Identifier: MIT
3
+ # Contact: antquinonez@farfiner.com
4
+
5
+ """Synchronous LiteLLM-backed AI client implementing FFAIClientBase contract.
6
+
7
+ Delegates all shared logic to ``BaseLiteLLMClient`` and provides only the
8
+ synchronous ``completion()`` call and ``clone()`` factory.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import copy
14
+ import logging
15
+ from typing import Any
16
+
17
+ from litellm import completion
18
+
19
+ from ..core.client_base import FFAIClientBase
20
+ from ..retry_utils import get_configured_retry_decorator
21
+ from .BaseLiteLLMClient import BaseLiteLLMClient
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class FFLiteLLMClient(BaseLiteLLMClient, FFAIClientBase):
    """LiteLLM-backed AI client implementing FFAIClientBase.

    This client wraps LiteLLM's completion() function while maintaining
    the FFAIClientBase contract for compatibility with FFAI wrapper.

    Key features:
    - Internal conversation history management
    - Clone pattern for parallel execution
    - Model string routing (e.g., "azure/mistral-small-2503")
    - Retry and fallback support

    Args:
        model_string: LiteLLM model identifier (e.g., "openai/gpt-4", "azure/my-deployment")
        config: Optional configuration dictionary
        api_key: API key (overrides env var)
        api_base: API base URL (overrides env var)
        api_version: API version (overrides env var)
        system_instructions: System prompt
        temperature: Sampling temperature (0-2)
        max_tokens: Maximum tokens to generate
        fallbacks: List of fallback model strings
        retry_config: Retry configuration

    Example:
        >>> client = FFLiteLLMClient(model_string="azure/mistral-small-2503")
        >>> response = client.generate_response("Hello!")
        >>>
        >>> # With fallbacks
        >>> client = FFLiteLLMClient(
        ...     model_string="anthropic/claude-3-opus",
        ...     fallbacks=["openai/gpt-4", "azure/gpt-4"]
        ... )

    """

    def generate_response(
        self,
        prompt: str,
        model: str | None = None,
        system_instructions: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        **kwargs: Any,
    ) -> str:
        """Generate a response from the AI model with retry and fallback logic.

        Retries are handled by ``retry_utils.get_configured_retry_decorator``
        on the inner ``_call_primary`` method. If the primary model (and all
        its retries) fail, fallback models are tried once each. Whichever
        model answers, both the user prompt and the assistant reply are
        appended to the conversation history.

        Args:
            prompt: The user prompt
            model: Override model (appends to provider prefix)
            system_instructions: Override system instructions
            temperature: Override temperature
            max_tokens: Override max tokens
            **kwargs: Additional LiteLLM parameters

        Returns:
            The generated response text

        Raises:
            ValueError: If prompt is empty
            RuntimeError: If all models (including fallbacks) fail

        """
        # Parameter validation happens outside the ``try`` so that an empty
        # prompt is never routed to the fallback models.
        api_params, model_string = self._prepare_generate_params(
            prompt, model, system_instructions, temperature, max_tokens, **kwargs
        )

        logger.debug(
            f"Calling LiteLLM with model={model_string}, temperature={api_params.get('temperature')}"
        )

        try:
            with self._trace_llm_call(model_string):
                return self._call_primary(api_params, model_string, prompt)
        except Exception as e:
            if self._fallbacks:
                logger.warning(f"Primary model {model_string} failed, trying fallbacks")
                # Hand the prompt over so the fallback path can record the
                # user turn; the failed primary call recorded nothing.
                return self._try_fallbacks(api_params, str(e), prompt)
            raise

    @get_configured_retry_decorator()
    def _call_primary(
        self, api_params: dict[str, Any], model_string: str, prompt: str
    ) -> str:
        """Execute a single LiteLLM completion call (retried by decorator).

        Args:
            api_params: Parameters dict for ``litellm.completion()``.
            model_string: Model identifier for logging.
            prompt: Original user prompt (used for history).

        Returns:
            The assistant response text.

        Raises:
            Exception: Re-raised from ``completion()`` after retries exhausted.

        """
        response = completion(**api_params)
        return self._record_response(prompt, response, model_string)

    def _try_fallbacks(
        self,
        original_params: dict[str, Any],
        original_error: str,
        prompt: str | None = None,
    ) -> str:
        """Try each configured fallback model once, in order.

        Args:
            original_params: Parameters of the failed primary call; reused for
                every fallback with only ``model`` replaced.
            original_error: Text of the primary failure, included in the final
                error message.
            prompt: User prompt to store in the history together with the
                reply. When ``None`` only the assistant reply is stored
                (the behaviour of the original two-argument call).

        Returns:
            The response text of the first fallback model that succeeds.

        Raises:
            RuntimeError: If every fallback model fails.

        """
        for fallback_model in self._fallbacks:
            try:
                logger.info(f"Trying fallback model: {fallback_model}")
                # The shallow copy keeps messages and any api_key / api_base /
                # api_version entries of the primary call.
                # NOTE(review): confirm reusing those credentials is intended
                # when a fallback targets a different provider.
                params = original_params.copy()
                params["model"] = fallback_model
                response = completion(**params)
                if prompt is None:
                    return self._record_fallback_response(response, fallback_model)
                # Record user + assistant turns (and tool calls) exactly as the
                # primary path does, so the history stays well-formed.
                assistant_response = self._record_response(prompt, response, fallback_model)
                logger.info(f"Fallback model {fallback_model} succeeded")
                return assistant_response
            except Exception as e:
                logger.warning(f"Fallback model {fallback_model} failed: {e}")
                continue

        raise RuntimeError(f"All models failed. Primary error: {original_error}")

    def clone(self) -> FFLiteLLMClient:
        """Create a fresh clone of this client with empty history.

        Used for thread-safe parallel execution where each thread
        needs an isolated client instance with the same configuration.
        The configuration dict and extra keyword arguments are deep-copied;
        the fallback list and retry configuration are shallow-copied.

        Returns:
            New FFLiteLLMClient with same config, empty history.

        """
        logger.debug(f"Cloning client with model_string={self._model_string}")
        cloned = FFLiteLLMClient(
            model_string=self._model_string,
            config=copy.deepcopy(self._config),
            api_key=self.api_key,
            api_base=self.api_base,
            api_version=self.api_version,
            system_instructions=self.system_instructions,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            fallbacks=copy.copy(self._fallbacks) if self._fallbacks else None,
            retry_config=copy.copy(self._retry_config),
            **copy.deepcopy(self._extra_kwargs),
        )
        cloned._reset_usage()
        return cloned