PyPI - ffai - Versions diffs - 0.1.0__py3-none-any.whl - Mend

ffai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81) hide show

ffai/Clients/AsyncFFLiteLLMClient.py +141 -0
ffai/Clients/BaseLiteLLMClient.py +345 -0
ffai/Clients/FFLiteLLMClient.py +174 -0
ffai/Clients/FFMistralSmall.py +385 -0
ffai/Clients/__init__.py +13 -0
ffai/Clients/model_defaults.py +74 -0
ffai/ConversationHistory.py +4 -0
ffai/FFAI.py +868 -0
ffai/FFAIClientBase.py +4 -0
ffai/OrderedPromptHistory.py +4 -0
ffai/__init__.py +57 -0
ffai/agent/__init__.py +16 -0
ffai/agent/agent_loop.py +266 -0
ffai/agent/agent_result.py +121 -0
ffai/agent/response_validator.py +201 -0
ffai/config.py +293 -0
ffai/core/__init__.py +71 -0
ffai/core/async_client_base.py +52 -0
ffai/core/async_executor.py +257 -0
ffai/core/client_base.py +245 -0
ffai/core/condition_evaluator.py +763 -0
ffai/core/conversation_manager.py +134 -0
ffai/core/execution_result.py +43 -0
ffai/core/execution_state.py +47 -0
ffai/core/graph.py +273 -0
ffai/core/graph_execution_helpers.py +207 -0
ffai/core/history/__init__.py +17 -0
ffai/core/history/conversation.py +64 -0
ffai/core/history/ordered.py +347 -0
ffai/core/history/permanent.py +81 -0
ffai/core/history/recorder.py +92 -0
ffai/core/history_exporter.py +315 -0
ffai/core/prompt_builder.py +154 -0
ffai/core/prompt_node.py +66 -0
ffai/core/prompt_utils.py +104 -0
ffai/core/response_context.py +124 -0
ffai/core/response_executor.py +309 -0
ffai/core/response_options.py +82 -0
ffai/core/response_result.py +47 -0
ffai/core/response_utils.py +93 -0
ffai/core/structured_output.py +242 -0
ffai/core/types.py +62 -0
ffai/core/usage.py +32 -0
ffai/observability/__init__.py +11 -0
ffai/observability/log_context.py +78 -0
ffai/observability/telemetry.py +190 -0
ffai/py.typed +0 -0
ffai/rag/__init__.py +74 -0
ffai/rag/_async.py +34 -0
ffai/rag/client_adapter.py +50 -0
ffai/rag/embed.py +272 -0
ffai/rag/format.py +49 -0
ffai/rag/indexing/__init__.py +5 -0
ffai/rag/indexing/bm25.py +283 -0
ffai/rag/indexing/contextual.py +169 -0
ffai/rag/indexing/deduplication.py +154 -0
ffai/rag/indexing/hierarchical.py +272 -0
ffai/rag/prompts.py +11 -0
ffai/rag/rag.py +546 -0
ffai/rag/search/__init__.py +21 -0
ffai/rag/search/hybrid.py +265 -0
ffai/rag/search/query_expansion.py +165 -0
ffai/rag/search/rerankers.py +255 -0
ffai/rag/splitters/__init__.py +21 -0
ffai/rag/splitters/base.py +114 -0
ffai/rag/splitters/character.py +103 -0
ffai/rag/splitters/code.py +351 -0
ffai/rag/splitters/factory.py +129 -0
ffai/rag/splitters/hierarchical.py +221 -0
ffai/rag/splitters/markdown.py +320 -0
ffai/rag/splitters/recursive.py +223 -0
ffai/rag/store.py +218 -0
ffai/rag/types.py +68 -0
ffai/retry_utils.py +245 -0
ffai/tools/__init__.py +12 -0
ffai/tools/tool_registry.py +294 -0
ffai-0.1.0.dist-info/METADATA +742 -0
ffai-0.1.0.dist-info/RECORD +81 -0
ffai-0.1.0.dist-info/WHEEL +5 -0
ffai-0.1.0.dist-info/licenses/LICENSE +21 -0
ffai-0.1.0.dist-info/top_level.txt +1 -0

ffai/Clients/AsyncFFLiteLLMClient.py ADDED Viewed

@@ -0,0 +1,141 @@
+# Copyright (c) 2025 Antonio Quinonez / Far Finer LLC
+# SPDX-License-Identifier: MIT
+# Contact: antquinonez@farfiner.com
+"""Async LiteLLM-backed AI client implementing AsyncFFAIClientBase contract.
+Mirrors ``FFLiteLLMClient`` but uses ``litellm.acompletion()`` for async
+I/O.  Shares all non-I/O logic via ``BaseLiteLLMClient``.
+"""
+from __future__ import annotations
+import copy
+import logging
+from typing import Any
+from litellm import acompletion
+from ..core.async_client_base import AsyncFFAIClientBase
+from ..retry_utils import get_configured_retry_decorator
+from .BaseLiteLLMClient import BaseLiteLLMClient
+logger = logging.getLogger(__name__)
+class AsyncFFLiteLLMClient(BaseLiteLLMClient, AsyncFFAIClientBase):
+    """Async LiteLLM-backed AI client implementing AsyncFFAIClientBase.
+    Key features:
+    - Internal conversation history management
+    - Clone pattern for parallel execution
+    - Model string routing (e.g., "azure/mistral-small-2503")
+    - Retry and fallback support
+    Args:
+        model_string: LiteLLM model identifier.
+        config: Optional configuration dictionary.
+        api_key: API key (overrides env var).
+        api_base: API base URL (overrides env var).
+        system_instructions: System prompt.
+        temperature: Sampling temperature (0-2).
+        max_tokens: Maximum tokens to generate.
+        fallbacks: List of fallback model strings.
+        retry_config: Retry configuration.
+    """
+    async def generate_response(
+        self,
+        prompt: str,
+        model: str | None = None,
+        system_instructions: str | None = None,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        **kwargs: Any,
+    ) -> str:
+        """Generate a response from the model asynchronously.
+        Falls back to configured fallback models when the primary call fails.
+        Args:
+            prompt: User's input text.
+            model: Model identifier override (preserves provider prefix if set).
+            system_instructions: System prompt override.
+            temperature: Sampling temperature override.
+            max_tokens: Maximum tokens to generate override.
+            **kwargs: Additional parameters forwarded to ``litellm.acompletion()``.
+        Returns:
+            The model's response text.
+        Raises:
+            ValueError: If the prompt is empty.
+            RuntimeError: If all models (primary + fallbacks) fail.
+        """
+        api_params, model_string = self._prepare_generate_params(
+            prompt, model, system_instructions, temperature, max_tokens, **kwargs
+        )
+        logger.debug(
+            f"Calling LiteLLM async with model={model_string}, temperature={api_params.get('temperature')}"
+        )
+        try:
+            with self._trace_llm_call(model_string):
+                return await self._call_primary(api_params, model_string, prompt)
+        except Exception as e:
+            if self._fallbacks:
+                logger.warning(f"Primary model {model_string} failed, trying fallbacks")
+                return await self._try_fallbacks(api_params, str(e))
+            raise
+    @get_configured_retry_decorator()
+    async def _call_primary(
+        self, api_params: dict[str, Any], model_string: str, prompt: str
+    ) -> str:
+        response = await acompletion(**api_params)
+        return self._record_response(prompt, response, model_string)
+    async def _try_fallbacks(
+        self,
+        original_params: dict[str, Any],
+        original_error: str,
+    ) -> str:
+        for fallback_model in self._fallbacks:
+            try:
+                logger.info(f"Trying fallback model: {fallback_model}")
+                params = original_params.copy()
+                params["model"] = fallback_model
+                response = await acompletion(**params)
+                return self._record_fallback_response(response, fallback_model)
+            except Exception as e:
+                logger.warning(f"Fallback model {fallback_model} failed: {e}")
+                continue
+        raise RuntimeError(f"All models failed. Primary error: {original_error}")
+    async def clone(self) -> AsyncFFLiteLLMClient:
+        """Create a deep copy of this client with reset usage and empty history.
+        Returns:
+            A new ``AsyncFFLiteLLMClient`` with identical configuration.
+        """
+        logger.debug(f"Cloning async client with model_string={self._model_string}")
+        cloned = AsyncFFLiteLLMClient(
+            model_string=self._model_string,
+            config=copy.deepcopy(self._config),
+            api_key=self.api_key,
+            api_base=self.api_base,
+            api_version=self.api_version,
+            system_instructions=self.system_instructions,
+            temperature=self.temperature,
+            max_tokens=self.max_tokens,
+            fallbacks=copy.copy(self._fallbacks) if self._fallbacks else None,
+            retry_config=copy.copy(self._retry_config),
+            **copy.deepcopy(self._extra_kwargs),
+        )
+        cloned._reset_usage()
+        return cloned

ffai/Clients/BaseLiteLLMClient.py ADDED Viewed

@@ -0,0 +1,345 @@
+# Copyright (c) 2025 Antonio Quinonez / Far Finer LLC
+# SPDX-License-Identifier: MIT
+# Contact: antquinonez@farfiner.com
+"""Shared base for sync and async LiteLLM-backed AI clients.
+Contains all non-I/O logic: settings resolution, env var lookup, message
+building, usage extraction, tool call serialization, and conversation
+history management.  Subclasses provide sync/async ``completion()`` calls.
+"""
+from __future__ import annotations
+import logging
+import os
+from typing import Any
+import litellm
+from ..core.usage import TokenUsage
+from .model_defaults import get_model_defaults
+logger = logging.getLogger(__name__)
+class BaseLiteLLMClient:
+    """Mixin providing shared logic for LiteLLM-backed clients.
+    Subclasses must:
+    - Inherit from this class *and* ``FFAIClientBase`` (or its async variant)
+    - Implement ``generate_response()``, ``_call_primary()``,
+      ``_try_fallbacks()``, and ``clone()``
+    """
+    model: str
+    system_instructions: str
+    conversation_history: list[dict[str, Any]]
+    _model_string: str
+    _config: dict[str, Any]
+    _fallbacks: list[str]
+    _retry_config: dict[str, Any]
+    _extra_kwargs: dict[str, Any]
+    api_key: str | None
+    api_base: str | None
+    api_version: str | None
+    temperature: float
+    max_tokens: int
+    def __init__(
+        self,
+        model_string: str,
+        config: dict[str, Any] | None = None,
+        *,
+        api_key: str | None = None,
+        api_base: str | None = None,
+        api_version: str | None = None,
+        system_instructions: str | None = None,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        fallbacks: list[str] | None = None,
+        retry_config: dict[str, Any] | None = None,
+        **kwargs: Any,
+    ):
+        self._model_string = model_string
+        self._config = config or {}
+        self._fallbacks = fallbacks or []
+        self.model = model_string.split("/", 1)[-1] if "/" in model_string else model_string
+        if retry_config is None:
+            try:
+                from ..config import get_config
+                app_config = get_config()
+                retry_settings = getattr(app_config, "retry", None)
+                if retry_settings:
+                    retry_config = {
+                        "max_attempts": getattr(retry_settings, "max_attempts", 3),
+                    }
+            except Exception as e:
+                logger.debug(f"Could not load retry config: {e}")
+        self._retry_config = retry_config or {"max_attempts": 3}
+        self._resolve_settings(
+            api_key=api_key,
+            api_base=api_base,
+            api_version=api_version,
+            system_instructions=system_instructions,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            **kwargs,
+        )
+        self._configure_litellm_retry()
+        self.conversation_history: list[dict[str, Any]] = []
+        logger.info(f"Initialized {self.__class__.__name__} with model_string={model_string}")
+        super().__init__()
+    def _resolve_settings(
+        self,
+        api_key: str | None,
+        api_base: str | None,
+        api_version: str | None,
+        system_instructions: str | None,
+        temperature: float | None,
+        max_tokens: int | None,
+        **kwargs: Any,
+    ) -> None:
+        defaults = get_model_defaults(self._model_string)
+        self.api_key = api_key or self._config.get("api_key") or self._get_env("API_KEY")
+        self.api_base = api_base or self._config.get("api_base") or self._get_env("API_BASE")
+        self.api_version = (
+            api_version or self._config.get("api_version") or self._get_env("API_VERSION")
+        )
+        self.system_instructions = (
+            system_instructions
+            or self._config.get("system_instructions")
+            or defaults.get("system_instructions", "You are a helpful assistant.")
+        )
+        self.temperature = (
+            temperature
+            if temperature is not None
+            else self._config.get("temperature", defaults.get("temperature", 0.7))
+        )
+        self.max_tokens = (
+            max_tokens
+            if max_tokens is not None
+            else self._config.get("max_tokens", defaults.get("max_tokens", 4096))
+        )
+        self._extra_kwargs = kwargs
+    def _configure_litellm_retry(self) -> None:
+        litellm.num_retries = 0
+        litellm.suppress_debug_info = True
+        logging.getLogger("LiteLLM").setLevel(logging.WARNING)
+    def _get_env(self, suffix: str) -> str | None:
+        provider = self._model_string.split("/")[0] if "/" in self._model_string else "openai"
+        prefixes = {
+            "azure": f"AZURE_{self.model.upper().replace('-', '_')}",
+            "anthropic": "ANTHROPIC",
+            "mistral": "MISTRAL",
+            "openai": "OPENAI",
+            "gemini": "GEMINI",
+            "perplexity": "PERPLEXITY",
+            "nvidia_nim": "NVIDIA",
+        }
+        prefix = prefixes.get(provider, provider.upper())
+        patterns = [
+            f"{prefix}_{suffix}",
+            f"{prefix}_API_KEY" if suffix == "API_KEY" else None,
+            f"LITELLM_{suffix}",
+        ]
+        for pattern in patterns:
+            if pattern and (value := os.getenv(pattern)):
+                return value
+        return None
+    def _build_messages(self, system_instructions: str | None = None) -> list[dict[str, Any]]:
+        messages: list[dict[str, Any]] = []
+        system = system_instructions or self.system_instructions
+        if system:
+            messages.append({"role": "system", "content": system})
+        messages.extend(self.conversation_history)
+        return messages
+    def _prepare_generate_params(
+        self,
+        prompt: str,
+        model: str | None,
+        system_instructions: str | None,
+        temperature: float | None,
+        max_tokens: int | None,
+        **kwargs: Any,
+    ) -> tuple[dict[str, Any], str]:
+        if not prompt.strip():
+            raise ValueError("Empty prompt provided")
+        self._reset_usage()  # type: ignore[attr-defined]
+        messages = self._build_messages(system_instructions)
+        messages.append({"role": "user", "content": prompt})
+        model_string = self._model_string
+        if model:
+            if "/" not in model and "/" in self._model_string:
+                provider = self._model_string.split("/")[0]
+                model_string = f"{provider}/{model}"
+            else:
+                model_string = model
+        api_params: dict[str, Any] = {
+            "model": model_string,
+            "messages": messages,
+            "temperature": (temperature if temperature is not None else self.temperature),
+            "max_tokens": max_tokens or self.max_tokens,
+        }
+        if self.api_key:
+            api_params["api_key"] = self.api_key
+        if self.api_base:
+            api_params["api_base"] = self.api_base
+        if self.api_version:
+            api_params["api_version"] = self.api_version
+        api_params.update(self._extra_kwargs)
+        api_params.update(kwargs)
+        return api_params, model_string
+    def _record_response(self, prompt: str, response: Any, model_string: str) -> str:
+        self._extract_usage(response, model_string)
+        message = response.choices[0].message  # type: ignore[reportAttributeAccessIssue]
+        tool_calls = getattr(message, "tool_calls", None)
+        assistant_response = message.content or ""
+        if tool_calls:
+            self.conversation_history.append({"role": "user", "content": prompt})
+            self.conversation_history.append(
+                {
+                    "role": "assistant",
+                    "content": assistant_response,
+                    "tool_calls": self._serialize_tool_calls(tool_calls),
+                }
+            )
+            logger.debug("Response received with %s tool call(s)", len(tool_calls))
+        else:
+            self.conversation_history.append({"role": "user", "content": prompt})
+            self.conversation_history.append(
+                {"role": "assistant", "content": assistant_response}
+            )
+            logger.debug(f"Response received: {assistant_response[:100]}...")
+        return assistant_response
+    def _record_fallback_response(self, response: Any, model_string: str) -> str:
+        self._extract_usage(response, model_string)
+        assistant_response: str = response.choices[0].message.content or ""  # type: ignore[reportAttributeAccessIssue]
+        self.conversation_history.append(
+            {"role": "assistant", "content": assistant_response}
+        )
+        logger.info(f"Fallback model {model_string} succeeded")
+        return assistant_response
+    def _extract_usage(self, response: Any, model_string: str) -> None:
+        usage = getattr(response, "usage", None)
+        if usage:
+            raw_input = getattr(usage, "prompt_tokens", 0)
+            raw_output = getattr(usage, "completion_tokens", 0)
+            raw_total = getattr(usage, "total_tokens", 0)
+            self._last_usage = TokenUsage(
+                input_tokens=int(raw_input) if raw_input else 0,
+                output_tokens=int(raw_output) if raw_output else 0,
+                total_tokens=int(raw_total) if raw_total else 0,
+            )
+        try:
+            self._last_cost_usd = litellm.completion_cost(response)
+        except Exception:
+            self._last_cost_usd = 0.0
+        logger.debug(
+            f"Usage for {model_string}: "
+            f"input={self._last_usage.input_tokens if self._last_usage else 0}, "
+            f"output={self._last_usage.output_tokens if self._last_usage else 0}, "
+            f"cost=${self._last_cost_usd:.6f}"
+        )
+    def _serialize_tool_calls(self, tool_calls: list[Any]) -> list[dict[str, Any]]:
+        serialized: list[dict[str, Any]] = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_id = tool_call.get("id", "")
+                function = tool_call.get("function", {})
+                function_name = function.get("name", "")
+                function_arguments = function.get("arguments", "{}")
+            else:
+                tool_id = getattr(tool_call, "id", "")
+                function = getattr(tool_call, "function", None)
+                function_name = getattr(function, "name", "") if function else ""
+                function_arguments = getattr(function, "arguments", "{}") if function else "{}"
+            serialized.append(
+                {
+                    "id": tool_id,
+                    "function": {
+                        "name": function_name,
+                        "arguments": function_arguments,
+                    },
+                }
+            )
+        return serialized
+    def add_tool_result(self, tool_call_id: str, content: str) -> None:
+        """Append a tool result message to the conversation history.
+        Args:
+            tool_call_id: Provider-specific ID of the tool call being answered.
+            content: The tool's return value as a string.
+        """
+        self.conversation_history.append(
+            {"role": "tool", "tool_call_id": tool_call_id, "content": content}
+        )
+    def clear_conversation(self) -> None:
+        """Remove all messages from the conversation history."""
+        logger.debug("Clearing conversation history")
+        self.conversation_history = []
+    def get_conversation_history(self) -> list[dict[str, Any]]:
+        """Return a shallow copy of the conversation history.
+        Returns:
+            List of message dictionaries.
+        """
+        return self.conversation_history.copy()
+    def set_conversation_history(self, history: list[dict[str, Any]]) -> None:
+        """Replace the conversation history with a new list of messages.
+        Args:
+            history: List of message dictionaries to set.
+        """
+        self.conversation_history = list(history)
+        logger.debug(f"Set conversation history with {len(history)} messages")
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(model_string={self._model_string!r}, model={self.model!r})"

ffai/Clients/FFLiteLLMClient.py ADDED Viewed

@@ -0,0 +1,174 @@
+# Copyright (c) 2025 Antonio Quinonez / Far Finer LLC
+# SPDX-License-Identifier: MIT
+# Contact: antquinonez@farfiner.com
+"""Synchronous LiteLLM-backed AI client implementing FFAIClientBase contract.
+Delegates all shared logic to ``BaseLiteLLMClient`` and provides only the
+synchronous ``completion()`` call and ``clone()`` factory.
+"""
+from __future__ import annotations
+import copy
+import logging
+from typing import Any
+from litellm import completion
+from ..core.client_base import FFAIClientBase
+from ..retry_utils import get_configured_retry_decorator
+from .BaseLiteLLMClient import BaseLiteLLMClient
+logger = logging.getLogger(__name__)
+class FFLiteLLMClient(BaseLiteLLMClient, FFAIClientBase):
+    """LiteLLM-backed AI client implementing FFAIClientBase.
+    This client wraps LiteLLM's completion() function while maintaining
+    the FFAIClientBase contract for compatibility with FFAI wrapper.
+    Key features:
+    - Internal conversation history management
+    - Clone pattern for parallel execution
+    - Model string routing (e.g., "azure/mistral-small-2503")
+    - Retry and fallback support
+    Args:
+        model_string: LiteLLM model identifier (e.g., "openai/gpt-4", "azure/my-deployment")
+        config: Optional configuration dictionary
+        api_key: API key (overrides env var)
+        api_base: API base URL (overrides env var)
+        system_instructions: System prompt
+        temperature: Sampling temperature (0-2)
+        max_tokens: Maximum tokens to generate
+        fallbacks: List of fallback model strings
+        retry_config: Retry configuration
+    Example:
+        >>> client = FFLiteLLMClient(model_string="azure/mistral-small-2503")
+        >>> response = client.generate_response("Hello!")
+        >>>
+        >>> # With fallbacks
+        >>> client = FFLiteLLMClient(
+        ...     model_string="anthropic/claude-3-opus",
+        ...     fallbacks=["openai/gpt-4", "azure/gpt-4"]
+        ... )
+    """
+    def generate_response(
+        self,
+        prompt: str,
+        model: str | None = None,
+        system_instructions: str | None = None,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        **kwargs: Any,
+    ) -> str:
+        """Generate a response from the AI model with retry and fallback logic.
+        Retries are handled by ``retry_utils.get_configured_retry_decorator``
+        on the inner ``_call_primary`` method. If the primary model (and all
+        its retries) fail, fallback models are tried once each.
+        Args:
+            prompt: The user prompt
+            model: Override model (appends to provider prefix)
+            system_instructions: Override system instructions
+            temperature: Override temperature
+            max_tokens: Override max tokens
+            **kwargs: Additional LiteLLM parameters
+        Returns:
+            The generated response text
+        Raises:
+            ValueError: If prompt is empty
+            RuntimeError: If all models (including fallbacks) fail
+        """
+        api_params, model_string = self._prepare_generate_params(
+            prompt, model, system_instructions, temperature, max_tokens, **kwargs
+        )
+        logger.debug(
+            f"Calling LiteLLM with model={model_string}, temperature={api_params.get('temperature')}"
+        )
+        try:
+            with self._trace_llm_call(model_string):
+                return self._call_primary(api_params, model_string, prompt)
+        except Exception as e:
+            if self._fallbacks:
+                logger.warning(f"Primary model {model_string} failed, trying fallbacks")
+                return self._try_fallbacks(api_params, str(e))
+            raise
+    @get_configured_retry_decorator()
+    def _call_primary(
+        self, api_params: dict[str, Any], model_string: str, prompt: str
+    ) -> str:
+        """Execute a single LiteLLM completion call (retried by decorator).
+        Args:
+            api_params: Parameters dict for ``litellm.completion()``.
+            model_string: Model identifier for logging.
+            prompt: Original user prompt (used for history).
+        Returns:
+            The assistant response text.
+        Raises:
+            Exception: Re-raised from ``completion()`` after retries exhausted.
+        """
+        response = completion(**api_params)
+        return self._record_response(prompt, response, model_string)
+    def _try_fallbacks(
+        self,
+        original_params: dict[str, Any],
+        original_error: str,
+    ) -> str:
+        """Try fallback models if primary fails."""
+        for fallback_model in self._fallbacks:
+            try:
+                logger.info(f"Trying fallback model: {fallback_model}")
+                params = original_params.copy()
+                params["model"] = fallback_model
+                response = completion(**params)
+                return self._record_fallback_response(response, fallback_model)
+            except Exception as e:
+                logger.warning(f"Fallback model {fallback_model} failed: {e}")
+                continue
+        raise RuntimeError(f"All models failed. Primary error: {original_error}")
+    def clone(self) -> FFLiteLLMClient:
+        """Create a fresh clone of this client with empty history.
+        Used for thread-safe parallel execution where each thread
+        needs an isolated client instance with the same configuration.
+        Returns:
+            New FFLiteLLMClient with same config, empty history.
+        """
+        logger.debug(f"Cloning client with model_string={self._model_string}")
+        clone = FFLiteLLMClient(
+            model_string=self._model_string,
+            config=copy.deepcopy(self._config),
+            api_key=self.api_key,
+            api_base=self.api_base,
+            api_version=self.api_version,
+            system_instructions=self.system_instructions,
+            temperature=self.temperature,
+            max_tokens=self.max_tokens,
+            fallbacks=copy.copy(self._fallbacks) if self._fallbacks else None,
+            retry_config=copy.copy(self._retry_config),
+            **copy.deepcopy(self._extra_kwargs),
+        )
+        clone._reset_usage()
+        return clone