livellm 1.6.1__tar.gz → 1.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: livellm
- Version: 1.6.1
+ Version: 1.7.1
  Summary: Python client for the LiveLLM Server
  Project-URL: Homepage, https://github.com/qalby-tech/livellm-client-py
  Project-URL: Repository, https://github.com/qalby-tech/livellm-client-py
@@ -36,6 +36,8 @@ Python client library for the LiveLLM Server - a unified proxy for AI agent, aud
  - 🔄 **Streaming** - Real-time streaming for agent and audio
  - 🛠️ **Flexible API** - Use request objects or keyword arguments
  - 📋 **Structured Output** - Get validated JSON responses with schema support (Pydantic, OutputSchema, or dict)
+ - 📏 **Context Overflow Management** - Automatic handling of large texts with truncate/recycle strategies
+ - ⏱️ **Per-Request Timeout** - Override default timeout for individual requests
  - 🎙️ **Audio services** - Text-to-speech and transcription
  - 🎤 **Real-Time Transcription** - WebSocket-based live audio transcription with bidirectional streaming
  - ⚡ **Fallback strategies** - Sequential and parallel handling
@@ -95,10 +97,10 @@ from livellm.models import Settings, ProviderKind
  # Basic
  client = LivellmClient(base_url="http://localhost:8000")

- # With timeout and pre-configured providers
+ # With default timeout and pre-configured providers
  client = LivellmClient(
  base_url="http://localhost:8000",
- timeout=30.0,
+ timeout=30.0, # Default timeout for all requests
  configs=[
  Settings(
  uid="openai",
@@ -116,6 +118,50 @@ client = LivellmClient(
  )
  ```

+ ### Per-Request Timeout Override
+
+ The timeout provided in `__init__` is the default, but you can override it for individual requests:
+
+ ```python
+ # Client with 30s default timeout
+ client = LivellmClient(base_url="http://localhost:8000", timeout=30.0)
+
+ # Uses default 30s timeout
+ response = await client.agent_run(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[TextMessage(role="user", content="Hello")]
+ )
+
+ # Override with 120s timeout for this specific request
+ response = await client.agent_run(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[TextMessage(role="user", content="Write a long essay...")],
+ timeout=120.0 # Override for this request only
+ )
+
+ # Works with streaming too
+ async for chunk in client.agent_run_stream(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[TextMessage(role="user", content="Tell me a story")],
+ timeout=300.0 # 5 minutes for streaming
+ ):
+ print(chunk.output, end="")
+
+ # Works with all methods: speak(), speak_stream(), transcribe(), etc.
+ audio = await client.speak(
+ provider_uid="openai",
+ model="tts-1",
+ text="Hello world",
+ voice="alloy",
+ mime_type=SpeakMimeType.MP3,
+ sample_rate=24000,
+ timeout=60.0
+ )
+ ```
+
  ### Supported Providers

  `OPENAI` • `GOOGLE` • `ANTHROPIC` • `GROQ` • `ELEVENLABS`
@@ -439,6 +485,73 @@ data = json.loads(full_output)
  - Type-safe responses
  - Integration with type-checked code

+ #### Context Overflow Management
+
+ Handle large texts that exceed model context windows with automatic truncation or iterative processing:
+
+ ```python
+ from livellm.models import TextMessage, ContextOverflowStrategy, OutputSchema, PropertyDef
+
+ # TRUNCATE strategy (default): Preserves beginning, middle, and end
+ # Works with both streaming and non-streaming
+ response = await client.agent_run(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[
+ TextMessage(role="system", content="Summarize the document."),
+ TextMessage(role="user", content=very_long_document)
+ ],
+ context_limit=4000, # Max tokens
+ context_overflow_strategy=ContextOverflowStrategy.TRUNCATE
+ )
+
+ # RECYCLE strategy: Iteratively processes chunks and merges results
+ # Useful for extraction tasks - processes entire document
+ # Requires output_schema for JSON merging
+ output_schema = OutputSchema(
+ title="ExtractedInfo",
+ properties={
+ "topics": PropertyDef(type="array", items={"type": "string"}),
+ "key_figures": PropertyDef(type="array", items={"type": "string"})
+ },
+ required=["topics", "key_figures"]
+ )
+
+ response = await client.agent_run(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[
+ TextMessage(role="system", content="Extract all topics and key figures."),
+ TextMessage(role="user", content=very_long_document)
+ ],
+ context_limit=3000,
+ context_overflow_strategy=ContextOverflowStrategy.RECYCLE,
+ output_schema=output_schema
+ )
+
+ # Parse the merged results
+ import json
+ result = json.loads(response.output)
+ print(f"Topics: {result['topics']}")
+ print(f"Key figures: {result['key_figures']}")
+ ```
+
+ **Strategy comparison:**
+
+ | Strategy | How it works | Best for | Streaming |
+ |----------|--------------|----------|-----------|
+ | `TRUNCATE` | Takes beginning, middle, end portions | Summarization, Q&A | ✅ Yes |
+ | `RECYCLE` | Processes chunks iteratively, merges JSON | Full document extraction | ❌ No |
+
+ **Parameters:**
+ - `context_limit` (int, default: 0) - Maximum tokens. If ≤ 0, overflow handling is disabled
+ - `context_overflow_strategy` (ContextOverflowStrategy, default: TRUNCATE) - Strategy to use
+
+ **Notes:**
+ - System prompts are always preserved (never truncated)
+ - Token counting includes a 20% safety buffer
+ - RECYCLE requires `output_schema` for JSON merging
+
  ### Audio Services

  #### Text-to-Speech
@@ -711,20 +824,22 @@ response = await client.ping()

  ### Client Methods

+ All methods accept an optional `timeout` parameter to override the default client timeout.
+
  **Configuration**
- - `ping()` - Health check
- - `update_config(config)` / `update_configs(configs)` - Add/update providers
- - `get_configs()` - List all configurations
- - `delete_config(uid)` - Remove provider
+ - `ping(timeout?)` - Health check
+ - `update_config(config, timeout?)` / `update_configs(configs, timeout?)` - Add/update providers
+ - `get_configs(timeout?)` - List all configurations
+ - `delete_config(uid, timeout?)` - Remove provider

  **Agent**
- - `agent_run(request | **kwargs)` - Run agent (blocking)
- - `agent_run_stream(request | **kwargs)` - Run agent (streaming)
+ - `agent_run(request | **kwargs, timeout?)` - Run agent (blocking)
+ - `agent_run_stream(request | **kwargs, timeout?)` - Run agent (streaming)

  **Audio**
- - `speak(request | **kwargs)` - Text-to-speech (blocking)
- - `speak_stream(request | **kwargs)` - Text-to-speech (streaming)
- - `transcribe(request | **kwargs)` - Speech-to-text
+ - `speak(request | **kwargs, timeout?)` - Text-to-speech (blocking)
+ - `speak_stream(request | **kwargs, timeout?)` - Text-to-speech (streaming)
+ - `transcribe(request | **kwargs, timeout?)` - Speech-to-text

  **Real-Time Transcription (TranscriptionWsClient)**
  - `connect()` - Establish WebSocket connection
@@ -750,12 +865,15 @@ response = await client.ping()
  - `MessageRole` - `USER` | `MODEL` | `SYSTEM` | `TOOL_CALL` | `TOOL_RETURN` (or use strings)

  **Requests**
- - `AgentRequest(provider_uid, model, messages, tools?, gen_config?, include_history?, output_schema?)` - Set `include_history=True` to get full conversation. Set `output_schema` for structured JSON output.
+ - `AgentRequest(provider_uid, model, messages, tools?, gen_config?, include_history?, output_schema?, context_limit?, context_overflow_strategy?)` - Set `include_history=True` to get full conversation. Set `output_schema` for structured JSON output. Set `context_limit` and `context_overflow_strategy` for handling large texts.
  - `SpeakRequest(provider_uid, model, text, voice, mime_type, sample_rate, gen_config?)`
  - `TranscribeRequest(provider_uid, file, model, language?, gen_config?)`
  - `TranscriptionInitWsRequest(provider_uid, model, language?, input_sample_rate?, input_audio_format?, gen_config?)`
  - `TranscriptionAudioChunkWsRequest(audio)` - Audio chunk for streaming

+ **Context Overflow**
+ - `ContextOverflowStrategy` - `TRUNCATE` | `RECYCLE`
+
  **Tools**
  - `WebSearchInput(kind=ToolKind.WEB_SEARCH, search_context_size)`
  - `MCPStreamableServerInput(kind=ToolKind.MCP_STREAMABLE_SERVER, url, prefix?, timeout?)`
@@ -13,6 +13,8 @@ Python client library for the LiveLLM Server - a unified proxy for AI agent, aud
  - 🔄 **Streaming** - Real-time streaming for agent and audio
  - 🛠️ **Flexible API** - Use request objects or keyword arguments
  - 📋 **Structured Output** - Get validated JSON responses with schema support (Pydantic, OutputSchema, or dict)
+ - 📏 **Context Overflow Management** - Automatic handling of large texts with truncate/recycle strategies
+ - ⏱️ **Per-Request Timeout** - Override default timeout for individual requests
  - 🎙️ **Audio services** - Text-to-speech and transcription
  - 🎤 **Real-Time Transcription** - WebSocket-based live audio transcription with bidirectional streaming
  - ⚡ **Fallback strategies** - Sequential and parallel handling
@@ -72,10 +74,10 @@ from livellm.models import Settings, ProviderKind
  # Basic
  client = LivellmClient(base_url="http://localhost:8000")

- # With timeout and pre-configured providers
+ # With default timeout and pre-configured providers
  client = LivellmClient(
  base_url="http://localhost:8000",
- timeout=30.0,
+ timeout=30.0, # Default timeout for all requests
  configs=[
  Settings(
  uid="openai",
@@ -93,6 +95,50 @@ client = LivellmClient(
  )
  ```

+ ### Per-Request Timeout Override
+
+ The timeout provided in `__init__` is the default, but you can override it for individual requests:
+
+ ```python
+ # Client with 30s default timeout
+ client = LivellmClient(base_url="http://localhost:8000", timeout=30.0)
+
+ # Uses default 30s timeout
+ response = await client.agent_run(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[TextMessage(role="user", content="Hello")]
+ )
+
+ # Override with 120s timeout for this specific request
+ response = await client.agent_run(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[TextMessage(role="user", content="Write a long essay...")],
+ timeout=120.0 # Override for this request only
+ )
+
+ # Works with streaming too
+ async for chunk in client.agent_run_stream(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[TextMessage(role="user", content="Tell me a story")],
+ timeout=300.0 # 5 minutes for streaming
+ ):
+ print(chunk.output, end="")
+
+ # Works with all methods: speak(), speak_stream(), transcribe(), etc.
+ audio = await client.speak(
+ provider_uid="openai",
+ model="tts-1",
+ text="Hello world",
+ voice="alloy",
+ mime_type=SpeakMimeType.MP3,
+ sample_rate=24000,
+ timeout=60.0
+ )
+ ```
+
  ### Supported Providers

  `OPENAI` • `GOOGLE` • `ANTHROPIC` • `GROQ` • `ELEVENLABS`
@@ -416,6 +462,73 @@ data = json.loads(full_output)
  - Type-safe responses
  - Integration with type-checked code

+ #### Context Overflow Management
+
+ Handle large texts that exceed model context windows with automatic truncation or iterative processing:
+
+ ```python
+ from livellm.models import TextMessage, ContextOverflowStrategy, OutputSchema, PropertyDef
+
+ # TRUNCATE strategy (default): Preserves beginning, middle, and end
+ # Works with both streaming and non-streaming
+ response = await client.agent_run(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[
+ TextMessage(role="system", content="Summarize the document."),
+ TextMessage(role="user", content=very_long_document)
+ ],
+ context_limit=4000, # Max tokens
+ context_overflow_strategy=ContextOverflowStrategy.TRUNCATE
+ )
+
+ # RECYCLE strategy: Iteratively processes chunks and merges results
+ # Useful for extraction tasks - processes entire document
+ # Requires output_schema for JSON merging
+ output_schema = OutputSchema(
+ title="ExtractedInfo",
+ properties={
+ "topics": PropertyDef(type="array", items={"type": "string"}),
+ "key_figures": PropertyDef(type="array", items={"type": "string"})
+ },
+ required=["topics", "key_figures"]
+ )
+
+ response = await client.agent_run(
+ provider_uid="openai",
+ model="gpt-4",
+ messages=[
+ TextMessage(role="system", content="Extract all topics and key figures."),
+ TextMessage(role="user", content=very_long_document)
+ ],
+ context_limit=3000,
+ context_overflow_strategy=ContextOverflowStrategy.RECYCLE,
+ output_schema=output_schema
+ )
+
+ # Parse the merged results
+ import json
+ result = json.loads(response.output)
+ print(f"Topics: {result['topics']}")
+ print(f"Key figures: {result['key_figures']}")
+ ```
+
+ **Strategy comparison:**
+
+ | Strategy | How it works | Best for | Streaming |
+ |----------|--------------|----------|-----------|
+ | `TRUNCATE` | Takes beginning, middle, end portions | Summarization, Q&A | ✅ Yes |
+ | `RECYCLE` | Processes chunks iteratively, merges JSON | Full document extraction | ❌ No |
+
+ **Parameters:**
+ - `context_limit` (int, default: 0) - Maximum tokens. If ≤ 0, overflow handling is disabled
+ - `context_overflow_strategy` (ContextOverflowStrategy, default: TRUNCATE) - Strategy to use
+
+ **Notes:**
+ - System prompts are always preserved (never truncated)
+ - Token counting includes a 20% safety buffer
+ - RECYCLE requires `output_schema` for JSON merging
+
  ### Audio Services

  #### Text-to-Speech
@@ -688,20 +801,22 @@ response = await client.ping()

  ### Client Methods

+ All methods accept an optional `timeout` parameter to override the default client timeout.
+
  **Configuration**
- - `ping()` - Health check
- - `update_config(config)` / `update_configs(configs)` - Add/update providers
- - `get_configs()` - List all configurations
- - `delete_config(uid)` - Remove provider
+ - `ping(timeout?)` - Health check
+ - `update_config(config, timeout?)` / `update_configs(configs, timeout?)` - Add/update providers
+ - `get_configs(timeout?)` - List all configurations
+ - `delete_config(uid, timeout?)` - Remove provider

  **Agent**
- - `agent_run(request | **kwargs)` - Run agent (blocking)
- - `agent_run_stream(request | **kwargs)` - Run agent (streaming)
+ - `agent_run(request | **kwargs, timeout?)` - Run agent (blocking)
+ - `agent_run_stream(request | **kwargs, timeout?)` - Run agent (streaming)

  **Audio**
- - `speak(request | **kwargs)` - Text-to-speech (blocking)
- - `speak_stream(request | **kwargs)` - Text-to-speech (streaming)
- - `transcribe(request | **kwargs)` - Speech-to-text
+ - `speak(request | **kwargs, timeout?)` - Text-to-speech (blocking)
+ - `speak_stream(request | **kwargs, timeout?)` - Text-to-speech (streaming)
+ - `transcribe(request | **kwargs, timeout?)` - Speech-to-text

  **Real-Time Transcription (TranscriptionWsClient)**
  - `connect()` - Establish WebSocket connection
@@ -727,12 +842,15 @@ response = await client.ping()
  - `MessageRole` - `USER` | `MODEL` | `SYSTEM` | `TOOL_CALL` | `TOOL_RETURN` (or use strings)

  **Requests**
- - `AgentRequest(provider_uid, model, messages, tools?, gen_config?, include_history?, output_schema?)` - Set `include_history=True` to get full conversation. Set `output_schema` for structured JSON output.
+ - `AgentRequest(provider_uid, model, messages, tools?, gen_config?, include_history?, output_schema?, context_limit?, context_overflow_strategy?)` - Set `include_history=True` to get full conversation. Set `output_schema` for structured JSON output. Set `context_limit` and `context_overflow_strategy` for handling large texts.
  - `SpeakRequest(provider_uid, model, text, voice, mime_type, sample_rate, gen_config?)`
  - `TranscribeRequest(provider_uid, file, model, language?, gen_config?)`
  - `TranscriptionInitWsRequest(provider_uid, model, language?, input_sample_rate?, input_audio_format?, gen_config?)`
  - `TranscriptionAudioChunkWsRequest(audio)` - Audio chunk for streaming

+ **Context Overflow**
+ - `ContextOverflowStrategy` - `TRUNCATE` | `RECYCLE`
+
  **Tools**
  - `WebSearchInput(kind=ToolKind.WEB_SEARCH, search_context_size)`
  - `MCPStreamableServerInput(kind=ToolKind.MCP_STREAMABLE_SERVER, url, prefix?, timeout?)`
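Before the source changes below, a brief sketch of how the timeout-aware configuration helpers listed above might be called. This is illustrative only: the import path for `LivellmClient` and the server URL are assumptions, not taken from this diff.

```python
import asyncio
from livellm import LivellmClient  # assumed import path; adjust to your install

async def main() -> None:
    client = LivellmClient(base_url="http://localhost:8000", timeout=30.0)
    try:
        await client.ping(timeout=5.0)                    # quick health check
        configs = await client.get_configs(timeout=10.0)  # allow a bit longer here
        print(f"{len(configs)} provider config(s) registered")
    finally:
        await client.cleanup()

asyncio.run(main())
```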
@@ -31,10 +31,15 @@ DEFAULT_USER_AGENT = f"livellm-python/{__version__}"

  class BaseLivellmClient(ABC):

+ # Default timeout (set by subclasses)
+ timeout: Optional[float] = None
+
  @overload
  async def agent_run(
  self,
  request: Union[AgentRequest, AgentFallbackRequest],
+ *,
+ timeout: Optional[float] = None,
  ) -> AgentResponse:
  ...

@@ -48,13 +53,18 @@ class BaseLivellmClient(ABC):
  tools: Optional[list] = None,
  include_history: bool = False,
  output_schema: Optional[Union[OutputSchema, Dict[str, Any], Type[BaseModel]]] = None,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> AgentResponse:
  ...


  @abstractmethod
- async def handle_agent_run(self, request: Union[AgentRequest, AgentFallbackRequest]) -> AgentResponse:
+ async def handle_agent_run(
+ self,
+ request: Union[AgentRequest, AgentFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> AgentResponse:
  ...

  async def agent_run(
@@ -67,6 +77,7 @@ class BaseLivellmClient(ABC):
  tools: Optional[list] = None,
  include_history: bool = False,
  output_schema: Optional[Union[OutputSchema, Dict[str, Any], Type[BaseModel]]] = None,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> AgentResponse:
  """
@@ -100,6 +111,7 @@ class BaseLivellmClient(ABC):
  - An OutputSchema instance
  - A dict representing a JSON schema
  - A Pydantic BaseModel class (will be converted to OutputSchema)
+ timeout: Optional timeout in seconds (overrides default client timeout)

  Returns:
  AgentResponse with the agent's output. If output_schema was provided,
@@ -111,7 +123,7 @@ class BaseLivellmClient(ABC):
  raise TypeError(
  f"First positional argument must be AgentRequest or AgentFallbackRequest, got {type(request)}"
  )
- return await self.handle_agent_run(request)
+ return await self.handle_agent_run(request, timeout=timeout)

  # Otherwise, use keyword arguments
  if provider_uid is None or model is None or messages is None:
@@ -132,7 +144,7 @@ class BaseLivellmClient(ABC):
  include_history=include_history,
  output_schema=resolved_schema
  )
- return await self.handle_agent_run(agent_request)
+ return await self.handle_agent_run(agent_request, timeout=timeout)

  def _resolve_output_schema(
  self,
@@ -157,6 +169,8 @@ class BaseLivellmClient(ABC):
  def agent_run_stream(
  self,
  request: Union[AgentRequest, AgentFallbackRequest],
+ *,
+ timeout: Optional[float] = None,
  ) -> AsyncIterator[AgentResponse]:
  ...

@@ -170,13 +184,18 @@ class BaseLivellmClient(ABC):
  tools: Optional[list] = None,
  include_history: bool = False,
  output_schema: Optional[Union[OutputSchema, Dict[str, Any], Type[BaseModel]]] = None,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> AsyncIterator[AgentResponse]:
  ...


  @abstractmethod
- async def handle_agent_run_stream(self, request: Union[AgentRequest, AgentFallbackRequest]) -> AsyncIterator[AgentResponse]:
+ async def handle_agent_run_stream(
+ self,
+ request: Union[AgentRequest, AgentFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> AsyncIterator[AgentResponse]:
  ...

  async def agent_run_stream(
@@ -189,6 +208,7 @@ class BaseLivellmClient(ABC):
  tools: Optional[list] = None,
  include_history: bool = False,
  output_schema: Optional[Union[OutputSchema, Dict[str, Any], Type[BaseModel]]] = None,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> AsyncIterator[AgentResponse]:
  """
@@ -225,6 +245,7 @@ class BaseLivellmClient(ABC):
  - An OutputSchema instance
  - A dict representing a JSON schema
  - A Pydantic BaseModel class (will be converted to OutputSchema)
+ timeout: Optional timeout in seconds (overrides default client timeout)

  Returns:
  AsyncIterator of AgentResponse chunks. If output_schema was provided,
@@ -236,7 +257,7 @@ class BaseLivellmClient(ABC):
  raise TypeError(
  f"First positional argument must be AgentRequest or AgentFallbackRequest, got {type(request)}"
  )
- stream = self.handle_agent_run_stream(request)
+ stream = self.handle_agent_run_stream(request, timeout=timeout)
  else:
  # Otherwise, use keyword arguments
  if provider_uid is None or model is None or messages is None:
@@ -257,7 +278,7 @@ class BaseLivellmClient(ABC):
  include_history=include_history,
  output_schema=resolved_schema
  )
- stream = self.handle_agent_run_stream(agent_request)
+ stream = self.handle_agent_run_stream(agent_request, timeout=timeout)

  async for chunk in stream:
  yield chunk
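The overloads in the hunks above define two calling conventions for `agent_run` / `agent_run_stream`: a positional request object with `timeout` as a keyword-only argument, or plain keyword arguments from which the client builds the request. A short sketch of both styles; the client is assumed to be constructed as in the Quick start, and the uid, model, and prompt are placeholders:

```python
from livellm.models import AgentRequest, TextMessage

async def run_both_styles(client) -> None:
    # Style 1: pass a prebuilt request object; timeout is keyword-only here.
    request = AgentRequest(
        provider_uid="openai",
        model="gpt-4",
        messages=[TextMessage(role="user", content="Hello")],
    )
    await client.agent_run(request, timeout=45.0)

    # Style 2: keyword arguments; the client assembles the AgentRequest itself.
    await client.agent_run(
        provider_uid="openai",
        model="gpt-4",
        messages=[TextMessage(role="user", content="Hello")],
        timeout=45.0,
    )
```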
@@ -266,6 +287,8 @@ class BaseLivellmClient(ABC):
  async def speak(
  self,
  request: Union[SpeakRequest, AudioFallbackRequest],
+ *,
+ timeout: Optional[float] = None,
  ) -> bytes:
  ...

@@ -280,13 +303,18 @@ class BaseLivellmClient(ABC):
  mime_type: str,
  sample_rate: int,
  chunk_size: int = 20,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> bytes:
  ...


  @abstractmethod
- async def handle_speak(self, request: Union[SpeakRequest, AudioFallbackRequest]) -> bytes:
+ async def handle_speak(
+ self,
+ request: Union[SpeakRequest, AudioFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> bytes:
  ...

  async def speak(
@@ -300,6 +328,7 @@ class BaseLivellmClient(ABC):
  mime_type: Optional[str] = None,
  sample_rate: Optional[int] = None,
  chunk_size: int = 20,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> bytes:
  """
@@ -330,6 +359,7 @@ class BaseLivellmClient(ABC):
  mime_type: The MIME type of the output audio
  sample_rate: The sample rate of the output audio
  chunk_size: Chunk size in milliseconds (default: 20ms)
+ timeout: Optional timeout in seconds (overrides default client timeout)
  gen_config: Optional generation configuration

  Returns:
@@ -341,7 +371,7 @@ class BaseLivellmClient(ABC):
  raise TypeError(
  f"First positional argument must be SpeakRequest or AudioFallbackRequest, got {type(request)}"
  )
- return await self.handle_speak(request)
+ return await self.handle_speak(request, timeout=timeout)

  # Otherwise, use keyword arguments
  if provider_uid is None or model is None or text is None or voice is None or mime_type is None or sample_rate is None:
@@ -360,12 +390,14 @@ class BaseLivellmClient(ABC):
  chunk_size=chunk_size,
  gen_config=kwargs or None
  )
- return await self.handle_speak(speak_request)
+ return await self.handle_speak(speak_request, timeout=timeout)

  @overload
  def speak_stream(
  self,
  request: Union[SpeakRequest, AudioFallbackRequest],
+ *,
+ timeout: Optional[float] = None,
  ) -> AsyncIterator[bytes]:
  ...

@@ -380,13 +412,18 @@ class BaseLivellmClient(ABC):
  mime_type: str,
  sample_rate: int,
  chunk_size: int = 20,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> AsyncIterator[bytes]:
  ...


  @abstractmethod
- async def handle_speak_stream(self, request: Union[SpeakRequest, AudioFallbackRequest]) -> AsyncIterator[bytes]:
+ async def handle_speak_stream(
+ self,
+ request: Union[SpeakRequest, AudioFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> AsyncIterator[bytes]:
  ...

  async def speak_stream(
@@ -400,6 +437,7 @@ class BaseLivellmClient(ABC):
  mime_type: Optional[str] = None,
  sample_rate: Optional[int] = None,
  chunk_size: int = 20,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> AsyncIterator[bytes]:
  """
@@ -433,6 +471,7 @@ class BaseLivellmClient(ABC):
  mime_type: The MIME type of the output audio
  sample_rate: The sample rate of the output audio
  chunk_size: Chunk size in milliseconds (default: 20ms)
+ timeout: Optional timeout in seconds (overrides default client timeout)
  gen_config: Optional generation configuration

  Returns:
@@ -444,7 +483,7 @@ class BaseLivellmClient(ABC):
  raise TypeError(
  f"First positional argument must be SpeakRequest or AudioFallbackRequest, got {type(request)}"
  )
- speak_stream = self.handle_speak_stream(request)
+ speak_stream = self.handle_speak_stream(request, timeout=timeout)
  else:
  # Otherwise, use keyword arguments
  if provider_uid is None or model is None or text is None or voice is None or mime_type is None or sample_rate is None:
@@ -463,7 +502,7 @@ class BaseLivellmClient(ABC):
  chunk_size=chunk_size,
  gen_config=kwargs or None
  )
- speak_stream = self.handle_speak_stream(speak_request)
+ speak_stream = self.handle_speak_stream(speak_request, timeout=timeout)
  async for chunk in speak_stream:
  yield chunk

@@ -471,6 +510,8 @@ class BaseLivellmClient(ABC):
  async def transcribe(
  self,
  request: Union[TranscribeRequest, TranscribeFallbackRequest],
+ *,
+ timeout: Optional[float] = None,
  ) -> TranscribeResponse:
  ...

@@ -482,13 +523,18 @@ class BaseLivellmClient(ABC):
  file: File,
  model: str,
  language: Optional[str] = None,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> TranscribeResponse:
  ...


  @abstractmethod
- async def handle_transcribe(self, request: Union[TranscribeRequest, TranscribeFallbackRequest]) -> TranscribeResponse:
+ async def handle_transcribe(
+ self,
+ request: Union[TranscribeRequest, TranscribeFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> TranscribeResponse:
  ...

  async def transcribe(
@@ -499,6 +545,7 @@ class BaseLivellmClient(ABC):
  file: Optional[File] = None,
  model: Optional[str] = None,
  language: Optional[str] = None,
+ timeout: Optional[float] = None,
  **kwargs
  ) -> TranscribeResponse:
  """
@@ -522,6 +569,7 @@ class BaseLivellmClient(ABC):
  file: The audio file as a tuple (filename, content, content_type)
  model: The model to use for transcription
  language: Optional language code
+ timeout: Optional timeout in seconds (overrides default client timeout)
  gen_config: Optional generation configuration

  Returns:
@@ -534,7 +582,7 @@ class BaseLivellmClient(ABC):
  f"First positional argument must be TranscribeRequest or TranscribeFallbackRequest, got {type(request)}"
  )
  # JSON-based request
- return await self.handle_transcribe(request)
+ return await self.handle_transcribe(request, timeout=timeout)

  # Otherwise, use keyword arguments with multipart form-data request
  if provider_uid is None or file is None or model is None:
@@ -550,7 +598,7 @@ class BaseLivellmClient(ABC):
  language=language,
  gen_config=kwargs or None
  )
- return await self.handle_transcribe(transcribe_request)
+ return await self.handle_transcribe(transcribe_request, timeout=timeout)


  class LivellmWsClient(BaseLivellmClient):
@@ -628,7 +676,11 @@ class LivellmWsClient(BaseLivellmClient):
  self.__listen_for_responses_task = None
  self.sessions.clear()

- async def get_response(self, action: WsAction, payload: dict) -> dict:
+ def _get_effective_timeout(self, timeout: Optional[float]) -> Optional[float]:
+ """Get effective timeout: per-request timeout overrides default."""
+ return timeout if timeout is not None else self.timeout
+
+ async def get_response(self, action: WsAction, payload: dict, timeout: Optional[float] = None) -> dict:
  """Send a request and wait for response."""
  if self.websocket is None:
  await self.connect()
@@ -638,7 +690,17 @@ class LivellmWsClient(BaseLivellmClient):
  q = await self.get_or_update_session(session_id)
  await self.websocket.send(json.dumps(request.model_dump()))

- response: WsResponse = await q.get()
+ effective_timeout = self._get_effective_timeout(timeout)
+
+ try:
+ if effective_timeout:
+ response: WsResponse = await asyncio.wait_for(q.get(), timeout=effective_timeout)
+ else:
+ response: WsResponse = await q.get()
+ except asyncio.TimeoutError:
+ self.sessions.pop(session_id, None)
+ raise TimeoutError(f"Request timed out after {effective_timeout} seconds")
+
  self.sessions.pop(session_id)
  if response.status == WsStatus.ERROR:
  raise Exception(f"WebSocket failed: {response.error}")
@@ -647,7 +709,7 @@ class LivellmWsClient(BaseLivellmClient):
  else:
  raise Exception(f"WebSocket failed with unknown status: {response}")

- async def get_response_stream(self, action: WsAction, payload: dict) -> AsyncIterator[dict]:
+ async def get_response_stream(self, action: WsAction, payload: dict, timeout: Optional[float] = None) -> AsyncIterator[dict]:
  """Send a request and stream responses."""
  if self.websocket is None:
  await self.connect()
@@ -657,8 +719,17 @@ class LivellmWsClient(BaseLivellmClient):
  q = await self.get_or_update_session(session_id)
  await self.websocket.send(json.dumps(request.model_dump()))

+ effective_timeout = self._get_effective_timeout(timeout)
+
  while True:
- response: WsResponse = await q.get()
+ try:
+ if effective_timeout:
+ response: WsResponse = await asyncio.wait_for(q.get(), timeout=effective_timeout)
+ else:
+ response: WsResponse = await q.get()
+ except asyncio.TimeoutError:
+ self.sessions.pop(session_id, None)
+ raise TimeoutError(f"Request timed out after {effective_timeout} seconds")

  if response.status == WsStatus.STREAMING:
  yield response.data
@@ -674,37 +745,60 @@ class LivellmWsClient(BaseLivellmClient):

  # Implement abstract methods from BaseLivellmClient

- async def handle_agent_run(self, request: Union[AgentRequest, AgentFallbackRequest]) -> AgentResponse:
+ async def handle_agent_run(
+ self,
+ request: Union[AgentRequest, AgentFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> AgentResponse:
  """Handle agent run via WebSocket."""
  response = await self.get_response(
  WsAction.AGENT_RUN,
- request.model_dump()
+ request.model_dump(),
+ timeout=timeout
  )
  return AgentResponse(**response)

- async def handle_agent_run_stream(self, request: Union[AgentRequest, AgentFallbackRequest]) -> AsyncIterator[AgentResponse]:
+ async def handle_agent_run_stream(
+ self,
+ request: Union[AgentRequest, AgentFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> AsyncIterator[AgentResponse]:
  """Handle streaming agent run via WebSocket."""
- async for response in self.get_response_stream(WsAction.AGENT_RUN_STREAM, request.model_dump()):
+ async for response in self.get_response_stream(WsAction.AGENT_RUN_STREAM, request.model_dump(), timeout=timeout):
  yield AgentResponse(**response)

- async def handle_speak(self, request: Union[SpeakRequest, AudioFallbackRequest]) -> bytes:
+ async def handle_speak(
+ self,
+ request: Union[SpeakRequest, AudioFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> bytes:
  """Handle speak request via WebSocket."""
  response = await self.get_response(
  WsAction.AUDIO_SPEAK,
- request.model_dump()
+ request.model_dump(),
+ timeout=timeout
  )
  return EncodedSpeakResponse(**response).audio

- async def handle_speak_stream(self, request: Union[SpeakRequest, AudioFallbackRequest]) -> AsyncIterator[bytes]:
+ async def handle_speak_stream(
+ self,
+ request: Union[SpeakRequest, AudioFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> AsyncIterator[bytes]:
  """Handle streaming speak request via WebSocket."""
- async for response in self.get_response_stream(WsAction.AUDIO_SPEAK_STREAM, request.model_dump()):
+ async for response in self.get_response_stream(WsAction.AUDIO_SPEAK_STREAM, request.model_dump(), timeout=timeout):
  yield EncodedSpeakResponse(**response).audio

- async def handle_transcribe(self, request: Union[TranscribeRequest, TranscribeFallbackRequest]) -> TranscribeResponse:
+ async def handle_transcribe(
+ self,
+ request: Union[TranscribeRequest, TranscribeFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> TranscribeResponse:
  """Handle transcribe request via WebSocket."""
  response = await self.get_response(
  WsAction.AUDIO_TRANSCRIBE,
- request.model_dump()
+ request.model_dump(),
+ timeout=timeout
  )
  return TranscribeResponse(**response)

@@ -747,8 +841,8 @@ class LivellmClient(BaseLivellmClient):
  self.base_url = f"{self._root_base_url}/livellm"
  self.timeout = timeout
  self.user_agent = user_agent or DEFAULT_USER_AGENT
- self.client = httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout) \
- if self.timeout else httpx.AsyncClient(base_url=self.base_url)
+ # Create client without timeout - we'll pass timeout per-request
+ self.client = httpx.AsyncClient(base_url=self.base_url)
  self.settings = []
  self.headers = {
  "Content-Type": "application/json",
@@ -759,6 +853,10 @@ class LivellmClient(BaseLivellmClient):
  if configs:
  self.update_configs_post_init(configs)

+ def _get_effective_timeout(self, timeout: Optional[float]) -> Optional[float]:
+ """Get effective timeout: per-request timeout overrides default."""
+ return timeout if timeout is not None else self.timeout
+
  @property
  def realtime(self) -> LivellmWsClient:
  """
@@ -789,15 +887,17 @@ class LivellmClient(BaseLivellmClient):
  return SuccessResponse(success=True, message="Configs updated successfully")


- async def delete(self, endpoint: str) -> dict:
+ async def delete(self, endpoint: str, timeout: Optional[float] = None) -> dict:
  """
  Delete a resource from the given endpoint and return the response.
  Args:
  endpoint: The endpoint to delete from.
+ timeout: Optional timeout override.
  Returns:
  The response from the endpoint.
  """
- response = await self.client.delete(endpoint, headers=self.headers)
+ effective_timeout = self._get_effective_timeout(timeout)
+ response = await self.client.delete(endpoint, headers=self.headers, timeout=effective_timeout)
  response.raise_for_status()
  return response.json()

@@ -805,7 +905,8 @@ class LivellmClient(BaseLivellmClient):
  self,
  files: dict,
  data: dict,
- endpoint: str
+ endpoint: str,
+ timeout: Optional[float] = None
  ) -> dict:
  """
  Post a multipart request to the given endpoint and return the response.
@@ -813,27 +914,32 @@ class LivellmClient(BaseLivellmClient):
  files: The files to send in the request.
  data: The data to send in the request.
  endpoint: The endpoint to post to.
+ timeout: Optional timeout override.
  Returns:
  The response from the endpoint.
  """
+ effective_timeout = self._get_effective_timeout(timeout)
  # Don't pass Content-Type header for multipart - httpx will set it automatically
- response = await self.client.post(endpoint, files=files, data=data)
+ response = await self.client.post(endpoint, files=files, data=data, timeout=effective_timeout)
  response.raise_for_status()
  return response.json()


  async def get(
  self,
- endpoint: str
+ endpoint: str,
+ timeout: Optional[float] = None
  ) -> dict:
  """
  Get a request from the given endpoint and return the response.
  Args:
  endpoint: The endpoint to get from.
+ timeout: Optional timeout override.
  Returns:
  The response from the endpoint.
  """
- response = await self.client.get(endpoint, headers=self.headers)
+ effective_timeout = self._get_effective_timeout(timeout)
+ response = await self.client.get(endpoint, headers=self.headers, timeout=effective_timeout)
  response.raise_for_status()
  return response.json()

@@ -842,7 +948,8 @@ class LivellmClient(BaseLivellmClient):
  json_data: dict,
  endpoint: str,
  expect_stream: bool = False,
- expect_json: bool = True
+ expect_json: bool = True,
+ timeout: Optional[float] = None
  ) -> Union[dict, bytes, AsyncIterator[Union[dict, bytes]]]:
  """
  Post a request to the given endpoint and return the response.
@@ -854,12 +961,14 @@ class LivellmClient(BaseLivellmClient):
  endpoint: The endpoint to post to.
  expect_stream: Whether to expect a stream response.
  expect_json: Whether to expect a JSON response.
+ timeout: Optional timeout override.
  Returns:
  The response from the endpoint.
  Raises:
  Exception: If the response is not 200 or 201.
  """
- response = await self.client.post(endpoint, json=json_data, headers=self.headers)
+ effective_timeout = self._get_effective_timeout(timeout)
+ response = await self.client.post(endpoint, json=json_data, headers=self.headers, timeout=effective_timeout)
  if response.status_code not in [200, 201]:
  error_response = await response.aread()
  error_response = error_response.decode("utf-8")
@@ -882,26 +991,26 @@ class LivellmClient(BaseLivellmClient):
  else:
  return response.content

- async def ping(self) -> SuccessResponse:
- result = await self.get("ping")
+ async def ping(self, timeout: Optional[float] = None) -> SuccessResponse:
+ result = await self.get("ping", timeout=timeout)
  return SuccessResponse(**result)

- async def update_config(self, config: Settings) -> SuccessResponse:
- result = await self.post(config.model_dump(), "providers/config", expect_json=True)
+ async def update_config(self, config: Settings, timeout: Optional[float] = None) -> SuccessResponse:
+ result = await self.post(config.model_dump(), "providers/config", expect_json=True, timeout=timeout)
  self.settings.append(config)
  return SuccessResponse(**result)

- async def update_configs(self, configs: List[Settings]) -> SuccessResponse:
+ async def update_configs(self, configs: List[Settings], timeout: Optional[float] = None) -> SuccessResponse:
  for config in configs:
- await self.update_config(config)
+ await self.update_config(config, timeout=timeout)
  return SuccessResponse(success=True, message="Configs updated successfully")

- async def get_configs(self) -> List[Settings]:
- result = await self.get("providers/configs")
+ async def get_configs(self, timeout: Optional[float] = None) -> List[Settings]:
+ result = await self.get("providers/configs", timeout=timeout)
  return [Settings(**config) for config in result]

- async def delete_config(self, config_uid: str) -> SuccessResponse:
- result = await self.delete(f"providers/config/{config_uid}")
+ async def delete_config(self, config_uid: str, timeout: Optional[float] = None) -> SuccessResponse:
+ result = await self.delete(f"providers/config/{config_uid}", timeout=timeout)
  return SuccessResponse(**result)

  async def cleanup(self):
@@ -916,59 +1025,51 @@ class LivellmClient(BaseLivellmClient):
  # Also close any realtime WebSocket client if it was created
  if self._realtime is not None:
  await self._realtime.disconnect()
-
- # def __del__(self):
- # """
- # Destructor to clean up resources when the client is garbage collected.
- # This will close the HTTP client and attempt to delete configs if cleanup wasn't called.
- # Note: It's recommended to use the async context manager or call cleanup() explicitly.
- # """
- # # Warn user if cleanup wasn't called
- # if self.settings:
- # warnings.warn(
- # "LivellmClient is being garbage collected without explicit cleanup. "
- # "Provider configs may not be deleted from the server. "
- # "Consider using 'async with' or calling 'await client.cleanup()' explicitly.",
- # ResourceWarning,
- # stacklevel=2
- # )
-
- # # Close the httpx client synchronously
- # # httpx.AsyncClient stores a sync Transport that needs cleanup
- # try:
- # with httpx.Client(base_url=self.base_url) as client:
- # for config in self.settings:
- # config: Settings = config
- # client.delete(f"providers/config/{config.uid}", headers=self.headers)
- # except Exception:
- # # Silently fail - we're in a destructor
- # pass

  # Implement abstract methods from BaseLivellmClient

- async def handle_agent_run(self, request: Union[AgentRequest, AgentFallbackRequest]) -> AgentResponse:
+ async def handle_agent_run(
+ self,
+ request: Union[AgentRequest, AgentFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> AgentResponse:
  """Handle agent run via HTTP."""
- result = await self.post(request.model_dump(), "agent/run", expect_json=True)
+ result = await self.post(request.model_dump(), "agent/run", expect_json=True, timeout=timeout)
  return AgentResponse(**result)

- async def handle_agent_run_stream(self, request: Union[AgentRequest, AgentFallbackRequest]) -> AsyncIterator[AgentResponse]:
+ async def handle_agent_run_stream(
+ self,
+ request: Union[AgentRequest, AgentFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> AsyncIterator[AgentResponse]:
  """Handle streaming agent run via HTTP."""
- stream = await self.post(request.model_dump(), "agent/run_stream", expect_stream=True, expect_json=True)
+ stream = await self.post(request.model_dump(), "agent/run_stream", expect_stream=True, expect_json=True, timeout=timeout)
  async for chunk in stream:
  yield AgentResponse(**chunk)

- async def handle_speak(self, request: Union[SpeakRequest, AudioFallbackRequest]) -> bytes:
+ async def handle_speak(
+ self,
+ request: Union[SpeakRequest, AudioFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> bytes:
  """Handle speak request via HTTP."""
- return await self.post(request.model_dump(), "audio/speak", expect_json=False)
+ return await self.post(request.model_dump(), "audio/speak", expect_json=False, timeout=timeout)

- async def handle_speak_stream(self, request: Union[SpeakRequest, AudioFallbackRequest]) -> AsyncIterator[bytes]:
+ async def handle_speak_stream(
+ self,
+ request: Union[SpeakRequest, AudioFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> AsyncIterator[bytes]:
  """Handle streaming speak request via HTTP."""
- speak_stream = await self.post(request.model_dump(), "audio/speak_stream", expect_stream=True, expect_json=False)
+ speak_stream = await self.post(request.model_dump(), "audio/speak_stream", expect_stream=True, expect_json=False, timeout=timeout)
  async for chunk in speak_stream:
  yield chunk

- async def handle_transcribe(self, request: Union[TranscribeRequest, TranscribeFallbackRequest]) -> TranscribeResponse:
+ async def handle_transcribe(
+ self,
+ request: Union[TranscribeRequest, TranscribeFallbackRequest],
+ timeout: Optional[float] = None
+ ) -> TranscribeResponse:
  """Handle transcribe request via HTTP."""
- result = await self.post(request.model_dump(), "audio/transcribe_json", expect_json=True)
+ result = await self.post(request.model_dump(), "audio/transcribe_json", expect_json=True, timeout=timeout)
  return TranscribeResponse(**result)
-
@@ -1,6 +1,6 @@
  from .common import BaseRequest, ProviderKind, Settings, SuccessResponse
  from .fallback import AgentFallbackRequest, AudioFallbackRequest, TranscribeFallbackRequest, FallbackStrategy
- from .agent.agent import AgentRequest, AgentResponse, AgentResponseUsage
+ from .agent.agent import AgentRequest, AgentResponse, AgentResponseUsage, ContextOverflowStrategy
  from .agent.chat import Message, MessageRole, TextMessage, BinaryMessage, ToolCallMessage, ToolReturnMessage
  from .agent.tools import Tool, ToolInput, ToolKind, WebSearchInput, MCPStreamableServerInput
  from .agent.output_schema import OutputSchema, PropertyDef
@@ -24,6 +24,7 @@ __all__ = [
  "AgentRequest",
  "AgentResponse",
  "AgentResponseUsage",
+ "ContextOverflowStrategy",
  "Message",
  "MessageRole",
  "TextMessage",
@@ -1,4 +1,4 @@
- from .agent import AgentRequest, AgentResponse, AgentResponseUsage
+ from .agent import AgentRequest, AgentResponse, AgentResponseUsage, ContextOverflowStrategy
  from .chat import Message, MessageRole, TextMessage, BinaryMessage, ToolCallMessage, ToolReturnMessage
  from .tools import Tool, ToolInput, ToolKind, WebSearchInput, MCPStreamableServerInput
  from .output_schema import OutputSchema, PropertyDef
@@ -8,6 +8,7 @@ __all__ = [
  "AgentRequest",
  "AgentResponse",
  "AgentResponseUsage",
+ "ContextOverflowStrategy",
  "Message",
  "MessageRole",
  "TextMessage",
@@ -2,12 +2,19 @@

  from pydantic import BaseModel, Field
  from typing import Optional, List, Union, Any, Dict
+ from enum import Enum
  from .chat import TextMessage, BinaryMessage, ToolCallMessage, ToolReturnMessage
  from .tools import WebSearchInput, MCPStreamableServerInput
  from .output_schema import OutputSchema, PropertyDef
  from ..common import BaseRequest


+ class ContextOverflowStrategy(str, Enum):
+ """Strategy for handling context overflow when text exceeds context_limit."""
+ TRUNCATE = "truncate" # Take beginning, middle, and end portions
+ RECYCLE = "recycle" # Iteratively process chunks, merging results
+
+
  class AgentRequest(BaseRequest):
  model: str = Field(..., description="The model to use")
  messages: List[Union[TextMessage, BinaryMessage, ToolCallMessage, ToolReturnMessage]] = Field(..., description="The messages to use")
@@ -15,6 +22,8 @@ class AgentRequest(BaseRequest):
  gen_config: Optional[dict] = Field(default=None, description="The configuration for the generation")
  include_history: bool = Field(default=False, description="Whether to include full conversation history in the response")
  output_schema: Optional[Union[OutputSchema, Dict[str, Any]]] = Field(default=None, description="JSON schema for structured output. Can be an OutputSchema, a dict representing a JSON schema, or will be converted from a Pydantic BaseModel.")
+ context_limit: int = Field(default=0, description="Maximum context size in tokens. If <= 0, context overflow handling is disabled.")
+ context_overflow_strategy: ContextOverflowStrategy = Field(default=ContextOverflowStrategy.TRUNCATE, description="Strategy for handling context overflow: 'truncate' or 'recycle'")

  class AgentResponseUsage(BaseModel):
  input_tokens: int = Field(..., description="The number of input tokens used")
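To show how the two new `AgentRequest` fields fit together, here is a small sketch that constructs a request with overflow handling enabled. It assumes the `livellm` package is installed; the provider uid, model name, and message text are placeholders:

```python
from livellm.models import AgentRequest, TextMessage, ContextOverflowStrategy

# context_limit defaults to 0, which leaves overflow handling disabled,
# so it must be set to a positive token budget to opt in.
request = AgentRequest(
    provider_uid="openai",  # placeholder: uid of a provider configured on the server
    model="gpt-4",
    messages=[
        TextMessage(role="system", content="Summarize the document."),
        TextMessage(role="user", content="... very long document text ..."),
    ],
    context_limit=4000,
    context_overflow_strategy=ContextOverflowStrategy.TRUNCATE,  # or .RECYCLE (requires output_schema)
)

print(request.context_overflow_strategy.value)  # "truncate" - the str enum serializes to its value
```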
@@ -1,6 +1,6 @@
  [project]
  name = "livellm"
- version = "1.6.1"
+ version = "1.7.1"
  description = "Python client for the LiveLLM Server"
  readme = "README.md"
  requires-python = ">=3.10"