inferencesh 0.2.23__py3-none-any.whl → 0.4.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
inferencesh/models/llm.py CHANGED
@@ -1,96 +1,124 @@
 from typing import Optional, List, Any, Callable, Dict, Generator
 from enum import Enum
-from pydantic import Field
-from queue import Queue
+from pydantic import Field, BaseModel
+from queue import Queue, Empty
 from threading import Thread
 import time
 from contextlib import contextmanager
 import base64
+import json
 
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
 
-
 class ContextMessageRole(str, Enum):
     USER = "user"
     ASSISTANT = "assistant"
     SYSTEM = "system"
+    TOOL = "tool"
 
 
 class Message(BaseAppInput):
     role: ContextMessageRole
     content: str
 
-
 class ContextMessage(BaseAppInput):
     role: ContextMessageRole = Field(
-        description="The role of the message",
+        description="the role of the message. user, assistant, or system",
     )
     text: str = Field(
-        description="The text content of the message"
+        description="the text content of the message"
     )
     image: Optional[File] = Field(
-        description="The image url of the message",
+        description="the image file of the message",
+        default=None
+    )
+    images: Optional[List[File]] = Field(
+        description="the images of the message",
+        default=None
+    )
+    tool_calls: Optional[List[Dict[str, Any]]] = Field(
+        description="the tool calls of the message",
+        default=None
+    )
+    tool_call_id: Optional[str] = Field(
+        description="the tool call id for tool role messages",
         default=None
     )
 
-class LLMInput(BaseAppInput):
+class BaseLLMInput(BaseAppInput):
+    """Base class with common LLM fields."""
     system_prompt: str = Field(
-        description="The system prompt to use for the model",
-        default="You are a helpful assistant that can answer questions and help with tasks.",
+        description="the system prompt to use for the model",
+        default="you are a helpful assistant that can answer questions and help with tasks.",
         examples=[
-            "You are a helpful assistant that can answer questions and help with tasks.",
-            "You are a certified medical professional who can provide accurate health information.",
-            "You are a certified financial advisor who can give sound investment guidance.",
-            "You are a certified cybersecurity expert who can explain security best practices.",
-            "You are a certified environmental scientist who can discuss climate and sustainability.",
+            "you are a helpful assistant that can answer questions and help with tasks.",
         ]
     )
     context: List[ContextMessage] = Field(
-        description="The context to use for the model",
+        description="the context to use for the model",
+        default=[],
         examples=[
             [
-                {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
+                {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
                 {"role": "assistant", "content": [{"type": "text", "text": "The capital of France is Paris."}]}
-            ],
-            [
-                {"role": "user", "content": [{"type": "text", "text": "What is the weather like today?"}]},
-                {"role": "assistant", "content": [{"type": "text", "text": "I apologize, but I don't have access to real-time weather information. You would need to check a weather service or app to get current weather conditions for your location."}]}
-            ],
-            [
-                {"role": "user", "content": [{"type": "text", "text": "Can you help me write a poem about spring?"}]},
-                {"role": "assistant", "content": [{"type": "text", "text": "Here's a short poem about spring:\n\nGreen buds awakening,\nSoft rain gently falling down,\nNew life springs anew.\n\nWarm sun breaks through clouds,\nBirds return with joyful song,\nNature's sweet rebirth."}]}
-            ],
-            [
-                {"role": "user", "content": [{"type": "text", "text": "Explain quantum computing in simple terms"}]},
-                {"role": "assistant", "content": [{"type": "text", "text": "Quantum computing is like having a super-powerful calculator that can solve many problems at once instead of one at a time. While regular computers use bits (0s and 1s), quantum computers use quantum bits or \"qubits\" that can be both 0 and 1 at the same time - kind of like being in two places at once! This allows them to process huge amounts of information much faster than regular computers for certain types of problems."}]}
             ]
-        ],
-        default=[]
+        ]
+    )
+    role: ContextMessageRole = Field(
+        description="the role of the input text",
+        default=ContextMessageRole.USER
     )
     text: str = Field(
-        description="The user prompt to use for the model",
+        description="the input text to use for the model",
         examples=[
-            "What is the capital of France?",
-            "What is the weather like today?",
-            "Can you help me write a poem about spring?",
-            "Explain quantum computing in simple terms"
-        ],
+            "write a haiku about artificial general intelligence"
+        ]
     )
+    temperature: float = Field(default=0.7, ge=0.0, le=1.0)
+    top_p: float = Field(default=0.95, ge=0.0, le=1.0)
+    context_size: int = Field(default=4096)
+
+class ImageCapabilityMixin(BaseModel):
+    """Mixin for models that support image inputs."""
     image: Optional[File] = Field(
-        description="The image to use for the model",
-        default=None
+        description="the image to use for the model",
+        default=None,
+        contentMediaType="image/*",
     )
-    # Optional parameters
-    temperature: float = Field(default=0.7)
-    top_p: float = Field(default=0.95)
-    max_tokens: int = Field(default=4096)
-    context_size: int = Field(default=4096)
 
-    # Model specific flags
-    reasoning: bool = Field(default=False)
-
-    tools: List[Dict[str, Any]] = Field(default=[])
+class MultipleImageCapabilityMixin(BaseModel):
+    """Mixin for models that support image inputs."""
+    images: Optional[List[File]] = Field(
+        description="the images to use for the model",
+        default=None,
+    )
+
+class ReasoningCapabilityMixin(BaseModel):
+    """Mixin for models that support reasoning."""
+    reasoning: bool = Field(
+        description="enable step-by-step reasoning",
+        default=False
+    )
+
+class ToolsCapabilityMixin(BaseModel):
+    """Mixin for models that support tool/function calling."""
+    tools: Optional[List[Dict[str, Any]]] = Field(
+        description="tool definitions for function calling",
+        default=None
+    )
+    tool_call_id: Optional[str] = Field(
+        description="the tool call id for tool role messages",
+        default=None
+    )
+
+# Example of how to use:
+class LLMInput(BaseLLMInput):
+    """Default LLM input model with no special capabilities."""
+    pass
+
+# For backward compatibility
+LLMInput.model_config["title"] = "LLMInput"
 
 class LLMUsage(BaseAppOutput):
     stop_reason: str = ""
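
Example (not part of the diff; the class name VisionLLMInput is hypothetical): apps compose the capability mixins above with BaseLLMInput, and pydantic merges the mixin fields into a single input schema.

    # Hypothetical composition of the mixins defined in this hunk.
    class VisionLLMInput(ImageCapabilityMixin, ReasoningCapabilityMixin, BaseLLMInput):
        """Input for an app that accepts one image and exposes a reasoning toggle."""
        pass
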
@@ -103,12 +131,38 @@ class LLMUsage(BaseAppOutput):
     reasoning_time: float = 0.0
 
 
-class LLMOutput(BaseAppOutput):
-    response: str
-    reasoning: Optional[str] = None
-    tool_calls: Optional[List[Dict[str, Any]]] = None
-    usage: Optional[LLMUsage] = None
+class BaseLLMOutput(BaseAppOutput):
+    """Base class for LLM outputs with common fields."""
+    response: str = Field(description="the generated text response")
 
+class LLMUsageMixin(BaseModel):
+    """Mixin for models that provide token usage statistics."""
+    usage: Optional[LLMUsage] = Field(
+        description="token usage statistics",
+        default=None
+    )
+
+class ReasoningMixin(BaseModel):
+    """Mixin for models that support reasoning."""
+    reasoning: Optional[str] = Field(
+        description="the reasoning output of the model",
+        default=None
+    )
+
+class ToolCallsMixin(BaseModel):
+    """Mixin for models that support tool calls."""
+    tool_calls: Optional[List[Dict[str, Any]]] = Field(
+        description="tool calls for function calling",
+        default=None
+    )
+
+# Example of how to use:
+class LLMOutput(LLMUsageMixin, BaseLLMOutput):
+    """Default LLM output model with token usage tracking."""
+    pass
+
+# For backward compatibility
+LLMOutput.model_config["title"] = "LLMOutput"
 
 @contextmanager
 def timing_context():
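
Example (not part of the diff; ToolLLMOutput is a hypothetical name): output models are assembled the same way, picking only the mixins a given model actually reports.

    # Hypothetical composition: response plus usage, reasoning, and tool calls.
    class ToolLLMOutput(ToolCallsMixin, ReasoningMixin, LLMUsageMixin, BaseLLMOutput):
        pass
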
@@ -116,7 +170,7 @@ def timing_context():
     class TimingInfo:
         def __init__(self):
             self.start_time = time.time()
-            self.first_token_time = 0
+            self.first_token_time = None
             self.reasoning_start_time = None
             self.total_reasoning_time = 0.0
             self.reasoning_tokens = 0
@@ -140,12 +194,17 @@ def timing_context():
 
         @property
         def stats(self):
-            end_time = time.time()
+            current_time = time.time()
             if self.first_token_time is None:
-                self.first_token_time = end_time
+                return {
+                    "time_to_first_token": 0.0,
+                    "generation_time": 0.0,
+                    "reasoning_time": self.total_reasoning_time,
+                    "reasoning_tokens": self.reasoning_tokens
+                }
 
             time_to_first = self.first_token_time - self.start_time
-            generation_time = end_time - self.first_token_time
+            generation_time = current_time - self.first_token_time
 
             return {
                 "time_to_first_token": time_to_first,
@@ -179,36 +238,184 @@ def build_messages(
         text = transform_user_message(msg.text) if transform_user_message and msg.role == ContextMessageRole.USER else msg.text
         if text:
             parts.append({"type": "text", "text": text})
+        else:
+            parts.append({"type": "text", "text": ""})
         if msg.image:
             if msg.image.path:
                 image_data_uri = image_to_base64_data_uri(msg.image.path)
                 parts.append({"type": "image_url", "image_url": {"url": image_data_uri}})
             elif msg.image.uri:
                 parts.append({"type": "image_url", "image_url": {"url": msg.image.uri}})
+        if msg.images:
+            for image in msg.images:
+                if image.path:
+                    image_data_uri = image_to_base64_data_uri(image.path)
+                    parts.append({"type": "image_url", "image_url": {"url": image_data_uri}})
+                elif image.uri:
+                    parts.append({"type": "image_url", "image_url": {"url": image.uri}})
         if allow_multipart:
             return parts
         if len(parts) == 1 and parts[0]["type"] == "text":
             return parts[0]["text"]
-        raise ValueError("Image content requires multipart support")
+        if len(parts) > 1:
+            if any(part["type"] == "image_url" for part in parts):
+                raise ValueError("Image content requires multipart support")
+            return parts
+        raise ValueError("Invalid message content")
 
-    multipart = any(m.image for m in input_data.context) or input_data.image is not None
     messages = [{"role": "system", "content": input_data.system_prompt}] if input_data.system_prompt is not None and input_data.system_prompt != "" else []
 
-    for msg in input_data.context:
-        messages.append({
-            "role": msg.role,
-            "content": render_message(msg, allow_multipart=multipart)
-        })
+    def merge_messages(messages: List[ContextMessage]) -> ContextMessage:
+        text = "\n\n".join(msg.text for msg in messages if msg.text)
+        images = []
+        # Collect single images
+        for msg in messages:
+            if msg.image:
+                images.append(msg.image)
+        # Collect multiple images (flatten the list)
+        for msg in messages:
+            if msg.images:
+                images.extend(msg.images)
+        # Set image to single File if there's exactly one, otherwise None
+        image = images[0] if len(images) == 1 else None
+        # Set images to the list if there are multiple, otherwise None
+        images_list = images if len(images) > 1 else None
+        return ContextMessage(role=messages[0].role, text=text, image=image, images=images_list)
+
+    def merge_tool_calls(messages: List[ContextMessage]) -> List[Dict[str, Any]]:
+        tool_calls = []
+        for msg in messages:
+            if msg.tool_calls:
+                tool_calls.extend(msg.tool_calls)
+        return tool_calls
+
+    user_input_text = ""
+    if hasattr(input_data, "text"):
+        user_input_text = transform_user_message(input_data.text) if transform_user_message else input_data.text
+
+    user_input_image = None
+    multipart = any(m.image for m in input_data.context)
+    if hasattr(input_data, "image"):
+        user_input_image = input_data.image
+        multipart = multipart or input_data.image is not None
+
+    user_input_images = None
+    if hasattr(input_data, "images"):
+        user_input_images = input_data.images
+        multipart = multipart or input_data.images is not None
 
-    user_msg = ContextMessage(role=ContextMessageRole.USER, text=input_data.text, image=input_data.image)
-    messages.append({
-        "role": "user",
-        "content": render_message(user_msg, allow_multipart=multipart)
-    })
+    input_role = input_data.role if hasattr(input_data, "role") else ContextMessageRole.USER
+    input_tool_call_id = input_data.tool_call_id if hasattr(input_data, "tool_call_id") else None
+    user_msg = ContextMessage(role=input_role, text=user_input_text, image=user_input_image, images=user_input_images, tool_call_id=input_tool_call_id)
+
+    input_data.context.append(user_msg)
+
+    current_role = None
+    current_messages = []
+
+    for msg in input_data.context:
+        if msg.role == current_role or current_role is None:
+            current_messages.append(msg)
+            current_role = msg.role
+        else:
+            # Convert role enum to string for OpenAI API compatibility
+            role_str = current_role.value if hasattr(current_role, "value") else current_role
+            msg_dict = {
+                "role": role_str,
+                "content": render_message(merge_messages(current_messages), allow_multipart=multipart),
+            }
+
+            # Only add tool_calls if not empty
+            tool_calls = merge_tool_calls(current_messages)
+            if tool_calls:
+                # Ensure arguments are JSON strings (OpenAI API requirement)
+                for tc in tool_calls:
+                    if "function" in tc and "arguments" in tc["function"]:
+                        if isinstance(tc["function"]["arguments"], dict):
+                            tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
+                msg_dict["tool_calls"] = tool_calls
+
+            # Add tool_call_id for tool role messages (required by OpenAI API)
+            if role_str == "tool":
+                if current_messages and current_messages[0].tool_call_id:
+                    msg_dict["tool_call_id"] = current_messages[0].tool_call_id
+                else:
+                    # If not provided, use empty string to satisfy schema
+                    msg_dict["tool_call_id"] = ""
+
+            messages.append(msg_dict)
+            current_messages = [msg]
+            current_role = msg.role
+
+    if len(current_messages) > 0:
+        # Convert role enum to string for OpenAI API compatibility
+        role_str = current_role.value if hasattr(current_role, "value") else current_role
+        msg_dict = {
+            "role": role_str,
+            "content": render_message(merge_messages(current_messages), allow_multipart=multipart),
+        }
+
+        # Only add tool_calls if not empty
+        tool_calls = merge_tool_calls(current_messages)
+        if tool_calls:
+            # Ensure arguments are JSON strings (OpenAI API requirement)
+            for tc in tool_calls:
+                if "function" in tc and "arguments" in tc["function"]:
+                    if isinstance(tc["function"]["arguments"], dict):
+                        tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
+            msg_dict["tool_calls"] = tool_calls
+
+        # Add tool_call_id for tool role messages (required by OpenAI API)
+        if role_str == "tool":
+            if current_messages and current_messages[0].tool_call_id:
+                msg_dict["tool_call_id"] = current_messages[0].tool_call_id
+            else:
+                # If not provided, use empty string to satisfy schema
+                msg_dict["tool_call_id"] = ""
+
+        messages.append(msg_dict)
 
     return messages
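
Worked example (not part of the diff; the texts are made up): because consecutive messages with the same role are merged, a context ending with the user message "What is the capital of France?" plus the new input text "and of Germany?" yields a single user entry, roughly:

    [
        {"role": "system", "content": "you are a helpful assistant that can answer questions and help with tasks."},
        {"role": "user", "content": "What is the capital of France?\n\nand of Germany?"},
    ]

(assuming no images anywhere in the context, so render_message returns plain strings).
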
 
 
+def build_tools(tools: Optional[List[Dict[str, Any]]]) -> Optional[List[Dict[str, Any]]]:
+    """Build tools in OpenAI API format.
+
+    Ensures tools are properly formatted:
+    - Wrapped in {"type": "function", "function": {...}}
+    - Parameters is never None (OpenAI API requirement)
+    """
+    if not tools:
+        return None
+
+    result = []
+    for tool in tools:
+        # Extract function definition
+        if "type" in tool and "function" in tool:
+            func_def = tool["function"].copy()
+        else:
+            func_def = tool.copy()
+
+        # Ensure parameters is not None (OpenAI API requirement)
+        if func_def.get("parameters") is None:
+            func_def["parameters"] = {"type": "object", "properties": {}}
+        # Also ensure properties within parameters is not None
+        elif func_def["parameters"].get("properties") is None:
+            func_def["parameters"]["properties"] = {}
+        else:
+            # Remove properties with null values (OpenAI API doesn't accept them)
+            properties = func_def["parameters"].get("properties", {})
+            if properties:
+                func_def["parameters"]["properties"] = {
+                    k: v for k, v in properties.items() if v is not None
+                }
+
+        # Wrap in OpenAI format
+        result.append({"type": "function", "function": func_def})
+
+    return result
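
Worked example (not part of the diff; the tool definition is made up): a bare function definition with parameters=None is wrapped and given an empty parameters object.

    build_tools([{"name": "get_weather", "description": "Look up the weather", "parameters": None}])
    # -> [{"type": "function",
    #      "function": {"name": "get_weather", "description": "Look up the weather",
    #                   "parameters": {"type": "object", "properties": {}}}}]
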
+
+
 class StreamResponse:
     """Holds a single chunk of streamed response."""
     def __init__(self):
@@ -216,7 +423,7 @@ class StreamResponse:
         self.tool_calls = None # Changed from [] to None
         self.finish_reason = None
         self.timing_stats = {
-            "time_to_first_token": 0.0,
+            "time_to_first_token": None, # Changed from 0.0 to None
            "generation_time": 0.0,
            "reasoning_time": 0.0,
            "reasoning_tokens": 0,
@@ -232,8 +439,15 @@ class StreamResponse:
     def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
         """Update response state from a chunk."""
         # Update usage stats if present
-        if "usage" in chunk and chunk["usage"] is not None:
-            self.usage_stats.update(chunk["usage"])
+        if "usage" in chunk:
+            usage = chunk["usage"]
+            if usage is not None:
+                # Update usage stats preserving existing values if not provided
+                self.usage_stats.update({
+                    "prompt_tokens": usage.get("prompt_tokens", self.usage_stats["prompt_tokens"]),
+                    "completion_tokens": usage.get("completion_tokens", self.usage_stats["completion_tokens"]),
+                    "total_tokens": usage.get("total_tokens", self.usage_stats["total_tokens"])
+                })
 
         # Get the delta from the chunk
         delta = chunk.get("choices", [{}])[0]
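
For reference (not part of the diff; the values are illustrative assumptions): the chunks consumed here are expected to follow the OpenAI-style streaming shape, e.g.

    chunk = {
        "choices": [{"delta": {"content": "Paris"}, "finish_reason": None}],
        "usage": {"prompt_tokens": 12, "completion_tokens": 1, "total_tokens": 13},
    }
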
@@ -245,23 +459,34 @@ class StreamResponse:
             if message.get("tool_calls"):
                 self._update_tool_calls(message["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
         elif "delta" in delta:
             delta_content = delta["delta"]
             self.content = delta_content.get("content", "")
             if delta_content.get("tool_calls"):
                 self._update_tool_calls(delta_content["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
 
-        # Update timing stats while preserving tokens_per_second
+        # Update timing stats
         timing_stats = timing.stats
-        generation_time = timing_stats["generation_time"]
-        completion_tokens = self.usage_stats.get("completion_tokens", 0)
-        tokens_per_second = (completion_tokens / generation_time) if generation_time > 0 and completion_tokens > 0 else 0.0
+        if self.timing_stats["time_to_first_token"] is None:
+            self.timing_stats["time_to_first_token"] = timing_stats["time_to_first_token"]
 
         self.timing_stats.update({
-            **timing_stats,
-            "tokens_per_second": tokens_per_second
+            "generation_time": timing_stats["generation_time"],
+            "reasoning_time": timing_stats["reasoning_time"],
+            "reasoning_tokens": timing_stats["reasoning_tokens"]
         })
+
+        # Calculate tokens per second only if we have valid completion tokens and generation time
+        if self.usage_stats["completion_tokens"] > 0 and timing_stats["generation_time"] > 0:
+            self.timing_stats["tokens_per_second"] = (
+                self.usage_stats["completion_tokens"] / timing_stats["generation_time"]
+            )
+
 
     def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
         """Update tool calls, handling both full and partial updates."""
@@ -292,22 +517,22 @@ class StreamResponse:
                 current_tool["function"]["arguments"] += func_delta["arguments"]
 
     def has_updates(self) -> bool:
-        """Check if this response has any content or tool call updates."""
-        return bool(self.content) or bool(self.tool_calls)
-
-    def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
-        """Convert current state to LLMOutput."""
-        buffer, output, _ = transformer(self.content, buffer)
+        """Check if this response has any content, tool call, or usage updates."""
+        has_content = bool(self.content)
+        has_tool_calls = bool(self.tool_calls)
+        has_usage = self.usage_stats["prompt_tokens"] > 0 or self.usage_stats["completion_tokens"] > 0
+        has_finish = bool(self.finish_reason)
 
-        # Add tool calls if present
-        if self.tool_calls:
-            output.tool_calls = self.tool_calls
-
-        # Add usage stats if this is final
-        if self.finish_reason:
-            output.usage = LLMUsage(
+        return has_content or has_tool_calls or has_usage or has_finish
+
+    def to_output(self, buffer: str, transformer: Any) -> tuple[BaseLLMOutput, str]:
+        """Convert current state to LLMOutput."""
+        # Create usage object if we have stats
+        usage = None
+        if any(self.usage_stats.values()):
+            usage = LLMUsage(
                 stop_reason=self.usage_stats["stop_reason"],
-                time_to_first_token=self.timing_stats["time_to_first_token"],
+                time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
                 tokens_per_second=self.timing_stats["tokens_per_second"],
                 prompt_tokens=self.usage_stats["prompt_tokens"],
                 completion_tokens=self.usage_stats["completion_tokens"],
@@ -315,6 +540,12 @@ class StreamResponse:
                 reasoning_time=self.timing_stats["reasoning_time"],
                 reasoning_tokens=self.timing_stats["reasoning_tokens"]
             )
+
+        buffer, output, _ = transformer(self.content, buffer, usage)
+
+        # Add tool calls if present and supported
+        if self.tool_calls and hasattr(output, 'tool_calls'):
+            output.tool_calls = self.tool_calls
 
         return output, buffer
 
@@ -327,6 +558,7 @@ class ResponseState:
         self.function_calls = None # For future function calling support
         self.tool_calls = None # List to accumulate tool calls
         self.current_tool_call = None # Track current tool call being built
+        self.usage = None # Add usage field
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
@@ -338,7 +570,7 @@
 
 class ResponseTransformer:
     """Base class for transforming model responses."""
-    def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
+    def __init__(self, output_cls: type[BaseLLMOutput] = LLMOutput):
         self.state = ResponseState()
         self.output_cls = output_cls
         self.timing = None # Will be set by stream_generate
@@ -381,26 +613,27 @@ class ResponseTransformer:
             text: Cleaned text to process for reasoning
         """
         # Default implementation for <think> style reasoning
-        if "<think>" in text and not self.state.state_changes["reasoning_started"]:
+        # Check for tags in the complete buffer
+        if "<think>" in self.state.buffer and not self.state.state_changes["reasoning_started"]:
            self.state.state_changes["reasoning_started"] = True
            if self.timing:
                self.timing.start_reasoning()
 
-        if "</think>" in text and not self.state.state_changes["reasoning_ended"]:
-            self.state.state_changes["reasoning_ended"] = True
-            if self.timing:
-                # Estimate token count from character count (rough approximation)
-                token_count = len(self.state.buffer.split("<think>")[1].split("</think>")[0]) // 4
-                self.timing.end_reasoning(token_count)
-
-        if "<think>" in self.state.buffer:
-            parts = self.state.buffer.split("</think>", 1)
-            if len(parts) > 1:
-                self.state.reasoning = parts[0].split("<think>", 1)[1].strip()
-                self.state.response = parts[1].strip()
-            else:
-                self.state.reasoning = self.state.buffer.split("<think>", 1)[1].strip()
-                self.state.response = ""
+        # Extract content and handle end of reasoning
+        parts = self.state.buffer.split("<think>", 1)
+        if len(parts) > 1:
+            reasoning_text = parts[1]
+            end_parts = reasoning_text.split("</think>", 1)
+            self.state.reasoning = end_parts[0].strip()
+            self.state.response = end_parts[1].strip() if len(end_parts) > 1 else ""
+
+            # Check for end tag in complete buffer
+            if "</think>" in self.state.buffer and not self.state.state_changes["reasoning_ended"]:
+                self.state.state_changes["reasoning_ended"] = True
+                if self.timing:
+                    # Estimate token count from character count (rough approximation)
+                    token_count = len(self.state.reasoning) // 4
+                    self.timing.end_reasoning(token_count)
         else:
            self.state.response = self.state.buffer
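
Worked example (not part of the diff; the buffer text is made up) of the <think> handling above:

    buffer = "<think>plan the haiku</think>Silicon minds wake"
    reasoning = buffer.split("<think>", 1)[1].split("</think>", 1)[0].strip()  # "plan the haiku"
    response = buffer.split("<think>", 1)[1].split("</think>", 1)[1].strip()   # "Silicon minds wake"
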
 
@@ -449,28 +682,43 @@ class ResponseTransformer:
         Returns:
             Tuple of (buffer, LLMOutput, state_changes)
         """
+        # Build base output with required fields
+        output_data = {
+            "response": self.state.response.strip(),
+        }
+
+        # Add optional fields if they exist
+        if self.state.usage is not None:
+            output_data["usage"] = self.state.usage
+        if self.state.reasoning:
+            output_data["reasoning"] = self.state.reasoning.strip()
+        if self.state.function_calls:
+            output_data["function_calls"] = self.state.function_calls
+        if self.state.tool_calls:
+            output_data["tool_calls"] = self.state.tool_calls
+
+        output = self.output_cls(**output_data)
+
         return (
             self.state.buffer,
-            self.output_cls(
-                response=self.state.response.strip(),
-                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
-                function_calls=self.state.function_calls,
-                tool_calls=self.state.tool_calls
-            ),
+            output,
             self.state.state_changes
         )
 
-    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+    def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None) -> tuple[str, LLMOutput, dict]:
         """Transform a piece of text and return the result.
 
         Args:
             piece: New piece of text to transform
             buffer: Existing buffer content
+            usage: Optional usage statistics
 
         Returns:
             Tuple of (new_buffer, output, state_changes)
         """
         self.state.buffer = buffer
+        if usage is not None:
+            self.state.usage = usage
         self.transform_chunk(piece)
         return self.build_output()
 
@@ -483,42 +731,131 @@
     tool_choice: Optional[Dict[str, Any]] = None,
     temperature: float = 0.7,
     top_p: float = 0.95,
-    max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
     verbose: bool = False,
-) -> Generator[LLMOutput, None, None]:
+    output_cls: type[BaseLLMOutput] = LLMOutput,
+    kwargs: Optional[Dict[str, Any]] = None,
+) -> Generator[BaseLLMOutput, None, None]:
    """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+
+    # Create queues for communication between threads
+    response_queue = Queue()
+    error_queue = Queue()
+    keep_alive_queue = Queue()
+
+    # Set the output class for the transformer
+    transformer.output_cls = output_cls
+
+    def _generate_worker():
+        """Worker thread to run the model generation."""
+        try:
+            # Build completion kwargs
+            completion_kwargs = {
+                "messages": messages,
+                "stream": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "stop": stop,
+            }
+            if kwargs:
+                completion_kwargs.update(kwargs)
+            if tools is not None:
+                completion_kwargs["tools"] = tools
+            if tool_choice is not None:
+                completion_kwargs["tool_choice"] = tool_choice
+
+            # Signal that we're starting
+            keep_alive_queue.put(("init", time.time()))
+
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                response_queue.put(("chunk", chunk))
+                # Update keep-alive timestamp
+                keep_alive_queue.put(("alive", time.time()))
+
+            # Signal completion
+            response_queue.put(("done", None))
+
+        except Exception as e:
+            # Preserve the full exception with traceback
+            import sys
+            error_queue.put((e, sys.exc_info()[2]))
+            response_queue.put(("error", str(e)))
+
    with timing_context() as timing:
        transformer.timing = timing
 
-        # Build completion kwargs
-        completion_kwargs = {
-            "messages": messages,
-            "stream": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "max_tokens": max_tokens,
-            "stop": stop
-        }
-        if tools is not None:
-            completion_kwargs["tools"] = tools
-        if tool_choice is not None:
-            completion_kwargs["tool_choice"] = tool_choice
+        # Start generation thread
+        generation_thread = Thread(target=_generate_worker, daemon=True)
+        generation_thread.start()
 
        # Initialize response state
        response = StreamResponse()
        buffer = ""
 
+        # Keep-alive tracking
+        last_activity = time.time()
+        init_timeout = 30.0 # 30 seconds for initial response
+        chunk_timeout = 10.0 # 10 seconds between chunks
+        chunks_begun = False
+
        try:
-            completion = model.create_chat_completion(**completion_kwargs)
+            # Wait for initial setup
+            try:
+                msg_type, timestamp = keep_alive_queue.get(timeout=init_timeout)
+                if msg_type != "init":
+                    raise RuntimeError("Unexpected initialization message")
+                last_activity = timestamp
+            except Empty:
+                raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")
 
-            for chunk in completion:
+            while True:
+                # Check for errors - now with proper exception chaining
+                if not error_queue.empty():
+                    exc, tb = error_queue.get()
+                    if isinstance(exc, Exception):
+                        raise exc.with_traceback(tb)
+                    else:
+                        raise RuntimeError(f"Unknown error in worker thread: {exc}")
+
+                # Check keep-alive
+                try:
+                    while not keep_alive_queue.empty():
+                        _, timestamp = keep_alive_queue.get_nowait()
+                        last_activity = timestamp
+                except Empty:
+                    # Ignore empty queue - this is expected
+                    pass
+
+                # Check for timeout
+                if chunks_begun and time.time() - last_activity > chunk_timeout:
+                    raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
+
+                # Get next chunk
+                try:
+                    msg_type, data = response_queue.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if msg_type == "error":
+                    # If we get an error message but no exception in error_queue,
+                    # create a new error
+                    raise RuntimeError(f"Generation error: {data}")
+                elif msg_type == "done":
+                    break
+
+                chunk = data
+
                if verbose:
                    print(chunk)
-                # Mark first token time as soon as we get any response
+
+                # Mark first token time
                if not timing.first_token_time:
                    timing.mark_first_token()
 
+                chunks_begun = True
+
                # Update response state from chunk
                response.update_from_chunk(chunk, timing)
 
@@ -530,7 +867,19 @@ def stream_generate(
                # Break if we're done
                if response.finish_reason:
                    break
+
+            # Wait for generation thread to finish
+            if generation_thread.is_alive():
+                generation_thread.join(timeout=5.0) # Increased timeout to 5 seconds
+                if generation_thread.is_alive():
+                    # Thread didn't finish - this shouldn't happen normally
+                    raise RuntimeError("Generation thread failed to finish")
 
        except Exception as e:
-            # Ensure any error is properly propagated
-            raise e
+            # Check if there's a thread error we should chain with
+            if not error_queue.empty():
+                thread_exc, thread_tb = error_queue.get()
+                if isinstance(thread_exc, Exception):
+                    raise e from thread_exc
+            # If no thread error, raise the original exception
+            raise
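
Consumption sketch (not part of the diff; `stream` stands for the generator returned by stream_generate, whose leading parameters are not shown in this hunk):

    for output in stream:                       # each item is an output_cls instance
        print(output.response, end="", flush=True)
        if output.usage is not None:            # LLMOutput carries usage via LLMUsageMixin
            print(f"\n[{output.usage.tokens_per_second:.1f} tok/s]")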