inferencesh 0.2.27__tar.gz → 0.2.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of inferencesh might be problematic.

Files changed (21)
  1. {inferencesh-0.2.27/src/inferencesh.egg-info → inferencesh-0.2.29}/PKG-INFO +1 -1
  2. {inferencesh-0.2.27 → inferencesh-0.2.29}/pyproject.toml +1 -1
  3. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/llm.py +158 -48
  4. {inferencesh-0.2.27 → inferencesh-0.2.29/src/inferencesh.egg-info}/PKG-INFO +1 -1
  5. {inferencesh-0.2.27 → inferencesh-0.2.29}/LICENSE +0 -0
  6. {inferencesh-0.2.27 → inferencesh-0.2.29}/README.md +0 -0
  7. {inferencesh-0.2.27 → inferencesh-0.2.29}/setup.cfg +0 -0
  8. {inferencesh-0.2.27 → inferencesh-0.2.29}/setup.py +0 -0
  9. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/__init__.py +0 -0
  10. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/__init__.py +0 -0
  11. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/base.py +0 -0
  12. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/file.py +0 -0
  13. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/utils/__init__.py +0 -0
  14. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/utils/download.py +0 -0
  15. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/utils/storage.py +0 -0
  16. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/SOURCES.txt +0 -0
  17. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/dependency_links.txt +0 -0
  18. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/entry_points.txt +0 -0
  19. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/requires.txt +0 -0
  20. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/top_level.txt +0 -0
  21. {inferencesh-0.2.27 → inferencesh-0.2.29}/tests/test_sdk.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.27
+Version: 0.2.29
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "inferencesh"
-version = "0.2.27"
+version = "0.2.29"
 description = "inference.sh Python SDK"
 authors = [
     {name = "Inference Shell Inc.", email = "hello@inference.sh"},
@@ -1,7 +1,7 @@
 from typing import Optional, List, Any, Callable, Dict, Generator
 from enum import Enum
 from pydantic import Field, BaseModel
-from queue import Queue
+from queue import Queue, Empty
 from threading import Thread
 import time
 from contextlib import contextmanager
@@ -9,7 +9,6 @@ import base64
 
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
-from .types import ContextMessage
 
 class ContextMessageRole(str, Enum):
     USER = "user"
@@ -113,13 +112,27 @@ class LLMUsage(BaseAppOutput):
 
 class BaseLLMOutput(BaseAppOutput):
     """Base class for LLM outputs with common fields."""
-    text: str = Field(description="The generated text response")
-    done: bool = Field(default=False, description="Whether this is the final chunk")
+    response: str = Field(description="The generated text response")
 
 class LLMUsageMixin(BaseModel):
     """Mixin for models that provide token usage statistics."""
     usage: Optional[LLMUsage] = Field(
-        description="Token usage statistics"
+        description="Token usage statistics",
+        default=None
+    )
+
+class ReasoningMixin(BaseModel):
+    """Mixin for models that support reasoning."""
+    reasoning: Optional[str] = Field(
+        description="The reasoning output of the model",
+        default=None
+    )
+
+class ToolCallsMixin(BaseModel):
+    """Mixin for models that support tool calls."""
+    tool_calls: Optional[List[Dict[str, Any]]] = Field(
+        description="Tool calls for function calling",
+        default=None
     )
 
 # Example of how to use:
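A minimal sketch of how the new mixins compose with BaseLLMOutput; the class name ExampleLLMOutput and the sample values are illustrative, not part of the package:

    class ExampleLLMOutput(BaseLLMOutput, LLMUsageMixin, ReasoningMixin, ToolCallsMixin):
        """Response text plus optional usage, reasoning and tool-call fields."""

    chunk = ExampleLLMOutput(response="Hello!")
    # Optional fields contributed by the mixins default to None until populated.
    assert chunk.usage is None and chunk.reasoning is None and chunk.tool_calls is None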
@@ -343,26 +356,28 @@ class StreamResponse:
 
         return has_content or has_tool_calls or has_usage or has_finish
 
-    def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
+    def to_output(self, buffer: str, transformer: Any) -> tuple[BaseLLMOutput, str]:
         """Convert current state to LLMOutput."""
-        buffer, output, _ = transformer(self.content, buffer)
+        # Create usage object if we have stats
+        usage = None
+        if any(self.usage_stats.values()):
+            usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
 
-        # Add tool calls if present
-        if self.tool_calls:
+        buffer, output, _ = transformer(self.content, buffer, usage)
+
+        # Add tool calls if present and supported
+        if self.tool_calls and hasattr(output, 'tool_calls'):
             output.tool_calls = self.tool_calls
 
-        # Add usage stats
-        output.usage = LLMUsage(
-            stop_reason=self.usage_stats["stop_reason"],
-            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
-            tokens_per_second=self.timing_stats["tokens_per_second"],
-            prompt_tokens=self.usage_stats["prompt_tokens"],
-            completion_tokens=self.usage_stats["completion_tokens"],
-            total_tokens=self.usage_stats["total_tokens"],
-            reasoning_time=self.timing_stats["reasoning_time"],
-            reasoning_tokens=self.timing_stats["reasoning_tokens"]
-        )
-
         return output, buffer
 
 class ResponseState:
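The hasattr guard makes tool-call attachment duck-typed: an output class that does not opt in to ToolCallsMixin simply skips the assignment instead of failing. A small illustration (the class names below are illustrative, not part of the SDK):

    class TextOnlyOutput(BaseLLMOutput):
        """No tool_calls field, so to_output will not try to attach tool calls."""

    class ToolAwareOutput(BaseLLMOutput, ToolCallsMixin):
        """Declares tool_calls, so to_output can attach them."""

    hasattr(TextOnlyOutput(response="hi"), "tool_calls")   # False -> assignment skipped
    hasattr(ToolAwareOutput(response="hi"), "tool_calls")  # True  -> tool calls attached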
@@ -374,6 +389,7 @@ class ResponseState:
         self.function_calls = None  # For future function calling support
         self.tool_calls = None  # List to accumulate tool calls
         self.current_tool_call = None  # Track current tool call being built
+        self.usage = None  # Add usage field
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
@@ -385,7 +401,7 @@ class ResponseState:
 
 class ResponseTransformer:
     """Base class for transforming model responses."""
-    def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
+    def __init__(self, output_cls: type[BaseLLMOutput] = LLMOutput):
         self.state = ResponseState()
         self.output_cls = output_cls
         self.timing = None  # Will be set by stream_generate
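With the widened type hint, any BaseLLMOutput subclass can be plugged in as the transformer's output class. A brief sketch (StructuredOutput is an illustrative name, not part of the SDK):

    class StructuredOutput(BaseLLMOutput, LLMUsageMixin, ReasoningMixin):
        """Output schema with optional usage and reasoning."""

    transformer = ResponseTransformer(output_cls=StructuredOutput)
    # Note: stream_generate (below) overwrites transformer.output_cls with its own
    # output_cls argument, so the two should agree.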
@@ -496,28 +512,43 @@ class ResponseTransformer:
         Returns:
             Tuple of (buffer, LLMOutput, state_changes)
         """
+        # Build base output with required fields
+        output_data = {
+            "response": self.state.response.strip(),
+        }
+
+        # Add optional fields if they exist
+        if self.state.usage is not None:
+            output_data["usage"] = self.state.usage
+        if self.state.reasoning:
+            output_data["reasoning"] = self.state.reasoning.strip()
+        if self.state.function_calls:
+            output_data["function_calls"] = self.state.function_calls
+        if self.state.tool_calls:
+            output_data["tool_calls"] = self.state.tool_calls
+
+        output = self.output_cls(**output_data)
+
         return (
             self.state.buffer,
-            self.output_cls(
-                response=self.state.response.strip(),
-                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
-                function_calls=self.state.function_calls,
-                tool_calls=self.state.tool_calls
-            ),
+            output,
             self.state.state_changes
         )
 
-    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+    def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None) -> tuple[str, LLMOutput, dict]:
         """Transform a piece of text and return the result.
 
         Args:
             piece: New piece of text to transform
             buffer: Existing buffer content
+            usage: Optional usage statistics
 
         Returns:
             Tuple of (new_buffer, output, state_changes)
         """
         self.state.buffer = buffer
+        if usage is not None:
+            self.state.usage = usage
         self.transform_chunk(piece)
         return self.build_output()
 
@@ -533,36 +564,108 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
     verbose: bool = False,
-) -> Generator[LLMOutput, None, None]:
+    output_cls: type[BaseLLMOutput] = LLMOutput,
+) -> Generator[BaseLLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+
+    # Create queues for communication between threads
+    response_queue = Queue()
+    error_queue = Queue()
+    keep_alive_queue = Queue()
+
+    # Set the output class for the transformer
+    transformer.output_cls = output_cls
+
+    def _generate_worker():
+        """Worker thread to run the model generation."""
+        try:
+            # Build completion kwargs
+            completion_kwargs = {
+                "messages": messages,
+                "stream": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "max_tokens": max_tokens,
+                "stop": stop
+            }
+            if tools is not None:
+                completion_kwargs["tools"] = tools
+            if tool_choice is not None:
+                completion_kwargs["tool_choice"] = tool_choice
+
+            # Signal that we're starting
+            keep_alive_queue.put(("init", time.time()))
+
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                response_queue.put(("chunk", chunk))
+                # Update keep-alive timestamp
+                keep_alive_queue.put(("alive", time.time()))
+
+            # Signal completion
+            response_queue.put(("done", None))
+
+        except Exception as e:
+            error_queue.put(e)
+            response_queue.put(("error", str(e)))
+
     with timing_context() as timing:
         transformer.timing = timing
 
-        # Build completion kwargs
-        completion_kwargs = {
-            "messages": messages,
-            "stream": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "max_tokens": max_tokens,
-            "stop": stop
-        }
-        if tools is not None:
-            completion_kwargs["tools"] = tools
-        if tool_choice is not None:
-            completion_kwargs["tool_choice"] = tool_choice
+        # Start generation thread
+        generation_thread = Thread(target=_generate_worker, daemon=True)
+        generation_thread.start()
 
         # Initialize response state
         response = StreamResponse()
         buffer = ""
 
+        # Keep-alive tracking
+        last_activity = time.time()
+        init_timeout = 30.0  # 30 seconds for initial response
+        chunk_timeout = 10.0  # 10 seconds between chunks
+
         try:
-            completion = model.create_chat_completion(**completion_kwargs)
+            # Wait for initial setup
+            try:
+                msg_type, timestamp = keep_alive_queue.get(timeout=init_timeout)
+                if msg_type != "init":
+                    raise RuntimeError("Unexpected initialization message")
+                last_activity = timestamp
+            except Empty:
+                raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")
 
-            for chunk in completion:
-                if verbose:
-                    print(chunk)
-                # Mark first token time as soon as we get any response
+            while True:
+                # Check for errors
+                if not error_queue.empty():
+                    raise error_queue.get()
+
+                # Check keep-alive
+                while not keep_alive_queue.empty():
+                    _, timestamp = keep_alive_queue.get_nowait()
+                    last_activity = timestamp
+
+                # Check for timeout
+                if time.time() - last_activity > chunk_timeout:
+                    raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
+
+                # Get next chunk
+                try:
+                    msg_type, data = response_queue.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if msg_type == "error":
+                    raise RuntimeError(f"Generation error: {data}")
+                elif msg_type == "done":
+                    break
+
+                chunk = data
+
+                # Mark first token time
                 if not timing.first_token_time:
                     timing.mark_first_token()
 
@@ -577,6 +680,13 @@ def stream_generate(
                 # Break if we're done
                 if response.finish_reason:
                     break
+
+            # Wait for generation thread to finish
+            generation_thread.join(timeout=5.0)  # Increased timeout to 5 seconds
+            if generation_thread.is_alive():
+                # Thread didn't finish - this shouldn't happen normally
+                # but we handle it gracefully
+                raise RuntimeError("Generation thread failed to finish")
 
         except Exception as e:
             # Ensure any error is properly propagated
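For orientation, a rough sketch of how the reworked generator might be consumed. The llama_cpp model loading and the message format are assumptions based on the create_chat_completion call above; output_cls, the transformer, and the keep-alive timeouts come from this diff:

    from llama_cpp import Llama  # assumed backend; any object exposing create_chat_completion should work

    class ChatOutput(BaseLLMOutput, LLMUsageMixin, ReasoningMixin, ToolCallsMixin):
        """Per-call output schema (illustrative name)."""

    model = Llama(model_path="model.gguf")
    messages = [{"role": "user", "content": "Say hello."}]

    last = None
    for chunk in stream_generate(
        model=model,
        messages=messages,
        transformer=ResponseTransformer(),
        output_cls=ChatOutput,  # new in 0.2.29: choose the output schema per call
    ):
        last = chunk  # each chunk is built from the accumulated state (see build_output above)

    print(last.response)
    if last.usage is not None:
        print(last.usage.total_tokens, "tokens")

    # Generation now runs in a daemon worker thread; a RuntimeError is raised if the
    # model does not start within 30 s or stalls for more than 10 s between chunks.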
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.27
+Version: 0.2.29
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>