inferencesh 0.2.21.tar.gz → 0.2.23.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of inferencesh might be problematic.
Files changed (21)
  1. {inferencesh-0.2.21/src/inferencesh.egg-info → inferencesh-0.2.23}/PKG-INFO +1 -1
  2. {inferencesh-0.2.21 → inferencesh-0.2.23}/pyproject.toml +1 -1
  3. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/models/llm.py +157 -145
  4. {inferencesh-0.2.21 → inferencesh-0.2.23/src/inferencesh.egg-info}/PKG-INFO +1 -1
  5. {inferencesh-0.2.21 → inferencesh-0.2.23}/LICENSE +0 -0
  6. {inferencesh-0.2.21 → inferencesh-0.2.23}/README.md +0 -0
  7. {inferencesh-0.2.21 → inferencesh-0.2.23}/setup.cfg +0 -0
  8. {inferencesh-0.2.21 → inferencesh-0.2.23}/setup.py +0 -0
  9. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/__init__.py +0 -0
  10. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/models/__init__.py +0 -0
  11. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/models/base.py +0 -0
  12. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/models/file.py +0 -0
  13. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/utils/__init__.py +0 -0
  14. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/utils/download.py +0 -0
  15. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/utils/storage.py +0 -0
  16. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/SOURCES.txt +0 -0
  17. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/dependency_links.txt +0 -0
  18. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/entry_points.txt +0 -0
  19. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/requires.txt +0 -0
  20. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/top_level.txt +0 -0
  21. {inferencesh-0.2.21 → inferencesh-0.2.23}/tests/test_sdk.py +0 -0
src/inferencesh.egg-info/PKG-INFO → PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.21
+Version: 0.2.23
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "inferencesh"
-version = "0.2.21"
+version = "0.2.23"
 description = "inference.sh Python SDK"
 authors = [
     {name = "Inference Shell Inc.", email = "hello@inference.sh"},
src/inferencesh/models/llm.py
@@ -116,7 +116,7 @@ def timing_context():
 class TimingInfo:
     def __init__(self):
         self.start_time = time.time()
-        self.first_token_time = None
+        self.first_token_time = 0
         self.reasoning_start_time = None
         self.total_reasoning_time = 0.0
         self.reasoning_tokens = 0
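Note on the sentinel change above: both the old value (None) and the new one (0) are falsy, so the guard `if not timing.first_token_time:` used later in this diff behaves identically until mark_first_token() records a real timestamp. A minimal check:

    # Both sentinels are falsy, so the first-token guard fires for either
    # until mark_first_token() stores a nonzero time.time() timestamp.
    for sentinel in (None, 0):
        assert not sentinel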
@@ -209,6 +209,115 @@ def build_messages(
     return messages
 
 
+class StreamResponse:
+    """Holds a single chunk of streamed response."""
+    def __init__(self):
+        self.content = ""
+        self.tool_calls = None  # Changed from [] to None
+        self.finish_reason = None
+        self.timing_stats = {
+            "time_to_first_token": 0.0,
+            "generation_time": 0.0,
+            "reasoning_time": 0.0,
+            "reasoning_tokens": 0,
+            "tokens_per_second": 0.0
+        }
+        self.usage_stats = {
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "total_tokens": 0,
+            "stop_reason": ""
+        }
+
+    def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
+        """Update response state from a chunk."""
+        # Update usage stats if present
+        if "usage" in chunk and chunk["usage"] is not None:
+            self.usage_stats.update(chunk["usage"])
+
+        # Get the delta from the chunk
+        delta = chunk.get("choices", [{}])[0]
+
+        # Extract content and tool calls from either message or delta
+        if "message" in delta:
+            message = delta["message"]
+            self.content = message.get("content", "")
+            if message.get("tool_calls"):
+                self._update_tool_calls(message["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+        elif "delta" in delta:
+            delta_content = delta["delta"]
+            self.content = delta_content.get("content", "")
+            if delta_content.get("tool_calls"):
+                self._update_tool_calls(delta_content["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+
+        # Update timing stats while preserving tokens_per_second
+        timing_stats = timing.stats
+        generation_time = timing_stats["generation_time"]
+        completion_tokens = self.usage_stats.get("completion_tokens", 0)
+        tokens_per_second = (completion_tokens / generation_time) if generation_time > 0 and completion_tokens > 0 else 0.0
+
+        self.timing_stats.update({
+            **timing_stats,
+            "tokens_per_second": tokens_per_second
+        })
+
+    def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
+        """Update tool calls, handling both full and partial updates."""
+        if self.tool_calls is None:
+            self.tool_calls = []
+
+        for tool_delta in new_tool_calls:
+            tool_id = tool_delta.get("id")
+            if not tool_id:
+                continue
+
+            # Find or create tool call
+            current_tool = next((t for t in self.tool_calls if t["id"] == tool_id), None)
+            if not current_tool:
+                current_tool = {
+                    "id": tool_id,
+                    "type": tool_delta.get("type", "function"),
+                    "function": {"name": "", "arguments": ""}
+                }
+                self.tool_calls.append(current_tool)
+
+            # Update tool call
+            if "function" in tool_delta:
+                func_delta = tool_delta["function"]
+                if "name" in func_delta:
+                    current_tool["function"]["name"] = func_delta["name"]
+                if "arguments" in func_delta:
+                    current_tool["function"]["arguments"] += func_delta["arguments"]
+
+    def has_updates(self) -> bool:
+        """Check if this response has any content or tool call updates."""
+        return bool(self.content) or bool(self.tool_calls)
+
+    def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
+        """Convert current state to LLMOutput."""
+        buffer, output, _ = transformer(self.content, buffer)
+
+        # Add tool calls if present
+        if self.tool_calls:
+            output.tool_calls = self.tool_calls
+
+        # Add usage stats if this is final
+        if self.finish_reason:
+            output.usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"],
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
+
+        return output, buffer
+
 class ResponseState:
     """Holds the state of response transformation."""
     def __init__(self):
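To make the new accumulator concrete, here is a small sketch of how StreamResponse._update_tool_calls merges streamed tool-call deltas by id, appending arguments fragments rather than replacing them. The delta shapes and the get_weather name are illustrative assumptions inferred from the dict accesses in the hunk above, not from package documentation:

    from inferencesh.models.llm import StreamResponse  # added in 0.2.23, per the hunk above

    resp = StreamResponse()
    # First delta names the function and opens the JSON arguments string.
    resp._update_tool_calls([
        {"id": "call_0", "type": "function",
         "function": {"name": "get_weather", "arguments": '{"city": '}},
    ])
    # Second delta with the same id: the arguments fragment is appended, not replaced.
    resp._update_tool_calls([
        {"id": "call_0", "function": {"arguments": '"Paris"}'}},
    ])
    assert resp.tool_calls[0]["function"]["arguments"] == '{"city": "Paris"}'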
@@ -216,7 +325,7 @@ class ResponseState:
         self.response = ""
         self.reasoning = None
         self.function_calls = None  # For future function calling support
-        self.tool_calls = []  # List to accumulate tool calls
+        self.tool_calls = None  # List to accumulate tool calls
         self.current_tool_call = None  # Track current tool call being built
         self.state_changes = {
             "reasoning_started": False,
@@ -243,6 +352,9 @@ class ResponseTransformer:
         Returns:
             Cleaned text with common and model-specific tokens removed
         """
+        if text is None:
+            return ""
+
         # Common token cleaning across most models
         cleaned = (text.replace("<|im_end|>", "")
                    .replace("<|im_start|>", "")
@@ -366,159 +478,59 @@ class ResponseTransformer:
 def stream_generate(
     model: Any,
     messages: List[Dict[str, Any]],
-    tools: List[Dict[str, Any]],
-    tool_choice: Dict[str, Any],
-    transformer: ResponseTransformer,
+    transformer: ResponseTransformer = ResponseTransformer(),
+    tools: Optional[List[Dict[str, Any]]] = None,
+    tool_choice: Optional[Dict[str, Any]] = None,
     temperature: float = 0.7,
     top_p: float = 0.95,
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
+    verbose: bool = False,
 ) -> Generator[LLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
-    response_queue: Queue[Optional[tuple[str, dict, Optional[List[Dict[str, Any]]]]]] = Queue()
-    thread_exception = None
-    usage_stats = {
-        "prompt_tokens": 0,
-        "completion_tokens": 0,
-        "total_tokens": 0,
-        "stop_reason": ""
-    }
-
     with timing_context() as timing:
         transformer.timing = timing
 
-        def generation_thread():
-            nonlocal thread_exception, usage_stats
-            try:
-                completion = model.create_chat_completion(
-                    messages=messages,
-                    tools=tools,
-                    tool_choice=tool_choice,
-                    stream=True,
-                    temperature=temperature,
-                    top_p=top_p,
-                    max_tokens=max_tokens,
-                    stop=stop
-                )
-
-                tool_calls = []
-                current_tool = None
-
-                for chunk in completion:
-                    if "usage" in chunk and chunk["usage"] is not None:
-                        usage_stats.update(chunk["usage"])
-
-                    delta = chunk.get("choices", [{}])[0]
-                    content = ""
-                    finish_reason = None
-
-                    # Extract delta content from either message or delta
-                    if "message" in delta:
-                        message = delta["message"]
-                        content = message.get("content", "")
-                        if message.get("tool_calls"):
-                            for tool in message["tool_calls"]:
-                                if tool.get("id") not in {t.get("id") for t in tool_calls}:
-                                    tool_calls.append(tool)
-                        finish_reason = delta.get("finish_reason")
-                    elif "delta" in delta:
-                        delta_content = delta["delta"]
-                        content = delta_content.get("content", "")
-
-                        # Handle streaming tool calls
-                        if delta_content.get("tool_calls"):
-                            for tool_delta in delta_content["tool_calls"]:
-                                tool_id = tool_delta.get("id")
-
-                                # Find or create tool call
-                                if tool_id:
-                                    current_tool = next((t for t in tool_calls if t["id"] == tool_id), None)
-                                    if not current_tool:
-                                        current_tool = {
-                                            "id": tool_id,
-                                            "type": tool_delta.get("type", "function"),
-                                            "function": {"name": "", "arguments": ""}
-                                        }
-                                        tool_calls.append(current_tool)
-
-                                # Update tool call
-                                if current_tool and "function" in tool_delta:
-                                    func_delta = tool_delta["function"]
-                                    if "name" in func_delta:
-                                        current_tool["function"]["name"] = func_delta["name"]
-                                    if "arguments" in func_delta:
-                                        current_tool["function"]["arguments"] += func_delta["arguments"]
-
-                        finish_reason = delta.get("finish_reason")
-
-                    has_update = bool(content)
-                    has_tool_update = bool(
-                        (delta.get("message", {}) or {}).get("tool_calls") or
-                        (delta.get("delta", {}) or {}).get("tool_calls")
-                    )
-
-                    if has_update or has_tool_update:
-                        if not timing.first_token_time:
-                            timing.mark_first_token()
-                        response_queue.put((content, {}, tool_calls[:] if tool_calls else None))
-
-                    if finish_reason:
-                        usage_stats["stop_reason"] = finish_reason
-
-            except Exception as e:
-                thread_exception = e
-            finally:
-                timing_stats = timing.stats
-                generation_time = timing_stats["generation_time"]
-                tokens_per_second = (usage_stats["completion_tokens"] / generation_time) if generation_time > 0 else 0
-                response_queue.put((None, {
-                    "time_to_first_token": timing_stats["time_to_first_token"],
-                    "tokens_per_second": tokens_per_second,
-                    "reasoning_time": timing_stats["reasoning_time"],
-                    "reasoning_tokens": timing_stats["reasoning_tokens"]
-                }, tool_calls if tool_calls else None))
-
-        thread = Thread(target=generation_thread, daemon=True)
-        thread.start()
-
+        # Build completion kwargs
+        completion_kwargs = {
+            "messages": messages,
+            "stream": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "max_tokens": max_tokens,
+            "stop": stop
+        }
+        if tools is not None:
+            completion_kwargs["tools"] = tools
+        if tool_choice is not None:
+            completion_kwargs["tool_choice"] = tool_choice
+
+        # Initialize response state
+        response = StreamResponse()
         buffer = ""
+
         try:
-            while True:
-                try:
-                    result = response_queue.get(timeout=30.0)
-                    if thread_exception:
-                        raise thread_exception
-
-                    piece, timing_stats, tool_calls = result
-                    if piece is None:
-                        # Final yield with complete usage stats
-                        usage = LLMUsage(
-                            stop_reason=usage_stats["stop_reason"],
-                            time_to_first_token=timing_stats["time_to_first_token"],
-                            tokens_per_second=timing_stats["tokens_per_second"],
-                            prompt_tokens=usage_stats["prompt_tokens"],
-                            completion_tokens=usage_stats["completion_tokens"],
-                            total_tokens=usage_stats["total_tokens"],
-                            reasoning_time=timing_stats["reasoning_time"],
-                            reasoning_tokens=timing_stats["reasoning_tokens"]
-                        )
-
-                        buffer, output, _ = transformer(piece or "", buffer)
-                        output.usage = usage
-                        if tool_calls:
-                            output.tool_calls = tool_calls
-                        yield output
-                        break
-
-                    buffer, output, _ = transformer(piece, buffer)
-                    if tool_calls:
-                        output.tool_calls = tool_calls
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                # Mark first token time as soon as we get any response
+                if not timing.first_token_time:
+                    timing.mark_first_token()
+
+                # Update response state from chunk
+                response.update_from_chunk(chunk, timing)
+
+                # Yield output if we have updates
+                if response.has_updates():
+                    output, buffer = response.to_output(buffer, transformer)
                     yield output
-
-                except Exception as e:
-                    if thread_exception and isinstance(e, thread_exception.__class__):
-                        raise thread_exception
+
+                # Break if we're done
+                if response.finish_reason:
                     break
-        finally:
-            if thread and thread.is_alive():
-                thread.join(timeout=2.0)
+
+        except Exception as e:
+            # Ensure any error is properly propagated
+            raise e
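For orientation, a hedged usage sketch of the reworked, single-threaded stream_generate. The llama_cpp backend is an assumption based on the docstring's LLaMA.cpp reference, the model path is a placeholder, and only the usage and tool_calls attributes on yielded outputs are confirmed by this diff:

    from llama_cpp import Llama  # assumed backend; any object with create_chat_completion works
    from inferencesh.models.llm import stream_generate

    model = Llama(model_path="model.gguf")  # placeholder path

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello."},
    ]

    # tools/tool_choice are now optional and only forwarded when provided;
    # verbose=True prints each raw chunk for debugging.
    for output in stream_generate(model, messages, max_tokens=64, verbose=True):
        usage = getattr(output, "usage", None)
        if usage:  # attached to the final chunk, once finish_reason is set
            print("stop:", usage.stop_reason, "tokens/sec:", usage.tokens_per_second)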
PKG-INFO → src/inferencesh.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.21
+Version: 0.2.23
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>