inferencesh 0.2.23.tar.gz → 0.2.25.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of inferencesh might be problematic.

Files changed (21)
  1. {inferencesh-0.2.23/src/inferencesh.egg-info → inferencesh-0.2.25}/PKG-INFO +1 -1
  2. {inferencesh-0.2.23 → inferencesh-0.2.25}/pyproject.toml +1 -1
  3. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh/models/llm.py +55 -29
  4. {inferencesh-0.2.23 → inferencesh-0.2.25/src/inferencesh.egg-info}/PKG-INFO +1 -1
  5. {inferencesh-0.2.23 → inferencesh-0.2.25}/LICENSE +0 -0
  6. {inferencesh-0.2.23 → inferencesh-0.2.25}/README.md +0 -0
  7. {inferencesh-0.2.23 → inferencesh-0.2.25}/setup.cfg +0 -0
  8. {inferencesh-0.2.23 → inferencesh-0.2.25}/setup.py +0 -0
  9. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh/__init__.py +0 -0
  10. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh/models/__init__.py +0 -0
  11. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh/models/base.py +0 -0
  12. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh/models/file.py +0 -0
  13. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh/utils/__init__.py +0 -0
  14. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh/utils/download.py +0 -0
  15. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh/utils/storage.py +0 -0
  16. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh.egg-info/SOURCES.txt +0 -0
  17. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh.egg-info/dependency_links.txt +0 -0
  18. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh.egg-info/entry_points.txt +0 -0
  19. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh.egg-info/requires.txt +0 -0
  20. {inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh.egg-info/top_level.txt +0 -0
  21. {inferencesh-0.2.23 → inferencesh-0.2.25}/tests/test_sdk.py +0 -0
{inferencesh-0.2.23/src/inferencesh.egg-info → inferencesh-0.2.25}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.23
+Version: 0.2.25
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
{inferencesh-0.2.23 → inferencesh-0.2.25}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "inferencesh"
-version = "0.2.23"
+version = "0.2.25"
 description = "inference.sh Python SDK"
 authors = [
     {name = "Inference Shell Inc.", email = "hello@inference.sh"},
{inferencesh-0.2.23 → inferencesh-0.2.25}/src/inferencesh/models/llm.py

@@ -10,7 +10,6 @@ import base64
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
 
-
 class ContextMessageRole(str, Enum):
     USER = "user"
     ASSISTANT = "assistant"
@@ -116,7 +115,7 @@ def timing_context():
     class TimingInfo:
         def __init__(self):
             self.start_time = time.time()
-            self.first_token_time = 0
+            self.first_token_time = None
            self.reasoning_start_time = None
            self.total_reasoning_time = 0.0
            self.reasoning_tokens = 0
@@ -140,12 +139,17 @@ def timing_context():
 
         @property
         def stats(self):
-            end_time = time.time()
+            current_time = time.time()
             if self.first_token_time is None:
-                self.first_token_time = end_time
+                return {
+                    "time_to_first_token": 0.0,
+                    "generation_time": 0.0,
+                    "reasoning_time": self.total_reasoning_time,
+                    "reasoning_tokens": self.reasoning_tokens
+                }
 
             time_to_first = self.first_token_time - self.start_time
-            generation_time = end_time - self.first_token_time
+            generation_time = current_time - self.first_token_time
 
             return {
                 "time_to_first_token": time_to_first,
@@ -216,7 +220,7 @@ class StreamResponse:
         self.tool_calls = None  # Changed from [] to None
         self.finish_reason = None
         self.timing_stats = {
-            "time_to_first_token": 0.0,
+            "time_to_first_token": None,  # Changed from 0.0 to None
             "generation_time": 0.0,
             "reasoning_time": 0.0,
             "reasoning_tokens": 0,
@@ -232,8 +236,15 @@ class StreamResponse:
     def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
         """Update response state from a chunk."""
         # Update usage stats if present
-        if "usage" in chunk and chunk["usage"] is not None:
-            self.usage_stats.update(chunk["usage"])
+        if "usage" in chunk:
+            usage = chunk["usage"]
+            if usage is not None:
+                # Update usage stats preserving existing values if not provided
+                self.usage_stats.update({
+                    "prompt_tokens": usage.get("prompt_tokens", self.usage_stats["prompt_tokens"]),
+                    "completion_tokens": usage.get("completion_tokens", self.usage_stats["completion_tokens"]),
+                    "total_tokens": usage.get("total_tokens", self.usage_stats["total_tokens"])
+                })
 
         # Get the delta from the chunk
         delta = chunk.get("choices", [{}])[0]
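The per-field merge above means a usage payload that omits a counter keeps the previously accumulated value, and stray keys in the payload are no longer copied into usage_stats the way a bare dict.update would. A hedged illustration with a hypothetical chunk payload:

    usage_stats = {"prompt_tokens": 12, "completion_tokens": 7, "total_tokens": 19}

    # Hypothetical later chunk that reports only completion_tokens:
    usage = {"completion_tokens": 9}

    # Per-field merge: absent keys fall back to the values already accumulated.
    usage_stats.update({
        "prompt_tokens": usage.get("prompt_tokens", usage_stats["prompt_tokens"]),
        "completion_tokens": usage.get("completion_tokens", usage_stats["completion_tokens"]),
        "total_tokens": usage.get("total_tokens", usage_stats["total_tokens"]),
    })
    assert usage_stats == {"prompt_tokens": 12, "completion_tokens": 9, "total_tokens": 19}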
@@ -245,23 +256,34 @@ class StreamResponse:
             if message.get("tool_calls"):
                 self._update_tool_calls(message["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
         elif "delta" in delta:
             delta_content = delta["delta"]
             self.content = delta_content.get("content", "")
             if delta_content.get("tool_calls"):
                 self._update_tool_calls(delta_content["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
 
-        # Update timing stats while preserving tokens_per_second
+        # Update timing stats
         timing_stats = timing.stats
-        generation_time = timing_stats["generation_time"]
-        completion_tokens = self.usage_stats.get("completion_tokens", 0)
-        tokens_per_second = (completion_tokens / generation_time) if generation_time > 0 and completion_tokens > 0 else 0.0
+        if self.timing_stats["time_to_first_token"] is None:
+            self.timing_stats["time_to_first_token"] = timing_stats["time_to_first_token"]
 
         self.timing_stats.update({
-            **timing_stats,
-            "tokens_per_second": tokens_per_second
+            "generation_time": timing_stats["generation_time"],
+            "reasoning_time": timing_stats["reasoning_time"],
+            "reasoning_tokens": timing_stats["reasoning_tokens"]
         })
+
+        # Calculate tokens per second only if we have valid completion tokens and generation time
+        if self.usage_stats["completion_tokens"] > 0 and timing_stats["generation_time"] > 0:
+            self.timing_stats["tokens_per_second"] = (
+                self.usage_stats["completion_tokens"] / timing_stats["generation_time"]
+            )
+
 
     def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
         """Update tool calls, handling both full and partial updates."""
@@ -292,29 +314,33 @@ class StreamResponse:
                 current_tool["function"]["arguments"] += func_delta["arguments"]
 
     def has_updates(self) -> bool:
-        """Check if this response has any content or tool call updates."""
-        return bool(self.content) or bool(self.tool_calls)
+        """Check if this response has any content, tool call, or usage updates."""
+        has_content = bool(self.content)
+        has_tool_calls = bool(self.tool_calls)
+        has_usage = self.usage_stats["prompt_tokens"] > 0 or self.usage_stats["completion_tokens"] > 0
+        has_finish = bool(self.finish_reason)
+
+        return has_content or has_tool_calls or has_usage or has_finish
 
     def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
-        """Convert current state to LLMOutput."""
+        """Convert current state to LLMOutput."""
         buffer, output, _ = transformer(self.content, buffer)
 
         # Add tool calls if present
         if self.tool_calls:
             output.tool_calls = self.tool_calls
 
-        # Add usage stats if this is final
-        if self.finish_reason:
-            output.usage = LLMUsage(
-                stop_reason=self.usage_stats["stop_reason"],
-                time_to_first_token=self.timing_stats["time_to_first_token"],
-                tokens_per_second=self.timing_stats["tokens_per_second"],
-                prompt_tokens=self.usage_stats["prompt_tokens"],
-                completion_tokens=self.usage_stats["completion_tokens"],
-                total_tokens=self.usage_stats["total_tokens"],
-                reasoning_time=self.timing_stats["reasoning_time"],
-                reasoning_tokens=self.timing_stats["reasoning_tokens"]
-            )
+        # Add usage stats
+        output.usage = LLMUsage(
+            stop_reason=self.usage_stats["stop_reason"],
+            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+            tokens_per_second=self.timing_stats["tokens_per_second"],
+            prompt_tokens=self.usage_stats["prompt_tokens"],
+            completion_tokens=self.usage_stats["completion_tokens"],
+            total_tokens=self.usage_stats["total_tokens"],
+            reasoning_time=self.timing_stats["reasoning_time"],
+            reasoning_tokens=self.timing_stats["reasoning_tokens"]
+        )
 
         return output, buffer
 
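With the if self.finish_reason gate removed, to_output attaches an LLMUsage to every streamed output rather than only the final one, and the or 0.0 fallback normalizes a time_to_first_token that has not been latched yet. One caveat of that idiom, shown in isolation:

    # `x or 0.0` maps both None and 0.0 to 0.0, so consumers of LLMUsage
    # cannot distinguish "no token yet" from a genuinely zero latency.
    for value in (None, 0.0, 0.42):
        print(value, "->", value or 0.0)
    # None -> 0.0
    # 0.0  -> 0.0
    # 0.42 -> 0.42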
{inferencesh-0.2.23 → inferencesh-0.2.25/src/inferencesh.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.23
+Version: 0.2.25
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>