inferencesh-0.2.22-py3-none-any.whl → inferencesh-0.2.24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of inferencesh might be problematic.

inferencesh/models/llm.py CHANGED
@@ -10,7 +10,6 @@ import base64
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
 
-
 class ContextMessageRole(str, Enum):
     USER = "user"
     ASSISTANT = "assistant"
@@ -140,12 +139,17 @@ def timing_context():
 
     @property
     def stats(self):
-        end_time = time.time()
+        current_time = time.time()
         if self.first_token_time is None:
-            self.first_token_time = end_time
+            return {
+                "time_to_first_token": 0.0,
+                "generation_time": 0.0,
+                "reasoning_time": self.total_reasoning_time,
+                "reasoning_tokens": self.reasoning_tokens
+            }
 
         time_to_first = self.first_token_time - self.start_time
-        generation_time = end_time - self.first_token_time
+        generation_time = current_time - self.first_token_time
 
         return {
             "time_to_first_token": time_to_first,
@@ -209,6 +213,151 @@ def build_messages(
     return messages
 
 
+class StreamResponse:
+    """Holds a single chunk of streamed response."""
+    def __init__(self):
+        self.content = ""
+        self.tool_calls = None # Changed from [] to None
+        self.finish_reason = None
+        self.timing_stats = {
+            "time_to_first_token": None, # Changed from 0.0 to None
+            "generation_time": 0.0,
+            "reasoning_time": 0.0,
+            "reasoning_tokens": 0,
+            "tokens_per_second": 0.0
+        }
+        self.usage_stats = {
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "total_tokens": 0,
+            "stop_reason": ""
+        }
+
+    def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
+        """Update response state from a chunk."""
+        print("DEBUG: Entering update_from_chunk")
+        print(f"DEBUG: Current usage stats: {self.usage_stats}")
+        print(f"DEBUG: Chunk: {chunk}")
+
+        # Update usage stats if present
+        if "usage" in chunk:
+            usage = chunk["usage"]
+            if usage is not None:
+                print(f"DEBUG: Updating usage stats with: {usage}")
+                # Update usage stats preserving existing values if not provided
+                self.usage_stats.update({
+                    "prompt_tokens": usage.get("prompt_tokens", self.usage_stats["prompt_tokens"]),
+                    "completion_tokens": usage.get("completion_tokens", self.usage_stats["completion_tokens"]),
+                    "total_tokens": usage.get("total_tokens", self.usage_stats["total_tokens"])
+                })
+                print(f"DEBUG: Updated usage stats: {self.usage_stats}")
+
+        # Get the delta from the chunk
+        delta = chunk.get("choices", [{}])[0]
+
+        # Extract content and tool calls from either message or delta
+        if "message" in delta:
+            message = delta["message"]
+            self.content = message.get("content", "")
+            if message.get("tool_calls"):
+                self._update_tool_calls(message["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
+        elif "delta" in delta:
+            delta_content = delta["delta"]
+            self.content = delta_content.get("content", "")
+            if delta_content.get("tool_calls"):
+                self._update_tool_calls(delta_content["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
+
+        # Update timing stats
+        timing_stats = timing.stats
+        if self.timing_stats["time_to_first_token"] is None:
+            self.timing_stats["time_to_first_token"] = timing_stats["time_to_first_token"]
+
+        self.timing_stats.update({
+            "generation_time": timing_stats["generation_time"],
+            "reasoning_time": timing_stats["reasoning_time"],
+            "reasoning_tokens": timing_stats["reasoning_tokens"]
+        })
+
+        # Calculate tokens per second only if we have valid completion tokens and generation time
+        if self.usage_stats["completion_tokens"] > 0 and timing_stats["generation_time"] > 0:
+            self.timing_stats["tokens_per_second"] = (
+                self.usage_stats["completion_tokens"] / timing_stats["generation_time"]
+            )
+
+        print(f"DEBUG: Final usage stats in update_from_chunk: {self.usage_stats}")
+
+    def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
+        """Update tool calls, handling both full and partial updates."""
+        if self.tool_calls is None:
+            self.tool_calls = []
+
+        for tool_delta in new_tool_calls:
+            tool_id = tool_delta.get("id")
+            if not tool_id:
+                continue
+
+            # Find or create tool call
+            current_tool = next((t for t in self.tool_calls if t["id"] == tool_id), None)
+            if not current_tool:
+                current_tool = {
+                    "id": tool_id,
+                    "type": tool_delta.get("type", "function"),
+                    "function": {"name": "", "arguments": ""}
+                }
+                self.tool_calls.append(current_tool)
+
+            # Update tool call
+            if "function" in tool_delta:
+                func_delta = tool_delta["function"]
+                if "name" in func_delta:
+                    current_tool["function"]["name"] = func_delta["name"]
+                if "arguments" in func_delta:
+                    current_tool["function"]["arguments"] += func_delta["arguments"]
+
+    def has_updates(self) -> bool:
+        """Check if this response has any content, tool call, or usage updates."""
+        has_content = bool(self.content)
+        has_tool_calls = bool(self.tool_calls)
+        has_usage = self.usage_stats["prompt_tokens"] > 0 or self.usage_stats["completion_tokens"] > 0
+        has_finish = bool(self.finish_reason)
+
+        print(f"DEBUG: has_updates check - content: {has_content}, tool_calls: {has_tool_calls}, usage: {has_usage}, finish: {has_finish}")
+
+        return has_content or has_tool_calls or has_usage or has_finish
+
+    def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
+        """Convert current state to LLMOutput."""
+        print("DEBUG: Entering to_output")
+        print(f"DEBUG: Usage stats before conversion: {self.usage_stats}")
+
+        buffer, output, _ = transformer(self.content, buffer)
+
+        # Add tool calls if present
+        if self.tool_calls:
+            output.tool_calls = self.tool_calls
+
+        # Add usage stats
+        print(f"DEBUG: Creating LLMUsage with stats: {self.usage_stats}")
+        output.usage = LLMUsage(
+            stop_reason=self.usage_stats["stop_reason"],
+            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+            tokens_per_second=self.timing_stats["tokens_per_second"],
+            prompt_tokens=self.usage_stats["prompt_tokens"],
+            completion_tokens=self.usage_stats["completion_tokens"],
+            total_tokens=self.usage_stats["total_tokens"],
+            reasoning_time=self.timing_stats["reasoning_time"],
+            reasoning_tokens=self.timing_stats["reasoning_tokens"]
+        )
+        print(f"DEBUG: Created output usage: {output.usage}")
+
+        return output, buffer
+
 class ResponseState:
     """Holds the state of response transformation."""
     def __init__(self):
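
Note: the new StreamResponse consumes the OpenAI-style chunk dictionaries that llama-cpp-python emits while streaming (choices[0]["delta"] or choices[0]["message"], plus an optional usage block), which is the shape update_from_chunk above parses. A rough, self-contained sketch of feeding two such chunks through it; the _FakeTiming stub and the literal values are illustrative only, not part of the package:

from inferencesh.models.llm import StreamResponse

class _FakeTiming:
    # Stand-in for the object yielded by timing_context(); StreamResponse
    # only reads its .stats dictionary.
    @property
    def stats(self):
        return {
            "time_to_first_token": 0.1,
            "generation_time": 0.5,
            "reasoning_time": 0.0,
            "reasoning_tokens": 0,
        }

response = StreamResponse()

# A delta-style chunk, as emitted while tokens are streaming.
response.update_from_chunk(
    {"choices": [{"delta": {"content": "Hello"}, "finish_reason": None}]},
    _FakeTiming(),
)

# A final chunk carrying usage numbers and the stop reason.
response.update_from_chunk(
    {
        "choices": [{"delta": {}, "finish_reason": "stop"}],
        "usage": {"prompt_tokens": 12, "completion_tokens": 3, "total_tokens": 15},
    },
    _FakeTiming(),
)

print(response.has_updates())  # True: usage and finish_reason were captured
print(response.usage_stats)    # stop_reason == "stop", token counts filled in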
@@ -216,7 +365,7 @@ class ResponseState:
         self.response = ""
         self.reasoning = None
         self.function_calls = None # For future function calling support
-        self.tool_calls = [] # List to accumulate tool calls
+        self.tool_calls = None # List to accumulate tool calls
         self.current_tool_call = None # Track current tool call being built
         self.state_changes = {
             "reasoning_started": False,
@@ -243,6 +392,9 @@ class ResponseTransformer:
         Returns:
             Cleaned text with common and model-specific tokens removed
         """
+        if text is None:
+            return ""
+
         # Common token cleaning across most models
         cleaned = (text.replace("<|im_end|>", "")
                    .replace("<|im_start|>", "")
@@ -367,158 +519,58 @@ def stream_generate(
     model: Any,
     messages: List[Dict[str, Any]],
     transformer: ResponseTransformer = ResponseTransformer(),
-    tools: List[Dict[str, Any]] = [],
-    tool_choice: Dict[str, Any] = {},
+    tools: Optional[List[Dict[str, Any]]] = None,
+    tool_choice: Optional[Dict[str, Any]] = None,
     temperature: float = 0.7,
     top_p: float = 0.95,
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
+    verbose: bool = False,
 ) -> Generator[LLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
-    response_queue: Queue[Optional[tuple[str, dict, Optional[List[Dict[str, Any]]]]]] = Queue()
-    thread_exception = None
-    usage_stats = {
-        "prompt_tokens": 0,
-        "completion_tokens": 0,
-        "total_tokens": 0,
-        "stop_reason": ""
-    }
-
     with timing_context() as timing:
         transformer.timing = timing
 
-        def generation_thread():
-            nonlocal thread_exception, usage_stats
-            try:
-                completion = model.create_chat_completion(
-                    messages=messages,
-                    tools=tools,
-                    tool_choice=tool_choice,
-                    stream=True,
-                    temperature=temperature,
-                    top_p=top_p,
-                    max_tokens=max_tokens,
-                    stop=stop
-                )
-
-                tool_calls = []
-                current_tool = None
-
-                for chunk in completion:
-                    if "usage" in chunk and chunk["usage"] is not None:
-                        usage_stats.update(chunk["usage"])
-
-                    delta = chunk.get("choices", [{}])[0]
-                    content = ""
-                    finish_reason = None
-
-                    # Extract delta content from either message or delta
-                    if "message" in delta:
-                        message = delta["message"]
-                        content = message.get("content", "")
-                        if message.get("tool_calls"):
-                            for tool in message["tool_calls"]:
-                                if tool.get("id") not in {t.get("id") for t in tool_calls}:
-                                    tool_calls.append(tool)
-                        finish_reason = delta.get("finish_reason")
-                    elif "delta" in delta:
-                        delta_content = delta["delta"]
-                        content = delta_content.get("content", "")
-
-                        # Handle streaming tool calls
-                        if delta_content.get("tool_calls"):
-                            for tool_delta in delta_content["tool_calls"]:
-                                tool_id = tool_delta.get("id")
-
-                                # Find or create tool call
-                                if tool_id:
-                                    current_tool = next((t for t in tool_calls if t["id"] == tool_id), None)
-                                    if not current_tool:
-                                        current_tool = {
-                                            "id": tool_id,
-                                            "type": tool_delta.get("type", "function"),
-                                            "function": {"name": "", "arguments": ""}
-                                        }
-                                        tool_calls.append(current_tool)
-
-                                # Update tool call
-                                if current_tool and "function" in tool_delta:
-                                    func_delta = tool_delta["function"]
-                                    if "name" in func_delta:
-                                        current_tool["function"]["name"] = func_delta["name"]
-                                    if "arguments" in func_delta:
-                                        current_tool["function"]["arguments"] += func_delta["arguments"]
-
-                        finish_reason = delta.get("finish_reason")
-
-                    has_update = bool(content)
-                    has_tool_update = bool(
-                        (delta.get("message", {}) or {}).get("tool_calls") or
-                        (delta.get("delta", {}) or {}).get("tool_calls")
-                    )
-
-                    if has_update or has_tool_update:
-                        if not timing.first_token_time:
-                            timing.mark_first_token()
-                        response_queue.put((content, {}, tool_calls[:] if tool_calls else None))
-
-                    if finish_reason:
-                        usage_stats["stop_reason"] = finish_reason
-
-            except Exception as e:
-                thread_exception = e
-            finally:
-                timing_stats = timing.stats
-                generation_time = timing_stats["generation_time"]
-                tokens_per_second = (usage_stats["completion_tokens"] / generation_time) if generation_time > 0 else 0
-                response_queue.put((None, {
-                    "time_to_first_token": timing_stats["time_to_first_token"],
-                    "tokens_per_second": tokens_per_second,
-                    "reasoning_time": timing_stats["reasoning_time"],
-                    "reasoning_tokens": timing_stats["reasoning_tokens"]
-                }, tool_calls if tool_calls else None))
-
-        thread = Thread(target=generation_thread, daemon=True)
-        thread.start()
-
+        # Build completion kwargs
+        completion_kwargs = {
+            "messages": messages,
+            "stream": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "max_tokens": max_tokens,
+            "stop": stop
+        }
+        if tools is not None:
+            completion_kwargs["tools"] = tools
+        if tool_choice is not None:
+            completion_kwargs["tool_choice"] = tool_choice
+
+        # Initialize response state
+        response = StreamResponse()
         buffer = ""
+
         try:
-            while True:
-                try:
-                    result = response_queue.get(timeout=30.0)
-                    if thread_exception:
-                        raise thread_exception
-
-                    piece, timing_stats, tool_calls = result
-                    if piece is None:
-                        # Final yield with complete usage stats
-                        usage = LLMUsage(
-                            stop_reason=usage_stats["stop_reason"],
-                            time_to_first_token=timing_stats["time_to_first_token"],
-                            tokens_per_second=timing_stats["tokens_per_second"],
-                            prompt_tokens=usage_stats["prompt_tokens"],
-                            completion_tokens=usage_stats["completion_tokens"],
-                            total_tokens=usage_stats["total_tokens"],
-                            reasoning_time=timing_stats["reasoning_time"],
-                            reasoning_tokens=timing_stats["reasoning_tokens"]
-                        )
-
-                        buffer, output, _ = transformer(piece or "", buffer)
-                        output.usage = usage
-                        if tool_calls:
-                            output.tool_calls = tool_calls
-                        yield output
-                        break
-
-                    buffer, output, _ = transformer(piece, buffer)
-                    if tool_calls:
-                        output.tool_calls = tool_calls
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                # Mark first token time as soon as we get any response
+                if not timing.first_token_time:
+                    timing.mark_first_token()
+
+                # Update response state from chunk
+                response.update_from_chunk(chunk, timing)
+
+                # Yield output if we have updates
+                if response.has_updates():
+                    output, buffer = response.to_output(buffer, transformer)
                     yield output
-
-                except Exception as e:
-                    if thread_exception and isinstance(e, thread_exception.__class__):
-                        raise thread_exception
+
+                # Break if we're done
+                if response.finish_reason:
                     break
-        finally:
-            if thread and thread.is_alive():
-                thread.join(timeout=2.0)
+
+        except Exception as e:
+            # Ensure any error is properly propagated
+            raise e
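
Note: taken together, these llm.py changes replace the queue-and-thread streaming pipeline with a single loop that pushes each chunk through StreamResponse. A minimal usage sketch under these assumptions: the model is a llama-cpp-python Llama instance (or any object whose create_chat_completion(stream=True, ...) yields chunks in the format shown above), and the model path is a placeholder:

from llama_cpp import Llama

from inferencesh.models.llm import stream_generate

model = Llama(model_path="model.gguf")  # placeholder path/settings
messages = [{"role": "user", "content": "Say hello."}]

# tools / tool_choice can now be omitted entirely (they default to None);
# verbose=True prints each raw chunk as it arrives.
for output in stream_generate(model, messages, verbose=True):
    # Each yielded LLMOutput carries the transformed text plus timing and
    # usage stats accumulated so far.
    print(output)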
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.22
+Version: 0.2.24
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=XVHsHANGHXhB54aXAS-YcQNcgM673Q_b90xa10gorbA,21729
+inferencesh/models/llm.py,sha256=ycg20sSx3UJevjoTVukBZXwRyXY06tFZKAmlVp0MBzQ,23168
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.22.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
-inferencesh-0.2.22.dist-info/METADATA,sha256=o78bpkWPq1MqQH_qgT3VTK1hJLKrZRHUX8e5PVuS_4M,2757
-inferencesh-0.2.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-inferencesh-0.2.22.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
-inferencesh-0.2.22.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
-inferencesh-0.2.22.dist-info/RECORD,,
+inferencesh-0.2.24.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.24.dist-info/METADATA,sha256=kQq9qN65EU9DS-SAQHm3Sw73yzz-FZVQX6ueHSgktW8,2757
+inferencesh-0.2.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.24.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.24.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.24.dist-info/RECORD,,