inferencesh 0.2.26-py3-none-any.whl → 0.2.28-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of inferencesh might be problematic.
- inferencesh/models/llm.py +194 -77
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/METADATA +1 -1
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/RECORD +7 -7
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/WHEEL +0 -0
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/entry_points.txt +0 -0
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/licenses/LICENSE +0 -0
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/top_level.txt +0 -0
inferencesh/models/llm.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import Optional, List, Any, Callable, Dict, Generator
 from enum import Enum
-from pydantic import Field
-from queue import Queue
+from pydantic import Field, BaseModel
+from queue import Queue, Empty
 from threading import Thread
 import time
 from contextlib import contextmanager
@@ -33,7 +33,8 @@ class ContextMessage(BaseAppInput):
         default=None
     )

-class LLMInput(BaseAppInput):
+class BaseLLMInput(BaseAppInput):
+    """Base class with common LLM fields."""
     system_prompt: str = Field(
         description="The system prompt to use for the model",
         default="You are a helpful assistant that can answer questions and help with tasks.",
@@ -47,25 +48,13 @@ class LLMInput(BaseAppInput):
     )
     context: List[ContextMessage] = Field(
         description="The context to use for the model",
+        default=[],
         examples=[
             [
-                {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
+                {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
                 {"role": "assistant", "content": [{"type": "text", "text": "The capital of France is Paris."}]}
-            ],
-            [
-                {"role": "user", "content": [{"type": "text", "text": "What is the weather like today?"}]},
-                {"role": "assistant", "content": [{"type": "text", "text": "I apologize, but I don't have access to real-time weather information. You would need to check a weather service or app to get current weather conditions for your location."}]}
-            ],
-            [
-                {"role": "user", "content": [{"type": "text", "text": "Can you help me write a poem about spring?"}]},
-                {"role": "assistant", "content": [{"type": "text", "text": "Here's a short poem about spring:\n\nGreen buds awakening,\nSoft rain gently falling down,\nNew life springs anew.\n\nWarm sun breaks through clouds,\nBirds return with joyful song,\nNature's sweet rebirth."}]}
-            ],
-            [
-                {"role": "user", "content": [{"type": "text", "text": "Explain quantum computing in simple terms"}]},
-                {"role": "assistant", "content": [{"type": "text", "text": "Quantum computing is like having a super-powerful calculator that can solve many problems at once instead of one at a time. While regular computers use bits (0s and 1s), quantum computers use quantum bits or \"qubits\" that can be both 0 and 1 at the same time - kind of like being in two places at once! This allows them to process huge amounts of information much faster than regular computers for certain types of problems."}]}
             ]
-        ]
-        default=[]
+        ]
     )
     text: str = Field(
         description="The user prompt to use for the model",
@@ -74,22 +63,41 @@ class LLMInput(BaseAppInput):
             "What is the weather like today?",
             "Can you help me write a poem about spring?",
             "Explain quantum computing in simple terms"
-        ]
-    )
-    image: Optional[File] = Field(
-        description="The image to use for the model",
-        default=None
+        ]
     )
-    # Optional parameters
     temperature: float = Field(default=0.7)
     top_p: float = Field(default=0.95)
     max_tokens: int = Field(default=4096)
     context_size: int = Field(default=4096)
-
-
-
-
-
+
+class ImageCapabilityMixin(BaseModel):
+    """Mixin for models that support image inputs."""
+    image: Optional[File] = Field(
+        description="The image to use for the model",
+        default=None
+    )
+
+class ReasoningCapabilityMixin(BaseModel):
+    """Mixin for models that support reasoning."""
+    reasoning: bool = Field(
+        description="Enable step-by-step reasoning",
+        default=False
+    )
+
+class ToolsCapabilityMixin(BaseModel):
+    """Mixin for models that support tool/function calling."""
+    tools: Optional[List[Dict[str, Any]]] = Field(
+        description="Tool definitions for function calling",
+        default=None
+    )
+
+# Example of how to use:
+class LLMInput(BaseLLMInput):
+    """Default LLM input model with no special capabilities."""
+    pass
+
+# For backward compatibility
+LLMInput.model_config["title"] = "LLMInput"

 class LLMUsage(BaseAppOutput):
     stop_reason: str = ""
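The hunk above splits the old monolithic LLMInput into BaseLLMInput plus opt-in capability mixins. To illustrate how an app might compose them, here is a minimal sketch; it is not part of the diff, the subclass name is hypothetical, and the import path is assumed from the wheel layout:

    # Hypothetical composition of the capability mixins added in 0.2.28.
    # Assumes these names are importable from inferencesh.models.llm.
    from inferencesh.models.llm import (
        BaseLLMInput,
        ImageCapabilityMixin,
        ToolsCapabilityMixin,
    )

    class VisionToolLLMInput(BaseLLMInput, ImageCapabilityMixin, ToolsCapabilityMixin):
        """Input model for an app that accepts an image and tool definitions."""
        pass

Because the mixins are plain pydantic BaseModel subclasses, field validation and schema generation on the combined class should behave the same way they did on the old flat LLMInput.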
@@ -102,12 +110,24 @@ class LLMUsage(BaseAppOutput):
     reasoning_time: float = 0.0


-class LLMOutput(BaseAppOutput):
-
-
-
-
+class BaseLLMOutput(BaseAppOutput):
+    """Base class for LLM outputs with common fields."""
+    response: str = Field(description="The generated text response")
+
+class LLMUsageMixin(BaseModel):
+    """Mixin for models that provide token usage statistics."""
+    usage: Optional[LLMUsage] = Field(
+        description="Token usage statistics",
+        default=None
+    )
+
+# Example of how to use:
+class LLMOutput(BaseLLMOutput, LLMUsageMixin):
+    """Default LLM output model with token usage tracking."""
+    pass

+# For backward compatibility
+LLMOutput.model_config["title"] = "LLMOutput"

 @contextmanager
 def timing_context():
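The output side follows the same pattern: BaseLLMOutput carries the response text and LLMUsageMixin adds optional usage statistics. A hedged sketch of a custom output class (the class name and extra field are illustrative, not part of the package):

    from typing import Any, Dict, List, Optional
    from pydantic import Field

    # Assumes BaseLLMOutput and LLMUsageMixin from this diff are importable.
    from inferencesh.models.llm import BaseLLMOutput, LLMUsageMixin

    class ToolCallingLLMOutput(BaseLLMOutput, LLMUsageMixin):
        """Output model that also surfaces tool calls (illustrative)."""
        tool_calls: Optional[List[Dict[str, Any]]] = Field(default=None)

Declaring the extra field matters because, further down, StreamResponse.to_output only copies accumulated tool calls onto the output when the output class actually defines a tool_calls attribute (the new hasattr check).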
@@ -322,26 +342,28 @@ class StreamResponse:

         return has_content or has_tool_calls or has_usage or has_finish

-    def to_output(self, buffer: str, transformer: Any) ->
+    def to_output(self, buffer: str, transformer: Any) -> tuple[BaseLLMOutput, str]:
         """Convert current state to LLMOutput."""
-
+        # Create usage object if we have stats
+        usage = None
+        if any(self.usage_stats.values()):
+            usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
+
+        buffer, output, _ = transformer(self.content, buffer, usage)

-        # Add tool calls if present
-        if self.tool_calls:
+        # Add tool calls if present and supported
+        if self.tool_calls and hasattr(output, 'tool_calls'):
             output.tool_calls = self.tool_calls

-        # Add usage stats
-        output.usage = LLMUsage(
-            stop_reason=self.usage_stats["stop_reason"],
-            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
-            tokens_per_second=self.timing_stats["tokens_per_second"],
-            prompt_tokens=self.usage_stats["prompt_tokens"],
-            completion_tokens=self.usage_stats["completion_tokens"],
-            total_tokens=self.usage_stats["total_tokens"],
-            reasoning_time=self.timing_stats["reasoning_time"],
-            reasoning_tokens=self.timing_stats["reasoning_tokens"]
-        )
-
         return output, buffer

 class ResponseState:
@@ -353,6 +375,7 @@ class ResponseState:
         self.function_calls = None # For future function calling support
         self.tool_calls = None # List to accumulate tool calls
         self.current_tool_call = None # Track current tool call being built
+        self.usage = None # Add usage field
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
@@ -364,7 +387,7 @@ class ResponseState:

 class ResponseTransformer:
     """Base class for transforming model responses."""
-    def __init__(self, output_cls: type[
+    def __init__(self, output_cls: type[BaseLLMOutput] = LLMOutput):
         self.state = ResponseState()
         self.output_cls = output_cls
         self.timing = None # Will be set by stream_generate
@@ -475,28 +498,43 @@ class ResponseTransformer:
         Returns:
             Tuple of (buffer, LLMOutput, state_changes)
         """
+        # Build base output with required fields
+        output_data = {
+            "response": self.state.response.strip(),
+        }
+
+        # Add optional fields if they exist
+        if self.state.usage is not None:
+            output_data["usage"] = self.state.usage
+        if self.state.reasoning:
+            output_data["reasoning"] = self.state.reasoning.strip()
+        if self.state.function_calls:
+            output_data["function_calls"] = self.state.function_calls
+        if self.state.tool_calls:
+            output_data["tool_calls"] = self.state.tool_calls
+
+        output = self.output_cls(**output_data)
+
         return (
             self.state.buffer,
-
-                response=self.state.response.strip(),
-                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
-                function_calls=self.state.function_calls,
-                tool_calls=self.state.tool_calls
-            ),
+            output,
             self.state.state_changes
         )

-    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+    def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None) -> tuple[str, LLMOutput, dict]:
         """Transform a piece of text and return the result.

         Args:
             piece: New piece of text to transform
             buffer: Existing buffer content
+            usage: Optional usage statistics

         Returns:
             Tuple of (new_buffer, output, state_changes)
         """
         self.state.buffer = buffer
+        if usage is not None:
+            self.state.usage = usage
         self.transform_chunk(piece)
         return self.build_output()
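Taken together with the output classes above, build_output now assembles a dict of whichever fields are populated and instantiates self.output_cls from it, while __call__ threads usage statistics into the state. A small sketch of how a caller might exercise this, assuming the ToolCallingLLMOutput example from earlier and that the base transform_chunk implementation is usable as-is:

    # Illustrative only: bind the transformer to a custom output class so that
    # build_output() constructs ToolCallingLLMOutput instead of the default LLMOutput.
    from inferencesh.models.llm import ResponseTransformer  # assumed import path

    transformer = ResponseTransformer(output_cls=ToolCallingLLMOutput)

    # The new three-argument call also carries usage statistics (here None).
    new_buffer, output, state_changes = transformer("Hello, ", "", usage=None)
    print(type(output).__name__, state_changes)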
@@ -512,36 +550,108 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
     verbose: bool = False,
-
+    output_cls: type[BaseLLMOutput] = LLMOutput,
+) -> Generator[BaseLLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+
+    # Create queues for communication between threads
+    response_queue = Queue()
+    error_queue = Queue()
+    keep_alive_queue = Queue()
+
+    # Set the output class for the transformer
+    transformer.output_cls = output_cls
+
+    def _generate_worker():
+        """Worker thread to run the model generation."""
+        try:
+            # Build completion kwargs
+            completion_kwargs = {
+                "messages": messages,
+                "stream": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "max_tokens": max_tokens,
+                "stop": stop
+            }
+            if tools is not None:
+                completion_kwargs["tools"] = tools
+            if tool_choice is not None:
+                completion_kwargs["tool_choice"] = tool_choice
+
+            # Signal that we're starting
+            keep_alive_queue.put(("init", time.time()))
+
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                response_queue.put(("chunk", chunk))
+                # Update keep-alive timestamp
+                keep_alive_queue.put(("alive", time.time()))
+
+            # Signal completion
+            response_queue.put(("done", None))
+
+        except Exception as e:
+            error_queue.put(e)
+            response_queue.put(("error", str(e)))
+
     with timing_context() as timing:
         transformer.timing = timing

-        #
-
-
-            "stream": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "max_tokens": max_tokens,
-            "stop": stop
-        }
-        if tools is not None:
-            completion_kwargs["tools"] = tools
-        if tool_choice is not None:
-            completion_kwargs["tool_choice"] = tool_choice
+        # Start generation thread
+        generation_thread = Thread(target=_generate_worker, daemon=True)
+        generation_thread.start()

         # Initialize response state
         response = StreamResponse()
         buffer = ""

+        # Keep-alive tracking
+        last_activity = time.time()
+        init_timeout = 30.0 # 30 seconds for initial response
+        chunk_timeout = 10.0 # 10 seconds between chunks
+
         try:
-
+            # Wait for initial setup
+            try:
+                msg_type, timestamp = keep_alive_queue.get(timeout=init_timeout)
+                if msg_type != "init":
+                    raise RuntimeError("Unexpected initialization message")
+                last_activity = timestamp
+            except Empty:
+                raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")

-
-
-
-
+            while True:
+                # Check for errors
+                if not error_queue.empty():
+                    raise error_queue.get()
+
+                # Check keep-alive
+                while not keep_alive_queue.empty():
+                    _, timestamp = keep_alive_queue.get_nowait()
+                    last_activity = timestamp
+
+                # Check for timeout
+                if time.time() - last_activity > chunk_timeout:
+                    raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
+
+                # Get next chunk
+                try:
+                    msg_type, data = response_queue.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if msg_type == "error":
+                    raise RuntimeError(f"Generation error: {data}")
+                elif msg_type == "done":
+                    break
+
+                chunk = data
+
+                # Mark first token time
                 if not timing.first_token_time:
                     timing.mark_first_token()
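The new streaming loop runs generation on a daemon worker thread and watches two queues: one for chunks and one for a keep-alive heartbeat, failing fast if the model stalls (30 seconds to first activity, 10 seconds between chunks). A hedged end-to-end sketch of driving it; the import paths, the llama-cpp-python model object, the keyword parameter names, and the assumption that each yielded item is the latest output snapshot are inferred rather than taken from this diff:

    # Hedged usage sketch; names and paths below are assumptions, not package docs.
    from llama_cpp import Llama  # llama-cpp-python
    from inferencesh.models.llm import LLMOutput, ResponseTransformer, stream_generate

    model = Llama(model_path="model.gguf", n_ctx=4096)  # hypothetical local GGUF file
    messages = [{"role": "user", "content": "What is the capital of France?"}]
    transformer = ResponseTransformer(output_cls=LLMOutput)

    final = None
    for partial in stream_generate(
        model=model,
        messages=messages,
        transformer=transformer,
        temperature=0.7,
        top_p=0.95,
        max_tokens=256,
        output_cls=LLMOutput,
    ):
        final = partial  # assumed: each item is the latest LLMOutput snapshot

    if final is not None:
        print(final.response)
        if final.usage is not None:
            print(final.usage.stop_reason, final.usage.total_tokens)

The daemon worker plus the join(timeout=5.0) added in the next hunk is what keeps a hung llama.cpp call from blocking the caller indefinitely.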
@@ -556,6 +666,13 @@ def stream_generate(
                 # Break if we're done
                 if response.finish_reason:
                     break
+
+            # Wait for generation thread to finish
+            generation_thread.join(timeout=5.0) # Increased timeout to 5 seconds
+            if generation_thread.is_alive():
+                # Thread didn't finish - this shouldn't happen normally
+                # but we handle it gracefully
+                raise RuntimeError("Generation thread failed to finish")

         except Exception as e:
             # Ensure any error is properly propagated
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/RECORD
CHANGED
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=
+inferencesh/models/llm.py,sha256=E2Mz56Cu_GODDhnNKE5gE5pOTgX4ekJv6UdO44wWON8,25806
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.26.dist-info/licenses/LICENSE,sha256=
-inferencesh-0.2.26.dist-info/METADATA,sha256=
-inferencesh-0.2.26.dist-info/WHEEL,sha256=
-inferencesh-0.2.26.dist-info/entry_points.txt,sha256=
-inferencesh-0.2.26.dist-info/top_level.txt,sha256=
-inferencesh-0.2.26.dist-info/RECORD,,
+inferencesh-0.2.28.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.28.dist-info/METADATA,sha256=9TxV1q5wsokL3de27EJKvRr9MFfOi86rxzoEEnKVTSU,2757
+inferencesh-0.2.28.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.28.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.28.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.28.dist-info/RECORD,,
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/WHEEL
File without changes
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/entry_points.txt
File without changes
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/licenses/LICENSE
File without changes
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/top_level.txt
File without changes