inferencesh 0.2.27__tar.gz → 0.2.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of inferencesh might be problematic.

Files changed (21)
  1. {inferencesh-0.2.27/src/inferencesh.egg-info → inferencesh-0.2.29}/PKG-INFO +1 -1
  2. {inferencesh-0.2.27 → inferencesh-0.2.29}/pyproject.toml +1 -1
  3. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/llm.py +158 -48
  4. {inferencesh-0.2.27 → inferencesh-0.2.29/src/inferencesh.egg-info}/PKG-INFO +1 -1
  5. {inferencesh-0.2.27 → inferencesh-0.2.29}/LICENSE +0 -0
  6. {inferencesh-0.2.27 → inferencesh-0.2.29}/README.md +0 -0
  7. {inferencesh-0.2.27 → inferencesh-0.2.29}/setup.cfg +0 -0
  8. {inferencesh-0.2.27 → inferencesh-0.2.29}/setup.py +0 -0
  9. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/__init__.py +0 -0
  10. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/__init__.py +0 -0
  11. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/base.py +0 -0
  12. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/file.py +0 -0
  13. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/utils/__init__.py +0 -0
  14. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/utils/download.py +0 -0
  15. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/utils/storage.py +0 -0
  16. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/SOURCES.txt +0 -0
  17. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/dependency_links.txt +0 -0
  18. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/entry_points.txt +0 -0
  19. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/requires.txt +0 -0
  20. {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/top_level.txt +0 -0
  21. {inferencesh-0.2.27 → inferencesh-0.2.29}/tests/test_sdk.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.27
+Version: 0.2.29
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "inferencesh"
-version = "0.2.27"
+version = "0.2.29"
 description = "inference.sh Python SDK"
 authors = [
     {name = "Inference Shell Inc.", email = "hello@inference.sh"},
@@ -1,7 +1,7 @@
 from typing import Optional, List, Any, Callable, Dict, Generator
 from enum import Enum
 from pydantic import Field, BaseModel
-from queue import Queue
+from queue import Queue, Empty
 from threading import Thread
 import time
 from contextlib import contextmanager
@@ -9,7 +9,6 @@ import base64
 
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
-from .types import ContextMessage
 
 class ContextMessageRole(str, Enum):
     USER = "user"
@@ -113,13 +112,27 @@ class LLMUsage(BaseAppOutput):
 
 class BaseLLMOutput(BaseAppOutput):
     """Base class for LLM outputs with common fields."""
-    text: str = Field(description="The generated text response")
-    done: bool = Field(default=False, description="Whether this is the final chunk")
+    response: str = Field(description="The generated text response")
 
 class LLMUsageMixin(BaseModel):
     """Mixin for models that provide token usage statistics."""
     usage: Optional[LLMUsage] = Field(
-        description="Token usage statistics"
+        description="Token usage statistics",
+        default=None
+    )
+
+class ReasoningMixin(BaseModel):
+    """Mixin for models that support reasoning."""
+    reasoning: Optional[str] = Field(
+        description="The reasoning output of the model",
+        default=None
+    )
+
+class ToolCallsMixin(BaseModel):
+    """Mixin for models that support tool calls."""
+    tool_calls: Optional[List[Dict[str, Any]]] = Field(
+        description="Tool calls for function calling",
+        default=None
     )
 
 # Example of how to use:
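A minimal sketch of how the new mixins compose with BaseLLMOutput; the class name ExampleLLMOutput and the sample values are illustrative, not part of the package:

    class ExampleLLMOutput(BaseLLMOutput, LLMUsageMixin, ReasoningMixin, ToolCallsMixin):
        """Response text plus optional usage, reasoning and tool-call fields."""

    chunk = ExampleLLMOutput(response="Hello!")
    # Optional fields contributed by the mixins default to None until populated.
    assert chunk.usage is None and chunk.reasoning is None and chunk.tool_calls is None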
@@ -343,26 +356,28 @@ class StreamResponse:
 
         return has_content or has_tool_calls or has_usage or has_finish
 
-    def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
+    def to_output(self, buffer: str, transformer: Any) -> tuple[BaseLLMOutput, str]:
         """Convert current state to LLMOutput."""
-        buffer, output, _ = transformer(self.content, buffer)
+        # Create usage object if we have stats
+        usage = None
+        if any(self.usage_stats.values()):
+            usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
 
-        # Add tool calls if present
-        if self.tool_calls:
+        buffer, output, _ = transformer(self.content, buffer, usage)
+
+        # Add tool calls if present and supported
+        if self.tool_calls and hasattr(output, 'tool_calls'):
             output.tool_calls = self.tool_calls
 
-        # Add usage stats
-        output.usage = LLMUsage(
-            stop_reason=self.usage_stats["stop_reason"],
-            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
-            tokens_per_second=self.timing_stats["tokens_per_second"],
-            prompt_tokens=self.usage_stats["prompt_tokens"],
-            completion_tokens=self.usage_stats["completion_tokens"],
-            total_tokens=self.usage_stats["total_tokens"],
-            reasoning_time=self.timing_stats["reasoning_time"],
-            reasoning_tokens=self.timing_stats["reasoning_tokens"]
-        )
-
         return output, buffer
 
 class ResponseState:
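The hasattr guard makes tool-call attachment duck-typed: an output class that does not opt in to ToolCallsMixin simply skips the assignment instead of failing. A small illustration (the class names below are illustrative, not part of the SDK):

    class TextOnlyOutput(BaseLLMOutput):
        """No tool_calls field, so to_output will not try to attach tool calls."""

    class ToolAwareOutput(BaseLLMOutput, ToolCallsMixin):
        """Declares tool_calls, so to_output can attach them."""

    hasattr(TextOnlyOutput(response="hi"), "tool_calls")   # False -> assignment skipped
    hasattr(ToolAwareOutput(response="hi"), "tool_calls")  # True  -> tool calls attached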
@@ -374,6 +389,7 @@ class ResponseState:
         self.function_calls = None  # For future function calling support
         self.tool_calls = None  # List to accumulate tool calls
         self.current_tool_call = None  # Track current tool call being built
+        self.usage = None  # Add usage field
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
@@ -385,7 +401,7 @@ class ResponseState:
 
 class ResponseTransformer:
     """Base class for transforming model responses."""
-    def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
+    def __init__(self, output_cls: type[BaseLLMOutput] = LLMOutput):
         self.state = ResponseState()
         self.output_cls = output_cls
         self.timing = None  # Will be set by stream_generate
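With the widened type hint, any BaseLLMOutput subclass can be plugged in as the transformer's output class. A brief sketch (StructuredOutput is an illustrative name, not part of the SDK):

    class StructuredOutput(BaseLLMOutput, LLMUsageMixin, ReasoningMixin):
        """Output schema with optional usage and reasoning."""

    transformer = ResponseTransformer(output_cls=StructuredOutput)
    # Note: stream_generate (below) overwrites transformer.output_cls with its own
    # output_cls argument, so the two should agree.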
@@ -496,28 +512,43 @@ class ResponseTransformer:
         Returns:
             Tuple of (buffer, LLMOutput, state_changes)
         """
+        # Build base output with required fields
+        output_data = {
+            "response": self.state.response.strip(),
+        }
+
+        # Add optional fields if they exist
+        if self.state.usage is not None:
+            output_data["usage"] = self.state.usage
+        if self.state.reasoning:
+            output_data["reasoning"] = self.state.reasoning.strip()
+        if self.state.function_calls:
+            output_data["function_calls"] = self.state.function_calls
+        if self.state.tool_calls:
+            output_data["tool_calls"] = self.state.tool_calls
+
+        output = self.output_cls(**output_data)
+
         return (
             self.state.buffer,
-            self.output_cls(
-                response=self.state.response.strip(),
-                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
-                function_calls=self.state.function_calls,
-                tool_calls=self.state.tool_calls
-            ),
+            output,
             self.state.state_changes
         )
 
-    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+    def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None) -> tuple[str, LLMOutput, dict]:
         """Transform a piece of text and return the result.
 
         Args:
             piece: New piece of text to transform
             buffer: Existing buffer content
+            usage: Optional usage statistics
 
         Returns:
             Tuple of (new_buffer, output, state_changes)
         """
         self.state.buffer = buffer
+        if usage is not None:
+            self.state.usage = usage
         self.transform_chunk(piece)
         return self.build_output()
 
@@ -533,36 +564,108 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
     verbose: bool = False,
-) -> Generator[LLMOutput, None, None]:
+    output_cls: type[BaseLLMOutput] = LLMOutput,
+) -> Generator[BaseLLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+
+    # Create queues for communication between threads
+    response_queue = Queue()
+    error_queue = Queue()
+    keep_alive_queue = Queue()
+
+    # Set the output class for the transformer
+    transformer.output_cls = output_cls
+
+    def _generate_worker():
+        """Worker thread to run the model generation."""
+        try:
+            # Build completion kwargs
+            completion_kwargs = {
+                "messages": messages,
+                "stream": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "max_tokens": max_tokens,
+                "stop": stop
+            }
+            if tools is not None:
+                completion_kwargs["tools"] = tools
+            if tool_choice is not None:
+                completion_kwargs["tool_choice"] = tool_choice
+
+            # Signal that we're starting
+            keep_alive_queue.put(("init", time.time()))
+
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                response_queue.put(("chunk", chunk))
+                # Update keep-alive timestamp
+                keep_alive_queue.put(("alive", time.time()))
+
+            # Signal completion
+            response_queue.put(("done", None))
+
+        except Exception as e:
+            error_queue.put(e)
+            response_queue.put(("error", str(e)))
+
     with timing_context() as timing:
         transformer.timing = timing
 
-        # Build completion kwargs
-        completion_kwargs = {
-            "messages": messages,
-            "stream": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "max_tokens": max_tokens,
-            "stop": stop
-        }
-        if tools is not None:
-            completion_kwargs["tools"] = tools
-        if tool_choice is not None:
-            completion_kwargs["tool_choice"] = tool_choice
+        # Start generation thread
+        generation_thread = Thread(target=_generate_worker, daemon=True)
+        generation_thread.start()
 
         # Initialize response state
         response = StreamResponse()
         buffer = ""
 
+        # Keep-alive tracking
+        last_activity = time.time()
+        init_timeout = 30.0  # 30 seconds for initial response
+        chunk_timeout = 10.0  # 10 seconds between chunks
+
         try:
-            completion = model.create_chat_completion(**completion_kwargs)
+            # Wait for initial setup
+            try:
+                msg_type, timestamp = keep_alive_queue.get(timeout=init_timeout)
+                if msg_type != "init":
+                    raise RuntimeError("Unexpected initialization message")
+                last_activity = timestamp
+            except Empty:
+                raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")
 
-            for chunk in completion:
-                if verbose:
-                    print(chunk)
-                # Mark first token time as soon as we get any response
+            while True:
+                # Check for errors
+                if not error_queue.empty():
+                    raise error_queue.get()
+
+                # Check keep-alive
+                while not keep_alive_queue.empty():
+                    _, timestamp = keep_alive_queue.get_nowait()
+                    last_activity = timestamp
+
+                # Check for timeout
+                if time.time() - last_activity > chunk_timeout:
+                    raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
+
+                # Get next chunk
+                try:
+                    msg_type, data = response_queue.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if msg_type == "error":
+                    raise RuntimeError(f"Generation error: {data}")
+                elif msg_type == "done":
+                    break
+
+                chunk = data
+
+                # Mark first token time
                 if not timing.first_token_time:
                     timing.mark_first_token()
 
@@ -577,6 +680,13 @@ def stream_generate(
                 # Break if we're done
                 if response.finish_reason:
                     break
+
+            # Wait for generation thread to finish
+            generation_thread.join(timeout=5.0)  # Increased timeout to 5 seconds
+            if generation_thread.is_alive():
+                # Thread didn't finish - this shouldn't happen normally
+                # but we handle it gracefully
+                raise RuntimeError("Generation thread failed to finish")
 
         except Exception as e:
             # Ensure any error is properly propagated
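For orientation, a rough sketch of how the reworked generator might be consumed. The llama_cpp model loading and the message format are assumptions based on the create_chat_completion call above; output_cls, the transformer, and the keep-alive timeouts come from this diff:

    from llama_cpp import Llama  # assumed backend; any object exposing create_chat_completion should work

    class ChatOutput(BaseLLMOutput, LLMUsageMixin, ReasoningMixin, ToolCallsMixin):
        """Per-call output schema (illustrative name)."""

    model = Llama(model_path="model.gguf")
    messages = [{"role": "user", "content": "Say hello."}]

    last = None
    for chunk in stream_generate(
        model=model,
        messages=messages,
        transformer=ResponseTransformer(),
        output_cls=ChatOutput,  # new in 0.2.29: choose the output schema per call
    ):
        last = chunk  # each chunk is built from the accumulated state (see build_output above)

    print(last.response)
    if last.usage is not None:
        print(last.usage.total_tokens, "tokens")

    # Generation now runs in a daemon worker thread; a RuntimeError is raised if the
    # model does not start within 30 s or stalls for more than 10 s between chunks.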
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.27
+Version: 0.2.29
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>