inferencesh 0.2.27.tar.gz → 0.2.28.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of inferencesh might be problematic.

Files changed (21)
  1. {inferencesh-0.2.27/src/inferencesh.egg-info → inferencesh-0.2.28}/PKG-INFO +1 -1
  2. {inferencesh-0.2.27 → inferencesh-0.2.28}/pyproject.toml +1 -1
  3. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh/models/llm.py +144 -48
  4. {inferencesh-0.2.27 → inferencesh-0.2.28/src/inferencesh.egg-info}/PKG-INFO +1 -1
  5. {inferencesh-0.2.27 → inferencesh-0.2.28}/LICENSE +0 -0
  6. {inferencesh-0.2.27 → inferencesh-0.2.28}/README.md +0 -0
  7. {inferencesh-0.2.27 → inferencesh-0.2.28}/setup.cfg +0 -0
  8. {inferencesh-0.2.27 → inferencesh-0.2.28}/setup.py +0 -0
  9. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh/__init__.py +0 -0
  10. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh/models/__init__.py +0 -0
  11. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh/models/base.py +0 -0
  12. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh/models/file.py +0 -0
  13. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh/utils/__init__.py +0 -0
  14. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh/utils/download.py +0 -0
  15. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh/utils/storage.py +0 -0
  16. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh.egg-info/SOURCES.txt +0 -0
  17. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh.egg-info/dependency_links.txt +0 -0
  18. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh.egg-info/entry_points.txt +0 -0
  19. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh.egg-info/requires.txt +0 -0
  20. {inferencesh-0.2.27 → inferencesh-0.2.28}/src/inferencesh.egg-info/top_level.txt +0 -0
  21. {inferencesh-0.2.27 → inferencesh-0.2.28}/tests/test_sdk.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.27
+Version: 0.2.28
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>

pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "inferencesh"
-version = "0.2.27"
+version = "0.2.28"
 description = "inference.sh Python SDK"
 authors = [
     {name = "Inference Shell Inc.", email = "hello@inference.sh"},

src/inferencesh/models/llm.py
@@ -1,7 +1,7 @@
 from typing import Optional, List, Any, Callable, Dict, Generator
 from enum import Enum
 from pydantic import Field, BaseModel
-from queue import Queue
+from queue import Queue, Empty
 from threading import Thread
 import time
 from contextlib import contextmanager
@@ -9,7 +9,6 @@ import base64
 
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
-from .types import ContextMessage
 
 class ContextMessageRole(str, Enum):
     USER = "user"
@@ -113,13 +112,13 @@ class LLMUsage(BaseAppOutput):
 
 class BaseLLMOutput(BaseAppOutput):
     """Base class for LLM outputs with common fields."""
-    text: str = Field(description="The generated text response")
-    done: bool = Field(default=False, description="Whether this is the final chunk")
+    response: str = Field(description="The generated text response")
 
 class LLMUsageMixin(BaseModel):
     """Mixin for models that provide token usage statistics."""
     usage: Optional[LLMUsage] = Field(
-        description="Token usage statistics"
+        description="Token usage statistics",
+        default=None
     )
 
 # Example of how to use:
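
For orientation, here is a minimal sketch of how an app-level output model can compose the reworked classes from this hunk: BaseLLMOutput drops text/done in favour of a single required response field, and LLMUsageMixin's usage now defaults to None so partial chunks validate without stats. The ReasoningLLMOutput name and the plain-pydantic stand-ins below are hypothetical, not SDK code (in the SDK these classes extend BaseAppOutput in inferencesh.models.llm).

# Illustrative only: stand-ins kept self-contained with plain pydantic.
from typing import Optional
from pydantic import BaseModel, Field

class LLMUsage(BaseModel):  # stand-in for the SDK's LLMUsage
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0

class BaseLLMOutput(BaseModel):  # mirrors the diff: single required "response" field
    response: str = Field(description="The generated text response")

class LLMUsageMixin(BaseModel):  # mirrors the diff: usage now defaults to None
    usage: Optional[LLMUsage] = Field(description="Token usage statistics", default=None)

class ReasoningLLMOutput(BaseLLMOutput, LLMUsageMixin):
    """Hypothetical app output with an extra reasoning field."""
    reasoning: Optional[str] = None

chunk = ReasoningLLMOutput(response="partial text")
print(chunk.usage)  # None until final usage stats are attached
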
@@ -343,26 +342,28 @@ class StreamResponse:
 
         return has_content or has_tool_calls or has_usage or has_finish
 
-    def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
+    def to_output(self, buffer: str, transformer: Any) -> tuple[BaseLLMOutput, str]:
         """Convert current state to LLMOutput."""
-        buffer, output, _ = transformer(self.content, buffer)
+        # Create usage object if we have stats
+        usage = None
+        if any(self.usage_stats.values()):
+            usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
+
+        buffer, output, _ = transformer(self.content, buffer, usage)
 
-        # Add tool calls if present
-        if self.tool_calls:
+        # Add tool calls if present and supported
+        if self.tool_calls and hasattr(output, 'tool_calls'):
            output.tool_calls = self.tool_calls
 
-        # Add usage stats
-        output.usage = LLMUsage(
-            stop_reason=self.usage_stats["stop_reason"],
-            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
-            tokens_per_second=self.timing_stats["tokens_per_second"],
-            prompt_tokens=self.usage_stats["prompt_tokens"],
-            completion_tokens=self.usage_stats["completion_tokens"],
-            total_tokens=self.usage_stats["total_tokens"],
-            reasoning_time=self.timing_stats["reasoning_time"],
-            reasoning_tokens=self.timing_stats["reasoning_tokens"]
-        )
-
        return output, buffer
 
 class ResponseState:
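
The hunk above also guards tool-call assignment with hasattr, so output classes that do not declare tool_calls are skipped rather than triggering a pydantic attribute error. A small illustrative sketch of that behaviour; PlainOutput and ToolAwareOutput are hypothetical stand-ins, not SDK classes.

# Illustrative only: why the hasattr() guard matters.
from typing import Optional, List
from pydantic import BaseModel

class PlainOutput(BaseModel):
    response: str

class ToolAwareOutput(BaseModel):
    response: str
    tool_calls: Optional[List[dict]] = None

tool_calls = [{"name": "search", "arguments": "{}"}]  # pretend the model produced these

for out in (PlainOutput(response="hi"), ToolAwareOutput(response="hi")):
    # Only assign when the output class actually has the field.
    if tool_calls and hasattr(out, "tool_calls"):
        out.tool_calls = tool_calls
    print(type(out).__name__, getattr(out, "tool_calls", "<no field>"))
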
@@ -374,6 +375,7 @@ class ResponseState:
         self.function_calls = None  # For future function calling support
         self.tool_calls = None  # List to accumulate tool calls
         self.current_tool_call = None  # Track current tool call being built
+        self.usage = None  # Add usage field
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
@@ -385,7 +387,7 @@
 
 class ResponseTransformer:
     """Base class for transforming model responses."""
-    def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
+    def __init__(self, output_cls: type[BaseLLMOutput] = LLMOutput):
         self.state = ResponseState()
         self.output_cls = output_cls
         self.timing = None  # Will be set by stream_generate
@@ -496,28 +498,43 @@ class ResponseTransformer:
         Returns:
             Tuple of (buffer, LLMOutput, state_changes)
         """
+        # Build base output with required fields
+        output_data = {
+            "response": self.state.response.strip(),
+        }
+
+        # Add optional fields if they exist
+        if self.state.usage is not None:
+            output_data["usage"] = self.state.usage
+        if self.state.reasoning:
+            output_data["reasoning"] = self.state.reasoning.strip()
+        if self.state.function_calls:
+            output_data["function_calls"] = self.state.function_calls
+        if self.state.tool_calls:
+            output_data["tool_calls"] = self.state.tool_calls
+
+        output = self.output_cls(**output_data)
+
         return (
             self.state.buffer,
-            self.output_cls(
-                response=self.state.response.strip(),
-                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
-                function_calls=self.state.function_calls,
-                tool_calls=self.state.tool_calls
-            ),
+            output,
             self.state.state_changes
         )
 
-    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+    def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None) -> tuple[str, LLMOutput, dict]:
         """Transform a piece of text and return the result.
 
         Args:
            piece: New piece of text to transform
            buffer: Existing buffer content
+           usage: Optional usage statistics
 
         Returns:
            Tuple of (new_buffer, output, state_changes)
         """
         self.state.buffer = buffer
+        if usage is not None:
+            self.state.usage = usage
         self.transform_chunk(piece)
         return self.build_output()
 
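
A stand-in transformer mirroring the new three-argument __call__ contract: usage is optional, is stashed on the state when present, and is attached to the output only then. TinyTransformer below is purely illustrative and uses plain dicts instead of the SDK's output classes.

# Illustrative only: minimal transformer following the (piece, buffer, usage) contract.
from typing import Optional

class _State:
    def __init__(self):
        self.buffer = ""
        self.response = ""
        self.usage = None

class TinyTransformer:
    def __init__(self):
        self.state = _State()

    def __call__(self, piece: str, buffer: str, usage: Optional[dict] = None):
        self.state.buffer = buffer
        if usage is not None:
            self.state.usage = usage  # carried into the final output
        self.state.response += piece
        output = {"response": self.state.response}
        if self.state.usage is not None:
            output["usage"] = self.state.usage
        return self.state.buffer, output, {}

t = TinyTransformer()
_, out, _ = t("Hello ", "")
_, out, _ = t("world", "", usage={"total_tokens": 5})
print(out)  # {'response': 'Hello world', 'usage': {'total_tokens': 5}}
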
@@ -533,36 +550,108 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
     verbose: bool = False,
-) -> Generator[LLMOutput, None, None]:
+    output_cls: type[BaseLLMOutput] = LLMOutput,
+) -> Generator[BaseLLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+
+    # Create queues for communication between threads
+    response_queue = Queue()
+    error_queue = Queue()
+    keep_alive_queue = Queue()
+
+    # Set the output class for the transformer
+    transformer.output_cls = output_cls
+
+    def _generate_worker():
+        """Worker thread to run the model generation."""
+        try:
+            # Build completion kwargs
+            completion_kwargs = {
+                "messages": messages,
+                "stream": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "max_tokens": max_tokens,
+                "stop": stop
+            }
+            if tools is not None:
+                completion_kwargs["tools"] = tools
+            if tool_choice is not None:
+                completion_kwargs["tool_choice"] = tool_choice
+
+            # Signal that we're starting
+            keep_alive_queue.put(("init", time.time()))
+
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                response_queue.put(("chunk", chunk))
+                # Update keep-alive timestamp
+                keep_alive_queue.put(("alive", time.time()))
+
+            # Signal completion
+            response_queue.put(("done", None))
+
+        except Exception as e:
+            error_queue.put(e)
+            response_queue.put(("error", str(e)))
+
     with timing_context() as timing:
         transformer.timing = timing
 
-        # Build completion kwargs
-        completion_kwargs = {
-            "messages": messages,
-            "stream": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "max_tokens": max_tokens,
-            "stop": stop
-        }
-        if tools is not None:
-            completion_kwargs["tools"] = tools
-        if tool_choice is not None:
-            completion_kwargs["tool_choice"] = tool_choice
+        # Start generation thread
+        generation_thread = Thread(target=_generate_worker, daemon=True)
+        generation_thread.start()
 
         # Initialize response state
         response = StreamResponse()
         buffer = ""
 
+        # Keep-alive tracking
+        last_activity = time.time()
+        init_timeout = 30.0  # 30 seconds for initial response
+        chunk_timeout = 10.0  # 10 seconds between chunks
+
         try:
-            completion = model.create_chat_completion(**completion_kwargs)
+            # Wait for initial setup
+            try:
+                msg_type, timestamp = keep_alive_queue.get(timeout=init_timeout)
+                if msg_type != "init":
+                    raise RuntimeError("Unexpected initialization message")
+                last_activity = timestamp
+            except Empty:
+                raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")
 
-            for chunk in completion:
-                if verbose:
-                    print(chunk)
-                # Mark first token time as soon as we get any response
+            while True:
+                # Check for errors
+                if not error_queue.empty():
+                    raise error_queue.get()
+
+                # Check keep-alive
+                while not keep_alive_queue.empty():
+                    _, timestamp = keep_alive_queue.get_nowait()
+                    last_activity = timestamp
+
+                # Check for timeout
+                if time.time() - last_activity > chunk_timeout:
+                    raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
+
+                # Get next chunk
+                try:
+                    msg_type, data = response_queue.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if msg_type == "error":
+                    raise RuntimeError(f"Generation error: {data}")
+                elif msg_type == "done":
+                    break
+
+                chunk = data
+
+                # Mark first token time
                 if not timing.first_token_time:
                     timing.mark_first_token()
 
@@ -577,6 +666,13 @@ def stream_generate(
                 # Break if we're done
                 if response.finish_reason:
                     break
+
+            # Wait for generation thread to finish
+            generation_thread.join(timeout=5.0)  # Increased timeout to 5 seconds
+            if generation_thread.is_alive():
+                # Thread didn't finish - this shouldn't happen normally
+                # but we handle it gracefully
+                raise RuntimeError("Generation thread failed to finish")
 
         except Exception as e:
             # Ensure any error is properly propagated
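
Taken together, the stream_generate changes move model generation onto a daemon thread that feeds a response queue and pings a keep-alive queue, while the consumer raises if nothing arrives within the init (30s) or per-chunk (10s) timeouts. The standalone sketch below reproduces just that producer/consumer skeleton; the slow_producer function and the shortened timeouts are illustrative, not SDK code, and the real loop additionally marks first-token timing, feeds the transformer, and yields output_cls instances.

# Illustrative only: the keep-alive producer/consumer pattern introduced above.
import time
from queue import Queue, Empty
from threading import Thread

response_queue: Queue = Queue()
keep_alive_queue: Queue = Queue()
error_queue: Queue = Queue()

def slow_producer():
    """Pretend model: emits a few chunks with small pauses."""
    try:
        keep_alive_queue.put(("init", time.time()))
        for piece in ["Hel", "lo, ", "world"]:
            time.sleep(0.05)
            response_queue.put(("chunk", piece))
            keep_alive_queue.put(("alive", time.time()))
        response_queue.put(("done", None))
    except Exception as e:  # defensive, as in the diff
        error_queue.put(e)
        response_queue.put(("error", str(e)))

Thread(target=slow_producer, daemon=True).start()

# Much shorter timeouts than the SDK's 30s/10s, just to keep the demo quick.
init_timeout, chunk_timeout = 2.0, 1.0
msg_type, last_activity = keep_alive_queue.get(timeout=init_timeout)

while True:
    if not error_queue.empty():
        raise error_queue.get()
    while not keep_alive_queue.empty():
        _, last_activity = keep_alive_queue.get_nowait()
    if time.time() - last_activity > chunk_timeout:
        raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
    try:
        msg_type, data = response_queue.get(timeout=0.1)
    except Empty:
        continue
    if msg_type == "done":
        break
    if msg_type == "error":
        raise RuntimeError(data)
    print(data, end="", flush=True)
print()
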
src/inferencesh.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.27
+Version: 0.2.28
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>