inferencesh 0.2.27__tar.gz → 0.2.29__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of inferencesh might be problematic.
- {inferencesh-0.2.27/src/inferencesh.egg-info → inferencesh-0.2.29}/PKG-INFO +1 -1
- {inferencesh-0.2.27 → inferencesh-0.2.29}/pyproject.toml +1 -1
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/llm.py +158 -48
- {inferencesh-0.2.27 → inferencesh-0.2.29/src/inferencesh.egg-info}/PKG-INFO +1 -1
- {inferencesh-0.2.27 → inferencesh-0.2.29}/LICENSE +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/README.md +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/setup.cfg +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/setup.py +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/__init__.py +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/__init__.py +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/base.py +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/models/file.py +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/utils/__init__.py +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/utils/download.py +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh/utils/storage.py +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/SOURCES.txt +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/dependency_links.txt +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/entry_points.txt +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/requires.txt +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/src/inferencesh.egg-info/top_level.txt +0 -0
- {inferencesh-0.2.27 → inferencesh-0.2.29}/tests/test_sdk.py +0 -0
src/inferencesh/models/llm.py
@@ -1,7 +1,7 @@
 from typing import Optional, List, Any, Callable, Dict, Generator
 from enum import Enum
 from pydantic import Field, BaseModel
-from queue import Queue
+from queue import Queue, Empty
 from threading import Thread
 import time
 from contextlib import contextmanager
@@ -9,7 +9,6 @@ import base64
 
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
-from .types import ContextMessage
 
 class ContextMessageRole(str, Enum):
     USER = "user"
@@ -113,13 +112,27 @@ class LLMUsage(BaseAppOutput):
 
 class BaseLLMOutput(BaseAppOutput):
     """Base class for LLM outputs with common fields."""
-
-    done: bool = Field(default=False, description="Whether this is the final chunk")
+    response: str = Field(description="The generated text response")
 
 class LLMUsageMixin(BaseModel):
     """Mixin for models that provide token usage statistics."""
     usage: Optional[LLMUsage] = Field(
-        description="Token usage statistics"
+        description="Token usage statistics",
+        default=None
+    )
+
+class ReasoningMixin(BaseModel):
+    """Mixin for models that support reasoning."""
+    reasoning: Optional[str] = Field(
+        description="The reasoning output of the model",
+        default=None
+    )
+
+class ToolCallsMixin(BaseModel):
+    """Mixin for models that support tool calls."""
+    tool_calls: Optional[List[Dict[str, Any]]] = Field(
+        description="Tool calls for function calling",
+        default=None
     )
 
 # Example of how to use:
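
Note: the new mixin classes let an output model opt into usage, reasoning, and tool-call fields independently of the base response field. A minimal sketch of how they might be composed, assuming the module path of this sdist and that BaseAppOutput adds no further required fields (the class name and values below are illustrative, not part of the package):

# Illustrative composition of the new mixins; MyLLMOutput is a hypothetical name.
from inferencesh.models.llm import BaseLLMOutput, LLMUsageMixin, ReasoningMixin, ToolCallsMixin

class MyLLMOutput(BaseLLMOutput, LLMUsageMixin, ReasoningMixin, ToolCallsMixin):
    """Response text plus optional usage, reasoning and tool calls."""
    pass

out = MyLLMOutput(
    response="Hello!",
    reasoning="The user greeted me, so greet back.",
)
# usage and tool_calls fall back to the mixins' default=None
print(out.usage, out.tool_calls)
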
@@ -343,26 +356,28 @@ class StreamResponse:
 
         return has_content or has_tool_calls or has_usage or has_finish
 
-    def to_output(self, buffer: str, transformer: Any) ->
+    def to_output(self, buffer: str, transformer: Any) -> tuple[BaseLLMOutput, str]:
         """Convert current state to LLMOutput."""
-
+        # Create usage object if we have stats
+        usage = None
+        if any(self.usage_stats.values()):
+            usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
 
-
-
+        buffer, output, _ = transformer(self.content, buffer, usage)
+
+        # Add tool calls if present and supported
+        if self.tool_calls and hasattr(output, 'tool_calls'):
            output.tool_calls = self.tool_calls
 
-        # Add usage stats
-        output.usage = LLMUsage(
-            stop_reason=self.usage_stats["stop_reason"],
-            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
-            tokens_per_second=self.timing_stats["tokens_per_second"],
-            prompt_tokens=self.usage_stats["prompt_tokens"],
-            completion_tokens=self.usage_stats["completion_tokens"],
-            total_tokens=self.usage_stats["total_tokens"],
-            reasoning_time=self.timing_stats["reasoning_time"],
-            reasoning_tokens=self.timing_stats["reasoning_tokens"]
-        )
-
         return output, buffer
 
 class ResponseState:
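
Note: usage is now built once from the accumulated stats and passed into the transformer call, instead of being attached to the output afterwards. A rough sketch of the object that to_output() hands over, assuming the module path of this sdist (the field values here are made up; the keyword arguments mirror the diff above):

# Illustrative only: the usage object that flows into transformer(content, buffer, usage).
from inferencesh.models.llm import LLMUsage

usage = LLMUsage(
    stop_reason="stop",
    time_to_first_token=0.42,
    tokens_per_second=85.0,
    prompt_tokens=12,
    completion_tokens=34,
    total_tokens=46,
    reasoning_time=0.0,
    reasoning_tokens=0,
)
# StreamResponse.to_output() then calls: transformer(self.content, buffer, usage)
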
@@ -374,6 +389,7 @@ class ResponseState:
         self.function_calls = None  # For future function calling support
         self.tool_calls = None  # List to accumulate tool calls
         self.current_tool_call = None  # Track current tool call being built
+        self.usage = None  # Add usage field
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
@@ -385,7 +401,7 @@ class ResponseState:
 
 class ResponseTransformer:
     """Base class for transforming model responses."""
-    def __init__(self, output_cls: type[
+    def __init__(self, output_cls: type[BaseLLMOutput] = LLMOutput):
         self.state = ResponseState()
         self.output_cls = output_cls
         self.timing = None  # Will be set by stream_generate
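
Note: since the constructor now defaults output_cls to LLMOutput but accepts any BaseLLMOutput subclass, an app can plug its own output type into the existing transformer. A minimal sketch, assuming the module path of this sdist (the subclass name is illustrative):

# Illustrative only: reuse the transformer with a custom output class.
from inferencesh.models.llm import ResponseTransformer, BaseLLMOutput, ReasoningMixin

class ReasoningOutput(BaseLLMOutput, ReasoningMixin):
    """Hypothetical output type carrying response plus optional reasoning."""
    pass

transformer = ResponseTransformer(output_cls=ReasoningOutput)
# build_output() will now instantiate ReasoningOutput instead of LLMOutput,
# adding reasoning/usage/tool_calls only when the streamed state contains them.
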
@@ -496,28 +512,43 @@ class ResponseTransformer:
         Returns:
             Tuple of (buffer, LLMOutput, state_changes)
         """
+        # Build base output with required fields
+        output_data = {
+            "response": self.state.response.strip(),
+        }
+
+        # Add optional fields if they exist
+        if self.state.usage is not None:
+            output_data["usage"] = self.state.usage
+        if self.state.reasoning:
+            output_data["reasoning"] = self.state.reasoning.strip()
+        if self.state.function_calls:
+            output_data["function_calls"] = self.state.function_calls
+        if self.state.tool_calls:
+            output_data["tool_calls"] = self.state.tool_calls
+
+        output = self.output_cls(**output_data)
+
         return (
             self.state.buffer,
-
-                response=self.state.response.strip(),
-                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
-                function_calls=self.state.function_calls,
-                tool_calls=self.state.tool_calls
-            ),
+            output,
             self.state.state_changes
         )
 
-    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+    def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None) -> tuple[str, LLMOutput, dict]:
         """Transform a piece of text and return the result.
 
         Args:
             piece: New piece of text to transform
             buffer: Existing buffer content
+            usage: Optional usage statistics
 
         Returns:
             Tuple of (new_buffer, output, state_changes)
         """
         self.state.buffer = buffer
+        if usage is not None:
+            self.state.usage = usage
         self.transform_chunk(piece)
         return self.build_output()
 
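
Note: build_output() now assembles a kwargs dict with the required response field and adds optional fields only when the state actually has them, then instantiates whatever output_cls was configured. A standalone sketch of that construction pattern in plain pydantic (class and values are illustrative, not the package's own types):

# Standalone sketch: required fields always, optional fields only when present.
from typing import Optional
from pydantic import BaseModel, Field

class SketchOutput(BaseModel):
    response: str
    reasoning: Optional[str] = Field(default=None)
    usage: Optional[dict] = Field(default=None)

state = {"response": " hello ", "reasoning": "", "usage": {"total_tokens": 46}}

data = {"response": state["response"].strip()}
if state["usage"] is not None:          # usage checked against None, like the diff
    data["usage"] = state["usage"]
if state["reasoning"]:                  # empty reasoning stays at its default
    data["reasoning"] = state["reasoning"].strip()

print(SketchOutput(**data))             # reasoning stays None; usage is included
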
@@ -533,36 +564,108 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
     verbose: bool = False,
-
+    output_cls: type[BaseLLMOutput] = LLMOutput,
+) -> Generator[BaseLLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+
+    # Create queues for communication between threads
+    response_queue = Queue()
+    error_queue = Queue()
+    keep_alive_queue = Queue()
+
+    # Set the output class for the transformer
+    transformer.output_cls = output_cls
+
+    def _generate_worker():
+        """Worker thread to run the model generation."""
+        try:
+            # Build completion kwargs
+            completion_kwargs = {
+                "messages": messages,
+                "stream": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "max_tokens": max_tokens,
+                "stop": stop
+            }
+            if tools is not None:
+                completion_kwargs["tools"] = tools
+            if tool_choice is not None:
+                completion_kwargs["tool_choice"] = tool_choice
+
+            # Signal that we're starting
+            keep_alive_queue.put(("init", time.time()))
+
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                response_queue.put(("chunk", chunk))
+                # Update keep-alive timestamp
+                keep_alive_queue.put(("alive", time.time()))
+
+            # Signal completion
+            response_queue.put(("done", None))
+
+        except Exception as e:
+            error_queue.put(e)
+            response_queue.put(("error", str(e)))
+
     with timing_context() as timing:
         transformer.timing = timing
 
-        #
-
-
-            "stream": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "max_tokens": max_tokens,
-            "stop": stop
-        }
-        if tools is not None:
-            completion_kwargs["tools"] = tools
-        if tool_choice is not None:
-            completion_kwargs["tool_choice"] = tool_choice
+        # Start generation thread
+        generation_thread = Thread(target=_generate_worker, daemon=True)
+        generation_thread.start()
 
         # Initialize response state
         response = StreamResponse()
         buffer = ""
 
+        # Keep-alive tracking
+        last_activity = time.time()
+        init_timeout = 30.0  # 30 seconds for initial response
+        chunk_timeout = 10.0  # 10 seconds between chunks
+
         try:
-
+            # Wait for initial setup
+            try:
+                msg_type, timestamp = keep_alive_queue.get(timeout=init_timeout)
+                if msg_type != "init":
+                    raise RuntimeError("Unexpected initialization message")
+                last_activity = timestamp
+            except Empty:
+                raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")
 
-
-
-
-
+            while True:
+                # Check for errors
+                if not error_queue.empty():
+                    raise error_queue.get()
+
+                # Check keep-alive
+                while not keep_alive_queue.empty():
+                    _, timestamp = keep_alive_queue.get_nowait()
+                    last_activity = timestamp
+
+                # Check for timeout
+                if time.time() - last_activity > chunk_timeout:
+                    raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
+
+                # Get next chunk
+                try:
+                    msg_type, data = response_queue.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if msg_type == "error":
+                    raise RuntimeError(f"Generation error: {data}")
+                elif msg_type == "done":
+                    break
+
+                chunk = data
+
+                # Mark first token time
                 if not timing.first_token_time:
                     timing.mark_first_token()
 
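
Note: generation now runs on a daemon thread that feeds chunks and heartbeat timestamps through queues, while the consumer enforces an init timeout and an inter-chunk timeout so a stalled model cannot hang the caller. A standalone sketch of that watchdog pattern using only the standard library, with a fake producer standing in for the model (names and timeouts are illustrative):

# Illustration of the queue-based keep-alive pattern used above (not the package API).
import time
from queue import Queue, Empty
from threading import Thread

response_queue: Queue = Queue()
keep_alive_queue: Queue = Queue()

def producer() -> None:
    # Stand-in for the model: heartbeat first, then chunks, then a done marker.
    keep_alive_queue.put(("init", time.time()))
    for piece in ["Hel", "lo ", "wor", "ld"]:
        time.sleep(0.2)                      # simulated generation latency
        response_queue.put(("chunk", piece))
        keep_alive_queue.put(("alive", time.time()))
    response_queue.put(("done", None))

Thread(target=producer, daemon=True).start()

init_timeout, chunk_timeout = 5.0, 2.0
_, last_activity = keep_alive_queue.get(timeout=init_timeout)  # wait for "init"

while True:
    # Drain heartbeats and check the producer has not stalled.
    while not keep_alive_queue.empty():
        _, last_activity = keep_alive_queue.get_nowait()
    if time.time() - last_activity > chunk_timeout:
        raise RuntimeError("producer stalled")
    try:
        msg_type, data = response_queue.get(timeout=0.1)
    except Empty:
        continue
    if msg_type == "done":
        break
    print(data, end="", flush=True)
print()
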
@@ -577,6 +680,13 @@ def stream_generate(
                 # Break if we're done
                 if response.finish_reason:
                     break
+
+            # Wait for generation thread to finish
+            generation_thread.join(timeout=5.0)  # Increased timeout to 5 seconds
+            if generation_thread.is_alive():
+                # Thread didn't finish - this shouldn't happen normally
+                # but we handle it gracefully
+                raise RuntimeError("Generation thread failed to finish")
 
         except Exception as e:
             # Ensure any error is properly propagated
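
Note: the join-with-timeout added here keeps the consumer from waiting forever on a worker that never exits, and because the thread was started as a daemon a stuck worker also cannot keep the process alive. A tiny standalone sketch of that guard (throwaway worker, illustrative names):

# Illustrative only: join a worker with a timeout and fail loudly if it hangs.
import time
from threading import Thread

worker = Thread(target=lambda: time.sleep(0.1), daemon=True)
worker.start()
worker.join(timeout=5.0)
if worker.is_alive():
    raise RuntimeError("Generation thread failed to finish")
print("worker exited cleanly")
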