inferencesh 0.2.27-py3-none-any.whl → 0.2.28-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of inferencesh might be problematic.
- inferencesh/models/llm.py +144 -48
- {inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/METADATA +1 -1
- {inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/RECORD +7 -7
- {inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/WHEEL +0 -0
- {inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/entry_points.txt +0 -0
- {inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/licenses/LICENSE +0 -0
- {inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/top_level.txt +0 -0
inferencesh/models/llm.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import Optional, List, Any, Callable, Dict, Generator
 from enum import Enum
 from pydantic import Field, BaseModel
-from queue import Queue
+from queue import Queue, Empty
 from threading import Thread
 import time
 from contextlib import contextmanager
@@ -9,7 +9,6 @@ import base64

 from .base import BaseAppInput, BaseAppOutput
 from .file import File
-from .types import ContextMessage

 class ContextMessageRole(str, Enum):
     USER = "user"
@@ -113,13 +112,13 @@ class LLMUsage(BaseAppOutput):

 class BaseLLMOutput(BaseAppOutput):
     """Base class for LLM outputs with common fields."""
-
-    done: bool = Field(default=False, description="Whether this is the final chunk")
+    response: str = Field(description="The generated text response")

 class LLMUsageMixin(BaseModel):
     """Mixin for models that provide token usage statistics."""
     usage: Optional[LLMUsage] = Field(
-        description="Token usage statistics"
+        description="Token usage statistics",
+        default=None
     )

     # Example of how to use:
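For orientation, here is a minimal sketch of how the reworked output classes compose after this change: `response` is now the required field on `BaseLLMOutput`, the old `done` flag is gone, and `usage` is optional with a `None` default. It assumes pydantic and uses simplified field sets; the real `BaseAppOutput` base and full `LLMUsage` definition are not shown in this hunk.

```python
from typing import Optional
from pydantic import BaseModel, Field

class LLMUsage(BaseModel):
    # Only a subset of the usage fields referenced elsewhere in this diff.
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0

class BaseLLMOutput(BaseModel):
    """Base class for LLM outputs with common fields."""
    response: str = Field(description="The generated text response")

class LLMUsageMixin(BaseModel):
    """Mixin for models that provide token usage statistics."""
    usage: Optional[LLMUsage] = Field(
        description="Token usage statistics",
        default=None,
    )

class LLMOutput(LLMUsageMixin, BaseLLMOutput):
    """Stand-in for the package's LLMOutput; the real class has more fields."""
    reasoning: Optional[str] = None

# `usage` now defaults to None, so a plain text response validates on its own:
print(LLMOutput(response="hello"))
```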
@@ -343,26 +342,28 @@ class StreamResponse:

         return has_content or has_tool_calls or has_usage or has_finish

-    def to_output(self, buffer: str, transformer: Any) ->
+    def to_output(self, buffer: str, transformer: Any) -> tuple[BaseLLMOutput, str]:
         """Convert current state to LLMOutput."""
-
+        # Create usage object if we have stats
+        usage = None
+        if any(self.usage_stats.values()):
+            usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
+
+        buffer, output, _ = transformer(self.content, buffer, usage)

-        # Add tool calls if present
-        if self.tool_calls:
+        # Add tool calls if present and supported
+        if self.tool_calls and hasattr(output, 'tool_calls'):
             output.tool_calls = self.tool_calls

-        # Add usage stats
-        output.usage = LLMUsage(
-            stop_reason=self.usage_stats["stop_reason"],
-            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
-            tokens_per_second=self.timing_stats["tokens_per_second"],
-            prompt_tokens=self.usage_stats["prompt_tokens"],
-            completion_tokens=self.usage_stats["completion_tokens"],
-            total_tokens=self.usage_stats["total_tokens"],
-            reasoning_time=self.timing_stats["reasoning_time"],
-            reasoning_tokens=self.timing_stats["reasoning_tokens"]
-        )
-
         return output, buffer

 class ResponseState:
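The rewritten to_output() builds the LLMUsage object first, passes it into the transformer call, and only attaches tool calls when the configured output class actually exposes a tool_calls field. A small hedged illustration of that guard; the class names below are hypothetical:

```python
from typing import Any, List, Optional
from pydantic import BaseModel

class MinimalOutput(BaseModel):
    response: str

class ToolAwareOutput(BaseModel):
    response: str
    tool_calls: Optional[List[Any]] = None

def attach_tool_calls(output: BaseModel, tool_calls: list) -> BaseModel:
    # Mirrors the new guard: assign only if tool calls exist and are supported.
    if tool_calls and hasattr(output, "tool_calls"):
        output.tool_calls = tool_calls
    return output

print(attach_tool_calls(MinimalOutput(response="hi"), [{"name": "search"}]))
# response='hi'  (left untouched, no AttributeError)
print(attach_tool_calls(ToolAwareOutput(response="hi"), [{"name": "search"}]))
# response='hi' tool_calls=[{'name': 'search'}]
```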
@@ -374,6 +375,7 @@ class ResponseState:
         self.function_calls = None # For future function calling support
         self.tool_calls = None # List to accumulate tool calls
         self.current_tool_call = None # Track current tool call being built
+        self.usage = None # Add usage field
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
@@ -385,7 +387,7 @@ class ResponseState:

 class ResponseTransformer:
     """Base class for transforming model responses."""
-    def __init__(self, output_cls: type[
+    def __init__(self, output_cls: type[BaseLLMOutput] = LLMOutput):
         self.state = ResponseState()
         self.output_cls = output_cls
         self.timing = None # Will be set by stream_generate
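__init__ is now parameterized by output_cls, so callers can have the transformer emit their own BaseLLMOutput subclass. A hedged sketch of the idea; CitedOutput and the reduced transformer below are hypothetical stand-ins, not the package's classes:

```python
from typing import List, Optional
from pydantic import BaseModel

class BaseLLMOutput(BaseModel):
    response: str

class CitedOutput(BaseLLMOutput):
    # Hypothetical custom output carrying extra structured data.
    citations: Optional[List[str]] = None

class TransformerSketch:
    """Simplified stand-in for ResponseTransformer's new constructor."""
    def __init__(self, output_cls: type[BaseLLMOutput] = BaseLLMOutput):
        self.output_cls = output_cls

    def build_output(self, text: str) -> BaseLLMOutput:
        # build_output() instantiates whichever class was configured.
        return self.output_cls(response=text)

transformer = TransformerSketch(output_cls=CitedOutput)
print(type(transformer.build_output("hi")).__name__)  # CitedOutput
```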
@@ -496,28 +498,43 @@ class ResponseTransformer:
         Returns:
             Tuple of (buffer, LLMOutput, state_changes)
         """
+        # Build base output with required fields
+        output_data = {
+            "response": self.state.response.strip(),
+        }
+
+        # Add optional fields if they exist
+        if self.state.usage is not None:
+            output_data["usage"] = self.state.usage
+        if self.state.reasoning:
+            output_data["reasoning"] = self.state.reasoning.strip()
+        if self.state.function_calls:
+            output_data["function_calls"] = self.state.function_calls
+        if self.state.tool_calls:
+            output_data["tool_calls"] = self.state.tool_calls
+
+        output = self.output_cls(**output_data)
+
         return (
             self.state.buffer,
-
-                response=self.state.response.strip(),
-                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
-                function_calls=self.state.function_calls,
-                tool_calls=self.state.tool_calls
-            ),
+            output,
             self.state.state_changes
         )

-    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+    def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None) -> tuple[str, LLMOutput, dict]:
         """Transform a piece of text and return the result.

         Args:
             piece: New piece of text to transform
             buffer: Existing buffer content
+            usage: Optional usage statistics

         Returns:
             Tuple of (new_buffer, output, state_changes)
         """
         self.state.buffer = buffer
+        if usage is not None:
+            self.state.usage = usage
         self.transform_chunk(piece)
         return self.build_output()

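build_output() now assembles a keyword dict and only includes optional fields that are set, and __call__ accepts the usage object computed upstream in to_output(). A simplified sketch of that calling convention, with the state reduced to a running string; field names follow the diff:

```python
from typing import Optional
from pydantic import BaseModel

class LLMUsage(BaseModel):
    total_tokens: int = 0

class LLMOutput(BaseModel):
    response: str
    usage: Optional[LLMUsage] = None

class TransformerSketch:
    """Stand-in showing the new __call__(piece, buffer, usage) shape."""
    def __init__(self):
        self.response = ""
        self.usage: Optional[LLMUsage] = None

    def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None):
        if usage is not None:
            self.usage = usage
        self.response += piece
        output_data = {"response": self.response.strip()}
        if self.usage is not None:
            output_data["usage"] = self.usage
        # Returns (new_buffer, output, state_changes), as documented above.
        return buffer, LLMOutput(**output_data), {}

t = TransformerSketch()
_, out, _ = t("Hello", "", usage=LLMUsage(total_tokens=3))
print(out.usage.total_tokens)  # 3
```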
@@ -533,36 +550,108 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
     verbose: bool = False,
-
+    output_cls: type[BaseLLMOutput] = LLMOutput,
+) -> Generator[BaseLLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+
+    # Create queues for communication between threads
+    response_queue = Queue()
+    error_queue = Queue()
+    keep_alive_queue = Queue()
+
+    # Set the output class for the transformer
+    transformer.output_cls = output_cls
+
+    def _generate_worker():
+        """Worker thread to run the model generation."""
+        try:
+            # Build completion kwargs
+            completion_kwargs = {
+                "messages": messages,
+                "stream": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "max_tokens": max_tokens,
+                "stop": stop
+            }
+            if tools is not None:
+                completion_kwargs["tools"] = tools
+            if tool_choice is not None:
+                completion_kwargs["tool_choice"] = tool_choice
+
+            # Signal that we're starting
+            keep_alive_queue.put(("init", time.time()))
+
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                response_queue.put(("chunk", chunk))
+                # Update keep-alive timestamp
+                keep_alive_queue.put(("alive", time.time()))
+
+            # Signal completion
+            response_queue.put(("done", None))
+
+        except Exception as e:
+            error_queue.put(e)
+            response_queue.put(("error", str(e)))
+
     with timing_context() as timing:
         transformer.timing = timing

-        #
-
-
-            "stream": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "max_tokens": max_tokens,
-            "stop": stop
-        }
-        if tools is not None:
-            completion_kwargs["tools"] = tools
-        if tool_choice is not None:
-            completion_kwargs["tool_choice"] = tool_choice
+        # Start generation thread
+        generation_thread = Thread(target=_generate_worker, daemon=True)
+        generation_thread.start()

         # Initialize response state
         response = StreamResponse()
         buffer = ""

+        # Keep-alive tracking
+        last_activity = time.time()
+        init_timeout = 30.0 # 30 seconds for initial response
+        chunk_timeout = 10.0 # 10 seconds between chunks
+
         try:
-
+            # Wait for initial setup
+            try:
+                msg_type, timestamp = keep_alive_queue.get(timeout=init_timeout)
+                if msg_type != "init":
+                    raise RuntimeError("Unexpected initialization message")
+                last_activity = timestamp
+            except Empty:
+                raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")

-
-
-
-
+            while True:
+                # Check for errors
+                if not error_queue.empty():
+                    raise error_queue.get()
+
+                # Check keep-alive
+                while not keep_alive_queue.empty():
+                    _, timestamp = keep_alive_queue.get_nowait()
+                    last_activity = timestamp
+
+                # Check for timeout
+                if time.time() - last_activity > chunk_timeout:
+                    raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
+
+                # Get next chunk
+                try:
+                    msg_type, data = response_queue.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if msg_type == "error":
+                    raise RuntimeError(f"Generation error: {data}")
+                elif msg_type == "done":
+                    break
+
+                chunk = data
+
+                # Mark first token time
                 if not timing.first_token_time:
                     timing.mark_first_token()

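The streaming loop now runs the model in a daemon thread and coordinates over three queues (chunks, errors, keep-alive timestamps), with a 30-second init timeout and a 10-second per-chunk timeout. A self-contained sketch of the same producer/consumer pattern, with a fake producer standing in for the model call:

```python
import time
from queue import Queue, Empty
from threading import Thread

response_queue: Queue = Queue()
keep_alive_queue: Queue = Queue()

def worker() -> None:
    # Signal that setup finished, then stream fake chunks with heartbeats.
    keep_alive_queue.put(("init", time.time()))
    for piece in ["Hel", "lo, ", "world"]:
        time.sleep(0.05)
        response_queue.put(("chunk", piece))
        keep_alive_queue.put(("alive", time.time()))
    response_queue.put(("done", None))

Thread(target=worker, daemon=True).start()

init_timeout, chunk_timeout = 30.0, 10.0
# Raises queue.Empty if the worker never signals readiness in time.
_, last_activity = keep_alive_queue.get(timeout=init_timeout)

while True:
    # Fold any heartbeats into the activity timestamp.
    while not keep_alive_queue.empty():
        _, last_activity = keep_alive_queue.get_nowait()
    if time.time() - last_activity > chunk_timeout:
        raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
    try:
        msg_type, data = response_queue.get(timeout=0.1)
    except Empty:
        continue
    if msg_type == "done":
        break
    print(data, end="", flush=True)
print()
```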
@@ -577,6 +666,13 @@ def stream_generate(
                 # Break if we're done
                 if response.finish_reason:
                     break
+
+            # Wait for generation thread to finish
+            generation_thread.join(timeout=5.0) # Increased timeout to 5 seconds
+            if generation_thread.is_alive():
+                # Thread didn't finish - this shouldn't happen normally
+                # but we handle it gracefully
+                raise RuntimeError("Generation thread failed to finish")

         except Exception as e:
             # Ensure any error is properly propagated
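Taken together, stream_generate() keeps its generator interface but now accepts output_cls and yields BaseLLMOutput instances until the model reports a finish reason. A hedged usage sketch: the full parameter list is not visible in this diff, so the keyword names below (model, messages, transformer, temperature, top_p) are inferred from the hunk bodies, and the llama_cpp model loading is an assumption based on the docstring's mention of LLaMA.cpp.

```python
from llama_cpp import Llama  # assumption: llama-cpp-python backend
from inferencesh.models.llm import LLMOutput, ResponseTransformer, stream_generate

llm = Llama(model_path="model.gguf")  # hypothetical local model file

for output in stream_generate(
    model=llm,
    messages=[{"role": "user", "content": "Say hello."}],
    transformer=ResponseTransformer(),
    temperature=0.7,
    top_p=0.95,
    max_tokens=128,
    output_cls=LLMOutput,
):
    print(output.response)
    if output.usage is not None:
        print("total tokens:", output.usage.total_tokens)
```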
{inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/RECORD
CHANGED
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=
+inferencesh/models/llm.py,sha256=E2Mz56Cu_GODDhnNKE5gE5pOTgX4ekJv6UdO44wWON8,25806
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
+inferencesh-0.2.28.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.28.dist-info/METADATA,sha256=9TxV1q5wsokL3de27EJKvRr9MFfOi86rxzoEEnKVTSU,2757
+inferencesh-0.2.28.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.28.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.28.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.28.dist-info/RECORD,,
{inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/WHEEL: file without changes
{inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/entry_points.txt: file without changes
{inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/licenses/LICENSE: file without changes
{inferencesh-0.2.27.dist-info → inferencesh-0.2.28.dist-info}/top_level.txt: file without changes