inferencesh 0.2.26-py3-none-any.whl → 0.2.28-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of inferencesh might be problematic.
- inferencesh/models/llm.py +194 -77
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/METADATA +1 -1
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/RECORD +7 -7
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/WHEEL +0 -0
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/entry_points.txt +0 -0
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/licenses/LICENSE +0 -0
- {inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/top_level.txt +0 -0
inferencesh/models/llm.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import Optional, List, Any, Callable, Dict, Generator
 from enum import Enum
-from pydantic import Field
-from queue import Queue
+from pydantic import Field, BaseModel
+from queue import Queue, Empty
 from threading import Thread
 import time
 from contextlib import contextmanager
@@ -33,7 +33,8 @@ class ContextMessage(BaseAppInput):
         default=None
     )

-class LLMInput(BaseAppInput):
+class BaseLLMInput(BaseAppInput):
+    """Base class with common LLM fields."""
     system_prompt: str = Field(
         description="The system prompt to use for the model",
         default="You are a helpful assistant that can answer questions and help with tasks.",
@@ -47,25 +48,13 @@ class LLMInput(BaseAppInput):
     )
     context: List[ContextMessage] = Field(
         description="The context to use for the model",
+        default=[],
         examples=[
             [
-                {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
+                {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
                 {"role": "assistant", "content": [{"type": "text", "text": "The capital of France is Paris."}]}
-            ],
-            [
-                {"role": "user", "content": [{"type": "text", "text": "What is the weather like today?"}]},
-                {"role": "assistant", "content": [{"type": "text", "text": "I apologize, but I don't have access to real-time weather information. You would need to check a weather service or app to get current weather conditions for your location."}]}
-            ],
-            [
-                {"role": "user", "content": [{"type": "text", "text": "Can you help me write a poem about spring?"}]},
-                {"role": "assistant", "content": [{"type": "text", "text": "Here's a short poem about spring:\n\nGreen buds awakening,\nSoft rain gently falling down,\nNew life springs anew.\n\nWarm sun breaks through clouds,\nBirds return with joyful song,\nNature's sweet rebirth."}]}
-            ],
-            [
-                {"role": "user", "content": [{"type": "text", "text": "Explain quantum computing in simple terms"}]},
-                {"role": "assistant", "content": [{"type": "text", "text": "Quantum computing is like having a super-powerful calculator that can solve many problems at once instead of one at a time. While regular computers use bits (0s and 1s), quantum computers use quantum bits or \"qubits\" that can be both 0 and 1 at the same time - kind of like being in two places at once! This allows them to process huge amounts of information much faster than regular computers for certain types of problems."}]}
             ]
-        ]
-        default=[]
+        ]
     )
     text: str = Field(
         description="The user prompt to use for the model",
@@ -74,22 +63,41 @@ class LLMInput(BaseAppInput):
             "What is the weather like today?",
             "Can you help me write a poem about spring?",
             "Explain quantum computing in simple terms"
-        ]
-    )
-    image: Optional[File] = Field(
-        description="The image to use for the model",
-        default=None
+        ]
     )
-    # Optional parameters
     temperature: float = Field(default=0.7)
     top_p: float = Field(default=0.95)
     max_tokens: int = Field(default=4096)
     context_size: int = Field(default=4096)
-
-
-
-
-
+
+class ImageCapabilityMixin(BaseModel):
+    """Mixin for models that support image inputs."""
+    image: Optional[File] = Field(
+        description="The image to use for the model",
+        default=None
+    )
+
+class ReasoningCapabilityMixin(BaseModel):
+    """Mixin for models that support reasoning."""
+    reasoning: bool = Field(
+        description="Enable step-by-step reasoning",
+        default=False
+    )
+
+class ToolsCapabilityMixin(BaseModel):
+    """Mixin for models that support tool/function calling."""
+    tools: Optional[List[Dict[str, Any]]] = Field(
+        description="Tool definitions for function calling",
+        default=None
+    )
+
+# Example of how to use:
+class LLMInput(BaseLLMInput):
+    """Default LLM input model with no special capabilities."""
+    pass
+
+# For backward compatibility
+LLMInput.model_config["title"] = "LLMInput"

 class LLMUsage(BaseAppOutput):
     stop_reason: str = ""
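The hunk above splits the old monolithic LLMInput into BaseLLMInput plus opt-in capability mixins. To illustrate how an app might compose them, here is a minimal sketch; it is not part of the diff, the subclass name is hypothetical, and the import path is assumed from the wheel layout:

    # Hypothetical composition of the capability mixins added in 0.2.28.
    # Assumes these names are importable from inferencesh.models.llm.
    from inferencesh.models.llm import (
        BaseLLMInput,
        ImageCapabilityMixin,
        ToolsCapabilityMixin,
    )

    class VisionToolLLMInput(BaseLLMInput, ImageCapabilityMixin, ToolsCapabilityMixin):
        """Input model for an app that accepts an image and tool definitions."""
        pass

Because the mixins are plain pydantic BaseModel subclasses, field validation and schema generation on the combined class should behave the same way they did on the old flat LLMInput.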
@@ -102,12 +110,24 @@ class LLMUsage(BaseAppOutput):
     reasoning_time: float = 0.0


-class LLMOutput(BaseAppOutput):
-
-
-
-
+class BaseLLMOutput(BaseAppOutput):
+    """Base class for LLM outputs with common fields."""
+    response: str = Field(description="The generated text response")
+
+class LLMUsageMixin(BaseModel):
+    """Mixin for models that provide token usage statistics."""
+    usage: Optional[LLMUsage] = Field(
+        description="Token usage statistics",
+        default=None
+    )
+
+# Example of how to use:
+class LLMOutput(BaseLLMOutput, LLMUsageMixin):
+    """Default LLM output model with token usage tracking."""
+    pass

+# For backward compatibility
+LLMOutput.model_config["title"] = "LLMOutput"

 @contextmanager
 def timing_context():
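The output side follows the same pattern: BaseLLMOutput carries the response text and LLMUsageMixin adds optional usage statistics. A hedged sketch of a custom output class (the class name and extra field are illustrative, not part of the package):

    from typing import Any, Dict, List, Optional
    from pydantic import Field

    # Assumes BaseLLMOutput and LLMUsageMixin from this diff are importable.
    from inferencesh.models.llm import BaseLLMOutput, LLMUsageMixin

    class ToolCallingLLMOutput(BaseLLMOutput, LLMUsageMixin):
        """Output model that also surfaces tool calls (illustrative)."""
        tool_calls: Optional[List[Dict[str, Any]]] = Field(default=None)

Declaring the extra field matters because, further down, StreamResponse.to_output only copies accumulated tool calls onto the output when the output class actually defines a tool_calls attribute (the new hasattr check).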
@@ -322,26 +342,28 @@ class StreamResponse:

         return has_content or has_tool_calls or has_usage or has_finish

-    def to_output(self, buffer: str, transformer: Any) ->
+    def to_output(self, buffer: str, transformer: Any) -> tuple[BaseLLMOutput, str]:
         """Convert current state to LLMOutput."""
-
+        # Create usage object if we have stats
+        usage = None
+        if any(self.usage_stats.values()):
+            usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
+
+        buffer, output, _ = transformer(self.content, buffer, usage)

-        # Add tool calls if present
-        if self.tool_calls:
+        # Add tool calls if present and supported
+        if self.tool_calls and hasattr(output, 'tool_calls'):
             output.tool_calls = self.tool_calls

-        # Add usage stats
-        output.usage = LLMUsage(
-            stop_reason=self.usage_stats["stop_reason"],
-            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
-            tokens_per_second=self.timing_stats["tokens_per_second"],
-            prompt_tokens=self.usage_stats["prompt_tokens"],
-            completion_tokens=self.usage_stats["completion_tokens"],
-            total_tokens=self.usage_stats["total_tokens"],
-            reasoning_time=self.timing_stats["reasoning_time"],
-            reasoning_tokens=self.timing_stats["reasoning_tokens"]
-        )
-
         return output, buffer

 class ResponseState:
@@ -353,6 +375,7 @@ class ResponseState:
         self.function_calls = None # For future function calling support
         self.tool_calls = None # List to accumulate tool calls
         self.current_tool_call = None # Track current tool call being built
+        self.usage = None # Add usage field
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
@@ -364,7 +387,7 @@ class ResponseState:

 class ResponseTransformer:
     """Base class for transforming model responses."""
-    def __init__(self, output_cls: type[
+    def __init__(self, output_cls: type[BaseLLMOutput] = LLMOutput):
         self.state = ResponseState()
         self.output_cls = output_cls
         self.timing = None # Will be set by stream_generate
@@ -475,28 +498,43 @@ class ResponseTransformer:
         Returns:
             Tuple of (buffer, LLMOutput, state_changes)
         """
+        # Build base output with required fields
+        output_data = {
+            "response": self.state.response.strip(),
+        }
+
+        # Add optional fields if they exist
+        if self.state.usage is not None:
+            output_data["usage"] = self.state.usage
+        if self.state.reasoning:
+            output_data["reasoning"] = self.state.reasoning.strip()
+        if self.state.function_calls:
+            output_data["function_calls"] = self.state.function_calls
+        if self.state.tool_calls:
+            output_data["tool_calls"] = self.state.tool_calls
+
+        output = self.output_cls(**output_data)
+
         return (
             self.state.buffer,
-
-                response=self.state.response.strip(),
-                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
-                function_calls=self.state.function_calls,
-                tool_calls=self.state.tool_calls
-            ),
+            output,
             self.state.state_changes
         )

-    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+    def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None) -> tuple[str, LLMOutput, dict]:
         """Transform a piece of text and return the result.

         Args:
             piece: New piece of text to transform
             buffer: Existing buffer content
+            usage: Optional usage statistics

         Returns:
             Tuple of (new_buffer, output, state_changes)
         """
         self.state.buffer = buffer
+        if usage is not None:
+            self.state.usage = usage
         self.transform_chunk(piece)
         return self.build_output()
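Taken together with the output classes above, build_output now assembles a dict of whichever fields are populated and instantiates self.output_cls from it, while __call__ threads usage statistics into the state. A small sketch of how a caller might exercise this, assuming the ToolCallingLLMOutput example from earlier and that the base transform_chunk implementation is usable as-is:

    # Illustrative only: bind the transformer to a custom output class so that
    # build_output() constructs ToolCallingLLMOutput instead of the default LLMOutput.
    from inferencesh.models.llm import ResponseTransformer  # assumed import path

    transformer = ResponseTransformer(output_cls=ToolCallingLLMOutput)

    # The new three-argument call also carries usage statistics (here None).
    new_buffer, output, state_changes = transformer("Hello, ", "", usage=None)
    print(type(output).__name__, state_changes)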
@@ -512,36 +550,108 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
     verbose: bool = False,
-
+    output_cls: type[BaseLLMOutput] = LLMOutput,
+) -> Generator[BaseLLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+
+    # Create queues for communication between threads
+    response_queue = Queue()
+    error_queue = Queue()
+    keep_alive_queue = Queue()
+
+    # Set the output class for the transformer
+    transformer.output_cls = output_cls
+
+    def _generate_worker():
+        """Worker thread to run the model generation."""
+        try:
+            # Build completion kwargs
+            completion_kwargs = {
+                "messages": messages,
+                "stream": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "max_tokens": max_tokens,
+                "stop": stop
+            }
+            if tools is not None:
+                completion_kwargs["tools"] = tools
+            if tool_choice is not None:
+                completion_kwargs["tool_choice"] = tool_choice
+
+            # Signal that we're starting
+            keep_alive_queue.put(("init", time.time()))
+
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                response_queue.put(("chunk", chunk))
+                # Update keep-alive timestamp
+                keep_alive_queue.put(("alive", time.time()))
+
+            # Signal completion
+            response_queue.put(("done", None))
+
+        except Exception as e:
+            error_queue.put(e)
+            response_queue.put(("error", str(e)))
+
     with timing_context() as timing:
         transformer.timing = timing

-        #
-
-
-            "stream": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "max_tokens": max_tokens,
-            "stop": stop
-        }
-        if tools is not None:
-            completion_kwargs["tools"] = tools
-        if tool_choice is not None:
-            completion_kwargs["tool_choice"] = tool_choice
+        # Start generation thread
+        generation_thread = Thread(target=_generate_worker, daemon=True)
+        generation_thread.start()

         # Initialize response state
         response = StreamResponse()
         buffer = ""

+        # Keep-alive tracking
+        last_activity = time.time()
+        init_timeout = 30.0 # 30 seconds for initial response
+        chunk_timeout = 10.0 # 10 seconds between chunks
+
         try:
-
+            # Wait for initial setup
+            try:
+                msg_type, timestamp = keep_alive_queue.get(timeout=init_timeout)
+                if msg_type != "init":
+                    raise RuntimeError("Unexpected initialization message")
+                last_activity = timestamp
+            except Empty:
+                raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")

-
-
-
-
+            while True:
+                # Check for errors
+                if not error_queue.empty():
+                    raise error_queue.get()
+
+                # Check keep-alive
+                while not keep_alive_queue.empty():
+                    _, timestamp = keep_alive_queue.get_nowait()
+                    last_activity = timestamp
+
+                # Check for timeout
+                if time.time() - last_activity > chunk_timeout:
+                    raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
+
+                # Get next chunk
+                try:
+                    msg_type, data = response_queue.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if msg_type == "error":
+                    raise RuntimeError(f"Generation error: {data}")
+                elif msg_type == "done":
+                    break
+
+                chunk = data
+
+                # Mark first token time
                 if not timing.first_token_time:
                     timing.mark_first_token()
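The new streaming loop runs generation on a daemon worker thread and watches two queues: one for chunks and one for a keep-alive heartbeat, failing fast if the model stalls (30 seconds to first activity, 10 seconds between chunks). A hedged end-to-end sketch of driving it; the import paths, the llama-cpp-python model object, the keyword parameter names, and the assumption that each yielded item is the latest output snapshot are inferred rather than taken from this diff:

    # Hedged usage sketch; names and paths below are assumptions, not package docs.
    from llama_cpp import Llama  # llama-cpp-python
    from inferencesh.models.llm import LLMOutput, ResponseTransformer, stream_generate

    model = Llama(model_path="model.gguf", n_ctx=4096)  # hypothetical local GGUF file
    messages = [{"role": "user", "content": "What is the capital of France?"}]
    transformer = ResponseTransformer(output_cls=LLMOutput)

    final = None
    for partial in stream_generate(
        model=model,
        messages=messages,
        transformer=transformer,
        temperature=0.7,
        top_p=0.95,
        max_tokens=256,
        output_cls=LLMOutput,
    ):
        final = partial  # assumed: each item is the latest LLMOutput snapshot

    if final is not None:
        print(final.response)
        if final.usage is not None:
            print(final.usage.stop_reason, final.usage.total_tokens)

The daemon worker plus the join(timeout=5.0) added in the next hunk is what keeps a hung llama.cpp call from blocking the caller indefinitely.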
@@ -556,6 +666,13 @@ def stream_generate(
                 # Break if we're done
                 if response.finish_reason:
                     break
+
+            # Wait for generation thread to finish
+            generation_thread.join(timeout=5.0) # Increased timeout to 5 seconds
+            if generation_thread.is_alive():
+                # Thread didn't finish - this shouldn't happen normally
+                # but we handle it gracefully
+                raise RuntimeError("Generation thread failed to finish")

         except Exception as e:
             # Ensure any error is properly propagated
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/RECORD
CHANGED
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=
+inferencesh/models/llm.py,sha256=E2Mz56Cu_GODDhnNKE5gE5pOTgX4ekJv6UdO44wWON8,25806
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.26.dist-info/licenses/LICENSE,sha256=
-inferencesh-0.2.26.dist-info/METADATA,sha256=
-inferencesh-0.2.26.dist-info/WHEEL,sha256=
-inferencesh-0.2.26.dist-info/entry_points.txt,sha256=
-inferencesh-0.2.26.dist-info/top_level.txt,sha256=
-inferencesh-0.2.26.dist-info/RECORD,,
+inferencesh-0.2.28.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.28.dist-info/METADATA,sha256=9TxV1q5wsokL3de27EJKvRr9MFfOi86rxzoEEnKVTSU,2757
+inferencesh-0.2.28.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.28.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.28.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.28.dist-info/RECORD,,
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/WHEEL
File without changes
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/entry_points.txt
File without changes
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/licenses/LICENSE
File without changes
{inferencesh-0.2.26.dist-info → inferencesh-0.2.28.dist-info}/top_level.txt
File without changes