inferencesh-0.2.26-py3-none-any.whl → inferencesh-0.2.28-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of inferencesh might be problematic.

inferencesh/models/llm.py CHANGED
@@ -1,7 +1,7 @@
  from typing import Optional, List, Any, Callable, Dict, Generator
  from enum import Enum
- from pydantic import Field
- from queue import Queue
+ from pydantic import Field, BaseModel
+ from queue import Queue, Empty
  from threading import Thread
  import time
  from contextlib import contextmanager
@@ -33,7 +33,8 @@ class ContextMessage(BaseAppInput):
          default=None
      )

- class LLMInput(BaseAppInput):
+ class BaseLLMInput(BaseAppInput):
+     """Base class with common LLM fields."""
      system_prompt: str = Field(
          description="The system prompt to use for the model",
          default="You are a helpful assistant that can answer questions and help with tasks.",
@@ -47,25 +48,13 @@ class LLMInput(BaseAppInput):
      )
      context: List[ContextMessage] = Field(
          description="The context to use for the model",
+         default=[],
          examples=[
              [
-                 {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
+                 {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
                  {"role": "assistant", "content": [{"type": "text", "text": "The capital of France is Paris."}]}
-             ],
-             [
-                 {"role": "user", "content": [{"type": "text", "text": "What is the weather like today?"}]},
-                 {"role": "assistant", "content": [{"type": "text", "text": "I apologize, but I don't have access to real-time weather information. You would need to check a weather service or app to get current weather conditions for your location."}]}
-             ],
-             [
-                 {"role": "user", "content": [{"type": "text", "text": "Can you help me write a poem about spring?"}]},
-                 {"role": "assistant", "content": [{"type": "text", "text": "Here's a short poem about spring:\n\nGreen buds awakening,\nSoft rain gently falling down,\nNew life springs anew.\n\nWarm sun breaks through clouds,\nBirds return with joyful song,\nNature's sweet rebirth."}]}
-             ],
-             [
-                 {"role": "user", "content": [{"type": "text", "text": "Explain quantum computing in simple terms"}]},
-                 {"role": "assistant", "content": [{"type": "text", "text": "Quantum computing is like having a super-powerful calculator that can solve many problems at once instead of one at a time. While regular computers use bits (0s and 1s), quantum computers use quantum bits or \"qubits\" that can be both 0 and 1 at the same time - kind of like being in two places at once! This allows them to process huge amounts of information much faster than regular computers for certain types of problems."}]}
              ]
-         ],
-         default=[]
+         ]
      )
      text: str = Field(
          description="The user prompt to use for the model",
@@ -74,22 +63,41 @@ class LLMInput(BaseAppInput):
              "What is the weather like today?",
              "Can you help me write a poem about spring?",
              "Explain quantum computing in simple terms"
-         ],
-     )
-     image: Optional[File] = Field(
-         description="The image to use for the model",
-         default=None
+         ]
      )
-     # Optional parameters
      temperature: float = Field(default=0.7)
      top_p: float = Field(default=0.95)
      max_tokens: int = Field(default=4096)
      context_size: int = Field(default=4096)
-
-     # Model specific flags
-     reasoning: Optional[bool] = Field(default=None)
-
-     tools: Optional[List[Dict[str, Any]]] = Field(default=None)
+
+ class ImageCapabilityMixin(BaseModel):
+     """Mixin for models that support image inputs."""
+     image: Optional[File] = Field(
+         description="The image to use for the model",
+         default=None
+     )
+
+ class ReasoningCapabilityMixin(BaseModel):
+     """Mixin for models that support reasoning."""
+     reasoning: bool = Field(
+         description="Enable step-by-step reasoning",
+         default=False
+     )
+
+ class ToolsCapabilityMixin(BaseModel):
+     """Mixin for models that support tool/function calling."""
+     tools: Optional[List[Dict[str, Any]]] = Field(
+         description="Tool definitions for function calling",
+         default=None
+     )
+
+ # Example of how to use:
+ class LLMInput(BaseLLMInput):
+     """Default LLM input model with no special capabilities."""
+     pass
+
+ # For backward compatibility
+ LLMInput.model_config["title"] = "LLMInput"

  class LLMUsage(BaseAppOutput):
      stop_reason: str = ""
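The capability mixins above make input schemas compositional: an app keeps the shared fields from BaseLLMInput and mixes in only the capabilities its model actually supports. A minimal sketch of such a composition, assuming the classes are importable from inferencesh.models.llm as defined above (the class name VisionToolInput and the payload values are illustrative):

    # Hypothetical input model for an app whose model accepts an image and can call tools.
    # The mixins are plain pydantic bases, so multiple inheritance merges their fields.
    from inferencesh.models.llm import BaseLLMInput, ImageCapabilityMixin, ToolsCapabilityMixin

    class VisionToolInput(BaseLLMInput, ImageCapabilityMixin, ToolsCapabilityMixin):
        """Input schema with image and tool-calling support on top of the base fields."""
        pass

    payload = VisionToolInput(
        text="What is in this picture?",
        tools=[{"type": "function", "function": {"name": "lookup", "parameters": {}}}],
    )
    print(payload.image)  # None until the caller attaches a File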
@@ -102,12 +110,24 @@ class LLMUsage(BaseAppOutput):
      reasoning_time: float = 0.0


- class LLMOutput(BaseAppOutput):
-     response: str
-     reasoning: Optional[str] = None
-     tool_calls: Optional[List[Dict[str, Any]]] = None
-     usage: Optional[LLMUsage] = None
+ class BaseLLMOutput(BaseAppOutput):
+     """Base class for LLM outputs with common fields."""
+     response: str = Field(description="The generated text response")
+
+ class LLMUsageMixin(BaseModel):
+     """Mixin for models that provide token usage statistics."""
+     usage: Optional[LLMUsage] = Field(
+         description="Token usage statistics",
+         default=None
+     )
+
+ # Example of how to use:
+ class LLMOutput(BaseLLMOutput, LLMUsageMixin):
+     """Default LLM output model with token usage tracking."""
+     pass

+ # For backward compatibility
+ LLMOutput.model_config["title"] = "LLMOutput"

  @contextmanager
  def timing_context():
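The output side follows the same pattern: BaseLLMOutput carries only the response text, and capabilities such as usage reporting are mixed in. The hunks further down only copy tool_calls onto an output that defines that field, and forward reasoning and usage when they were accumulated, so an app that streams those should declare them on its output class. A small sketch under that assumption (the class name ReasoningToolOutput is hypothetical):

    # Hypothetical output model that accepts the optional fields
    # ResponseTransformer.build_output may forward (reasoning, tool_calls, usage).
    from typing import Any, Dict, List, Optional

    class ReasoningToolOutput(BaseLLMOutput, LLMUsageMixin):
        reasoning: Optional[str] = None
        tool_calls: Optional[List[Dict[str, Any]]] = None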
@@ -322,26 +342,28 @@ class StreamResponse:

          return has_content or has_tool_calls or has_usage or has_finish

-     def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
+     def to_output(self, buffer: str, transformer: Any) -> tuple[BaseLLMOutput, str]:
          """Convert current state to LLMOutput."""
-         buffer, output, _ = transformer(self.content, buffer)
+         # Create usage object if we have stats
+         usage = None
+         if any(self.usage_stats.values()):
+             usage = LLMUsage(
+                 stop_reason=self.usage_stats["stop_reason"],
+                 time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+                 tokens_per_second=self.timing_stats["tokens_per_second"],
+                 prompt_tokens=self.usage_stats["prompt_tokens"],
+                 completion_tokens=self.usage_stats["completion_tokens"],
+                 total_tokens=self.usage_stats["total_tokens"],
+                 reasoning_time=self.timing_stats["reasoning_time"],
+                 reasoning_tokens=self.timing_stats["reasoning_tokens"]
+             )
+
+         buffer, output, _ = transformer(self.content, buffer, usage)

-         # Add tool calls if present
-         if self.tool_calls:
+         # Add tool calls if present and supported
+         if self.tool_calls and hasattr(output, 'tool_calls'):
              output.tool_calls = self.tool_calls

-         # Add usage stats
-         output.usage = LLMUsage(
-             stop_reason=self.usage_stats["stop_reason"],
-             time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
-             tokens_per_second=self.timing_stats["tokens_per_second"],
-             prompt_tokens=self.usage_stats["prompt_tokens"],
-             completion_tokens=self.usage_stats["completion_tokens"],
-             total_tokens=self.usage_stats["total_tokens"],
-             reasoning_time=self.timing_stats["reasoning_time"],
-             reasoning_tokens=self.timing_stats["reasoning_tokens"]
-         )
-
          return output, buffer

  class ResponseState:
@@ -353,6 +375,7 @@ class ResponseState:
          self.function_calls = None # For future function calling support
          self.tool_calls = None # List to accumulate tool calls
          self.current_tool_call = None # Track current tool call being built
+         self.usage = None # Add usage field
          self.state_changes = {
              "reasoning_started": False,
              "reasoning_ended": False,
@@ -364,7 +387,7 @@ class ResponseState:

  class ResponseTransformer:
      """Base class for transforming model responses."""
-     def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
+     def __init__(self, output_cls: type[BaseLLMOutput] = LLMOutput):
          self.state = ResponseState()
          self.output_cls = output_cls
          self.timing = None # Will be set by stream_generate
@@ -475,28 +498,43 @@ class ResponseTransformer:
          Returns:
              Tuple of (buffer, LLMOutput, state_changes)
          """
+         # Build base output with required fields
+         output_data = {
+             "response": self.state.response.strip(),
+         }
+
+         # Add optional fields if they exist
+         if self.state.usage is not None:
+             output_data["usage"] = self.state.usage
+         if self.state.reasoning:
+             output_data["reasoning"] = self.state.reasoning.strip()
+         if self.state.function_calls:
+             output_data["function_calls"] = self.state.function_calls
+         if self.state.tool_calls:
+             output_data["tool_calls"] = self.state.tool_calls
+
+         output = self.output_cls(**output_data)
+
          return (
              self.state.buffer,
-             self.output_cls(
-                 response=self.state.response.strip(),
-                 reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
-                 function_calls=self.state.function_calls,
-                 tool_calls=self.state.tool_calls
-             ),
+             output,
              self.state.state_changes
          )

-     def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+     def __call__(self, piece: str, buffer: str, usage: Optional[LLMUsage] = None) -> tuple[str, LLMOutput, dict]:
          """Transform a piece of text and return the result.

          Args:
              piece: New piece of text to transform
              buffer: Existing buffer content
+             usage: Optional usage statistics

          Returns:
              Tuple of (new_buffer, output, state_changes)
          """
          self.state.buffer = buffer
+         if usage is not None:
+             self.state.usage = usage
          self.transform_chunk(piece)
          return self.build_output()

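Because __call__ now accepts an optional usage argument, code that drives a transformer directly passes the statistics straight through. A minimal sketch of the new three-argument convention, assuming transformer, piece, buffer and usage come from the surrounding streaming code (in the SDK this call is normally made by StreamResponse.to_output):

    # buffer/output/state_changes follow the tuple documented above;
    # usage is an LLMUsage (or None) built by StreamResponse.to_output.
    buffer, output, state_changes = transformer(piece, buffer, usage)
    if state_changes.get("reasoning_started"):
        ...  # e.g. surface a "thinking" indicator while reasoning streams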
@@ -512,36 +550,108 @@ def stream_generate(
      max_tokens: int = 4096,
      stop: Optional[List[str]] = None,
      verbose: bool = False,
- ) -> Generator[LLMOutput, None, None]:
+     output_cls: type[BaseLLMOutput] = LLMOutput,
+ ) -> Generator[BaseLLMOutput, None, None]:
      """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+
+     # Create queues for communication between threads
+     response_queue = Queue()
+     error_queue = Queue()
+     keep_alive_queue = Queue()
+
+     # Set the output class for the transformer
+     transformer.output_cls = output_cls
+
+     def _generate_worker():
+         """Worker thread to run the model generation."""
+         try:
+             # Build completion kwargs
+             completion_kwargs = {
+                 "messages": messages,
+                 "stream": True,
+                 "temperature": temperature,
+                 "top_p": top_p,
+                 "max_tokens": max_tokens,
+                 "stop": stop
+             }
+             if tools is not None:
+                 completion_kwargs["tools"] = tools
+             if tool_choice is not None:
+                 completion_kwargs["tool_choice"] = tool_choice
+
+             # Signal that we're starting
+             keep_alive_queue.put(("init", time.time()))
+
+             completion = model.create_chat_completion(**completion_kwargs)
+
+             for chunk in completion:
+                 if verbose:
+                     print(chunk)
+                 response_queue.put(("chunk", chunk))
+                 # Update keep-alive timestamp
+                 keep_alive_queue.put(("alive", time.time()))
+
+             # Signal completion
+             response_queue.put(("done", None))
+
+         except Exception as e:
+             error_queue.put(e)
+             response_queue.put(("error", str(e)))
+
      with timing_context() as timing:
          transformer.timing = timing

-         # Build completion kwargs
-         completion_kwargs = {
-             "messages": messages,
-             "stream": True,
-             "temperature": temperature,
-             "top_p": top_p,
-             "max_tokens": max_tokens,
-             "stop": stop
-         }
-         if tools is not None:
-             completion_kwargs["tools"] = tools
-         if tool_choice is not None:
-             completion_kwargs["tool_choice"] = tool_choice
+         # Start generation thread
+         generation_thread = Thread(target=_generate_worker, daemon=True)
+         generation_thread.start()

          # Initialize response state
          response = StreamResponse()
          buffer = ""

+         # Keep-alive tracking
+         last_activity = time.time()
+         init_timeout = 30.0 # 30 seconds for initial response
+         chunk_timeout = 10.0 # 10 seconds between chunks
+
          try:
-             completion = model.create_chat_completion(**completion_kwargs)
+             # Wait for initial setup
+             try:
+                 msg_type, timestamp = keep_alive_queue.get(timeout=init_timeout)
+                 if msg_type != "init":
+                     raise RuntimeError("Unexpected initialization message")
+                 last_activity = timestamp
+             except Empty:
+                 raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")

-             for chunk in completion:
-                 if verbose:
-                     print(chunk)
-                 # Mark first token time as soon as we get any response
+             while True:
+                 # Check for errors
+                 if not error_queue.empty():
+                     raise error_queue.get()
+
+                 # Check keep-alive
+                 while not keep_alive_queue.empty():
+                     _, timestamp = keep_alive_queue.get_nowait()
+                     last_activity = timestamp
+
+                 # Check for timeout
+                 if time.time() - last_activity > chunk_timeout:
+                     raise RuntimeError(f"No response from model for {chunk_timeout} seconds")
+
+                 # Get next chunk
+                 try:
+                     msg_type, data = response_queue.get(timeout=0.1)
+                 except Empty:
+                     continue
+
+                 if msg_type == "error":
+                     raise RuntimeError(f"Generation error: {data}")
+                 elif msg_type == "done":
+                     break
+
+                 chunk = data
+
+                 # Mark first token time
                  if not timing.first_token_time:
                      timing.mark_first_token()

@@ -556,6 +666,13 @@ def stream_generate(
                  # Break if we're done
                  if response.finish_reason:
                      break
+
+             # Wait for generation thread to finish
+             generation_thread.join(timeout=5.0) # Increased timeout to 5 seconds
+             if generation_thread.is_alive():
+                 # Thread didn't finish - this shouldn't happen normally
+                 # but we handle it gracefully
+                 raise RuntimeError("Generation thread failed to finish")

          except Exception as e:
              # Ensure any error is properly propagated
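Taken together, the new stream_generate runs the model's create_chat_completion call (llama-cpp-python style) in a daemon worker thread and consumes its chunks through queues, raising if the model fails to initialize within 30 seconds or stays silent for more than 10 seconds between chunks. A rough usage sketch; the keyword names are inferred from the function body above (the complete signature is not part of this diff) and the model path is illustrative:

    from llama_cpp import Llama
    from inferencesh.models.llm import LLMOutput, ResponseTransformer, stream_generate

    model = Llama(model_path="./model.gguf")  # llama-cpp-python model
    transformer = ResponseTransformer(output_cls=LLMOutput)

    for partial in stream_generate(
        model=model,
        transformer=transformer,
        messages=[{"role": "user", "content": "Explain quantum computing in simple terms"}],
        temperature=0.7,
        max_tokens=512,
        output_cls=LLMOutput,
    ):
        # Each yielded item is an LLMOutput produced by the transformer.
        print(partial.response)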
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: inferencesh
- Version: 0.2.26
+ Version: 0.2.28
  Summary: inference.sh Python SDK
  Author: Inference Shell Inc.
  Author-email: "Inference Shell Inc." <hello@inference.sh>
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
  inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
  inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
  inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
- inferencesh/models/llm.py,sha256=3NZW6-_SXNPqVHu5LUDACYQupa9DHMJxDvay3x-AEUY,22383
+ inferencesh/models/llm.py,sha256=E2Mz56Cu_GODDhnNKE5gE5pOTgX4ekJv6UdO44wWON8,25806
  inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
  inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
  inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
- inferencesh-0.2.26.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
- inferencesh-0.2.26.dist-info/METADATA,sha256=cbycGHYSsZVicaxlQWqiDV4KhZsTkXDT0OU3hArQG04,2757
- inferencesh-0.2.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- inferencesh-0.2.26.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
- inferencesh-0.2.26.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
- inferencesh-0.2.26.dist-info/RECORD,,
+ inferencesh-0.2.28.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+ inferencesh-0.2.28.dist-info/METADATA,sha256=9TxV1q5wsokL3de27EJKvRr9MFfOi86rxzoEEnKVTSU,2757
+ inferencesh-0.2.28.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ inferencesh-0.2.28.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+ inferencesh-0.2.28.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+ inferencesh-0.2.28.dist-info/RECORD,,