inferencesh 0.2.15.tar.gz → 0.2.17.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of inferencesh might be problematic.

Files changed (21)
  1. {inferencesh-0.2.15/src/inferencesh.egg-info → inferencesh-0.2.17}/PKG-INFO +1 -1
  2. {inferencesh-0.2.15 → inferencesh-0.2.17}/pyproject.toml +1 -1
  3. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh/models/llm.py +192 -63
  4. {inferencesh-0.2.15 → inferencesh-0.2.17/src/inferencesh.egg-info}/PKG-INFO +1 -1
  5. {inferencesh-0.2.15 → inferencesh-0.2.17}/LICENSE +0 -0
  6. {inferencesh-0.2.15 → inferencesh-0.2.17}/README.md +0 -0
  7. {inferencesh-0.2.15 → inferencesh-0.2.17}/setup.cfg +0 -0
  8. {inferencesh-0.2.15 → inferencesh-0.2.17}/setup.py +0 -0
  9. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh/__init__.py +0 -0
  10. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh/models/__init__.py +0 -0
  11. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh/models/base.py +0 -0
  12. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh/models/file.py +0 -0
  13. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh/utils/__init__.py +0 -0
  14. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh/utils/download.py +0 -0
  15. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh/utils/storage.py +0 -0
  16. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh.egg-info/SOURCES.txt +0 -0
  17. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh.egg-info/dependency_links.txt +0 -0
  18. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh.egg-info/entry_points.txt +0 -0
  19. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh.egg-info/requires.txt +0 -0
  20. {inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh.egg-info/top_level.txt +0 -0
  21. {inferencesh-0.2.15 → inferencesh-0.2.17}/tests/test_sdk.py +0 -0
{inferencesh-0.2.15/src/inferencesh.egg-info → inferencesh-0.2.17}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.15
+Version: 0.2.17
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
{inferencesh-0.2.15 → inferencesh-0.2.17}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "inferencesh"
-version = "0.2.15"
+version = "0.2.17"
 description = "inference.sh Python SDK"
 authors = [
     {name = "Inference Shell Inc.", email = "hello@inference.sh"},
{inferencesh-0.2.15 → inferencesh-0.2.17}/src/inferencesh/models/llm.py
@@ -88,7 +88,7 @@ class LLMInput(BaseAppInput):
     context_size: int = Field(default=4096)

     # Model specific flags
-    enable_thinking: bool = Field(default=False)
+    reasoning: bool = Field(default=False)

 class LLMUsage(BaseAppOutput):
     stop_reason: str = ""
@@ -97,11 +97,13 @@ class LLMUsage(BaseAppOutput):
     prompt_tokens: int = 0
     completion_tokens: int = 0
     total_tokens: int = 0
+    reasoning_tokens: int = 0
+    reasoning_time: float = 0.0


 class LLMOutput(BaseAppOutput):
     response: str
-    thinking_content: Optional[str] = None
+    reasoning: Optional[str] = None
     usage: Optional[LLMUsage] = None

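
For illustration only (not part of the diff): a minimal sketch of the renamed schema in use, assuming the remaining LLMUsage fields keep their declared defaults; the values are made up.

    # Sketch: LLMInput.reasoning replaces enable_thinking, LLMOutput.reasoning
    # replaces thinking_content, and LLMUsage gains reasoning_tokens/reasoning_time.
    usage = LLMUsage(
        prompt_tokens=42,
        completion_tokens=128,
        total_tokens=170,
        reasoning_tokens=30,     # new in 0.2.17
        reasoning_time=1.8,      # new in 0.2.17 (seconds)
    )
    output = LLMOutput(
        response="The answer is 4.",
        reasoning="2 + 2 = 4",   # was thinking_content in 0.2.15
        usage=usage,
    )
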
@@ -112,11 +114,27 @@ def timing_context():
        def __init__(self):
            self.start_time = time.time()
            self.first_token_time = None
+           self.reasoning_start_time = None
+           self.total_reasoning_time = 0.0
+           self.reasoning_tokens = 0
+           self.in_reasoning = False

        def mark_first_token(self):
            if self.first_token_time is None:
                self.first_token_time = time.time()

+       def start_reasoning(self):
+           if not self.in_reasoning:
+               self.reasoning_start_time = time.time()
+               self.in_reasoning = True
+
+       def end_reasoning(self, token_count: int = 0):
+           if self.in_reasoning and self.reasoning_start_time:
+               self.total_reasoning_time += time.time() - self.reasoning_start_time
+               self.reasoning_tokens += token_count
+               self.reasoning_start_time = None
+               self.in_reasoning = False
+
        @property
        def stats(self):
            end_time = time.time()
@@ -128,7 +146,9 @@ def timing_context():

            return {
                "time_to_first_token": time_to_first,
-               "generation_time": generation_time
+               "generation_time": generation_time,
+               "reasoning_time": self.total_reasoning_time,
+               "reasoning_tokens": self.reasoning_tokens
            }

    timing = TimingInfo()
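
For illustration only, a sketch of how the new reasoning timers are meant to be driven (stream_generate does this through the transformer, as the later hunks show); the sleep and token count below are placeholders.

    import time

    with timing_context() as timing:
        timing.mark_first_token()
        timing.start_reasoning()                # e.g. a <think> tag was seen
        time.sleep(0.2)                         # placeholder for reasoning tokens streaming in
        timing.end_reasoning(token_count=12)    # e.g. </think> was seen
        stats = timing.stats
        # stats now also carries the new keys added above
        print(stats["reasoning_time"], stats["reasoning_tokens"])
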
@@ -186,29 +206,178 @@ build_messages(
     return messages


+class ResponseState:
+    """Holds the state of response transformation."""
+    def __init__(self):
+        self.buffer = ""
+        self.response = ""
+        self.reasoning = None
+        self.function_calls = None  # For future function calling support
+        self.tool_calls = None  # For future tool calling support
+        self.state_changes = {
+            "reasoning_started": False,
+            "reasoning_ended": False,
+            "function_call_started": False,
+            "function_call_ended": False,
+            "tool_call_started": False,
+            "tool_call_ended": False
+        }
+
+class ResponseTransformer:
+    """Base class for transforming model responses."""
+    def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
+        self.state = ResponseState()
+        self.output_cls = output_cls
+        self.timing = None  # Will be set by stream_generate
+
+    def clean_text(self, text: str) -> str:
+        """Clean common tokens from the text and apply model-specific cleaning.
+
+        Args:
+            text: Raw text to clean
+
+        Returns:
+            Cleaned text with common and model-specific tokens removed
+        """
+        # Common token cleaning across most models
+        cleaned = (text.replace("<|im_end|>", "")
+                       .replace("<|im_start|>", "")
+                       .replace("<start_of_turn>", "")
+                       .replace("<end_of_turn>", "")
+                       .replace("<eos>", ""))
+        return self.additional_cleaning(cleaned)
+
+    def additional_cleaning(self, text: str) -> str:
+        """Apply model-specific token cleaning.
+
+        Args:
+            text: Text that has had common tokens removed
+
+        Returns:
+            Text with model-specific tokens removed
+        """
+        return text
+
+    def handle_reasoning(self, text: str) -> None:
+        """Handle reasoning/thinking detection and extraction.
+
+        Args:
+            text: Cleaned text to process for reasoning
+        """
+        # Default implementation for <think> style reasoning
+        if "<think>" in text and not self.state.state_changes["reasoning_started"]:
+            self.state.state_changes["reasoning_started"] = True
+            if self.timing:
+                self.timing.start_reasoning()
+
+        if "</think>" in text and not self.state.state_changes["reasoning_ended"]:
+            self.state.state_changes["reasoning_ended"] = True
+            if self.timing:
+                # Estimate token count from character count (rough approximation)
+                token_count = len(self.state.buffer.split("<think>")[1].split("</think>")[0]) // 4
+                self.timing.end_reasoning(token_count)
+
+        if "<think>" in self.state.buffer:
+            parts = self.state.buffer.split("</think>", 1)
+            if len(parts) > 1:
+                self.state.reasoning = parts[0].split("<think>", 1)[1].strip()
+                self.state.response = parts[1].strip()
+            else:
+                self.state.reasoning = self.state.buffer.split("<think>", 1)[1].strip()
+                self.state.response = ""
+        else:
+            self.state.response = self.state.buffer
+
+    def handle_function_calls(self, text: str) -> None:
+        """Handle function call detection and extraction.
+
+        Args:
+            text: Cleaned text to process for function calls
+        """
+        # Default no-op implementation
+        # Models can override this to implement function call handling
+        pass
+
+    def handle_tool_calls(self, text: str) -> None:
+        """Handle tool call detection and extraction.
+
+        Args:
+            text: Cleaned text to process for tool calls
+        """
+        # Default no-op implementation
+        # Models can override this to implement tool call handling
+        pass
+
+    def transform_chunk(self, chunk: str) -> None:
+        """Transform a single chunk of model output.
+
+        This method orchestrates the transformation process by:
+        1. Cleaning the text
+        2. Updating the buffer
+        3. Processing various capabilities (reasoning, function calls, etc)
+
+        Args:
+            chunk: Raw text chunk from the model
+        """
+        cleaned = self.clean_text(chunk)
+        self.state.buffer += cleaned
+
+        # Process different capabilities
+        self.handle_reasoning(cleaned)
+        self.handle_function_calls(cleaned)
+        self.handle_tool_calls(cleaned)
+
+    def build_output(self) -> tuple[str, LLMOutput, dict]:
+        """Build the final output tuple.
+
+        Returns:
+            Tuple of (buffer, LLMOutput, state_changes)
+        """
+        return (
+            self.state.buffer,
+            self.output_cls(
+                response=self.state.response.strip(),
+                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
+                function_calls=self.state.function_calls,
+                tool_calls=self.state.tool_calls
+            ),
+            self.state.state_changes
+        )
+
+    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+        """Transform a piece of text and return the result.
+
+        Args:
+            piece: New piece of text to transform
+            buffer: Existing buffer content
+
+        Returns:
+            Tuple of (new_buffer, output, state_changes)
+        """
+        self.state.buffer = buffer
+        self.transform_chunk(piece)
+        return self.build_output()
+
+
 def stream_generate(
     model: Any,
     messages: List[Dict[str, Any]],
-    output_cls: type[LLMOutput],
+    transformer: ResponseTransformer,
     temperature: float = 0.7,
     top_p: float = 0.95,
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
-    handle_thinking: bool = False,
-    transform_response: Optional[Callable[[str, str], tuple[str, LLMOutput]]] = None,
 ) -> Generator[LLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking.

     Args:
         model: The LLaMA.cpp model instance
         messages: List of messages to send to the model
-        output_cls: Output class type to use for responses
+        transformer: ResponseTransformer instance to use for processing output
         temperature: Sampling temperature
         top_p: Top-p sampling threshold
         max_tokens: Maximum tokens to generate
         stop: Optional list of stop sequences
-        handle_thinking: Whether to handle thinking tags
-        transform_response: Optional function to transform responses, takes (piece, buffer) and returns (new_buffer, output)
     """
     response_queue: Queue[Optional[tuple[str, dict]]] = Queue()
     thread_exception = None
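
For illustration, a sketch of how an app might plug into the transformer-based API introduced in this hunk. ResponseTransformer, additional_cleaning, and the new stream_generate signature come from the diff above; the llama_cpp import, model path, messages, and the extra token being stripped are assumptions.

    from llama_cpp import Llama  # assumed llama.cpp binding; not part of this diff

    class MyTransformer(ResponseTransformer):
        # Hypothetical subclass stripping one extra model-specific token.
        def additional_cleaning(self, text: str) -> str:
            return text.replace("<|endoftext|>", "")

    model = Llama(model_path="model.gguf")             # placeholder path
    messages = [{"role": "user", "content": "Hello"}]  # placeholder messages

    final = None
    for output in stream_generate(
        model,
        messages,
        transformer=MyTransformer(output_cls=LLMOutput),
        temperature=0.7,
        max_tokens=512,
    ):
        final = output  # each yield carries the cumulative parsed response so far

    if final is not None:
        print(final.response)
        if final.reasoning:
            print("reasoning:", final.reasoning)
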
@@ -220,6 +389,9 @@ def stream_generate(
    }

    with timing_context() as timing:
+       # Set timing context in transformer
+       transformer.timing = timing
+
        def generation_thread():
            nonlocal thread_exception, usage_stats
            try:
@@ -233,11 +405,9 @@ def stream_generate(
                )

                for chunk in completion:
-                   # Get usage from root level if present
                    if "usage" in chunk and chunk["usage"] is not None:
                        usage_stats.update(chunk["usage"])

-                   # Get content from choices
                    delta = chunk.get("choices", [{}])[0]
                    content = None
                    finish_reason = None
@@ -265,15 +435,15 @@ def stream_generate(
                tokens_per_second = (usage_stats["completion_tokens"] / generation_time) if generation_time > 0 else 0
                response_queue.put((None, {
                    "time_to_first_token": timing_stats["time_to_first_token"],
-                   "tokens_per_second": tokens_per_second
+                   "tokens_per_second": tokens_per_second,
+                   "reasoning_time": timing_stats["reasoning_time"],
+                   "reasoning_tokens": timing_stats["reasoning_tokens"]
                }))

        thread = Thread(target=generation_thread, daemon=True)
        thread.start()

        buffer = ""
-       thinking_content = "" if handle_thinking else None
-       in_thinking = handle_thinking
        try:
            while True:
                try:
@@ -290,59 +460,18 @@ def stream_generate(
                        tokens_per_second=timing_stats["tokens_per_second"],
                        prompt_tokens=usage_stats["prompt_tokens"],
                        completion_tokens=usage_stats["completion_tokens"],
-                       total_tokens=usage_stats["total_tokens"]
+                       total_tokens=usage_stats["total_tokens"],
+                       reasoning_time=timing_stats["reasoning_time"],
+                       reasoning_tokens=timing_stats["reasoning_tokens"]
                    )

-                   if transform_response:
-                       buffer, output = transform_response(piece or "", buffer)
-                       output.usage = usage
-                       yield output
-                   else:
-                       # Handle thinking vs response content if enabled
-                       if handle_thinking and "</think>" in piece:
-                           parts = piece.split("</think>")
-                           if in_thinking:
-                               thinking_content += parts[0].replace("<think>", "")
-                               buffer = parts[1] if len(parts) > 1 else ""
-                               in_thinking = False
-                           else:
-                               buffer += piece
-                       else:
-                           if in_thinking:
-                               thinking_content += piece.replace("<think>", "")
-                           else:
-                               buffer += piece
-
-                       yield output_cls(
-                           response=buffer.strip(),
-                           thinking_content=thinking_content.strip() if thinking_content else None,
-                           usage=usage
-                       )
-                   break
-
-                   if transform_response:
-                       buffer, output = transform_response(piece, buffer)
+                   buffer, output, _ = transformer(piece or "", buffer)
+                   output.usage = usage
                    yield output
-                   else:
-                       # Handle thinking vs response content if enabled
-                       if handle_thinking and "</think>" in piece:
-                           parts = piece.split("</think>")
-                           if in_thinking:
-                               thinking_content += parts[0].replace("<think>", "")
-                               buffer = parts[1] if len(parts) > 1 else ""
-                               in_thinking = False
-                           else:
-                               buffer += piece
-                       else:
-                           if in_thinking:
-                               thinking_content += piece.replace("<think>", "")
-                           else:
-                               buffer += piece
+                   break

-                   yield output_cls(
-                       response=buffer.strip(),
-                       thinking_content=thinking_content.strip() if thinking_content else None
-                   )
+                   buffer, output, _ = transformer(piece, buffer)
+                   yield output

            except Exception as e:
                if thread_exception and isinstance(e, thread_exception.__class__):
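
For illustration, a sketch of reading the usage that arrives with the final streamed output under the rewritten loop above (usage is only attached when the stream finishes). It continues the hypothetical model/messages setup from the earlier sketch; only the field names are taken from this diff.

    last = None
    for out in stream_generate(model, messages, transformer=ResponseTransformer()):
        last = out  # only the final yielded output has .usage populated

    if last is not None and last.usage is not None:
        u = last.usage
        print("tokens:", u.prompt_tokens, "+", u.completion_tokens, "=", u.total_tokens)
        print("speed:", round(u.tokens_per_second, 1), "tok/s")
        print("reasoning:", u.reasoning_tokens, "tokens in", round(u.reasoning_time, 2), "s")  # new in 0.2.17
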
{inferencesh-0.2.15 → inferencesh-0.2.17/src/inferencesh.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.15
+Version: 0.2.17
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>