inferencesh 0.2.21.tar.gz → 0.2.23.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of inferencesh might be problematic.
Files changed (21)
  1. {inferencesh-0.2.21/src/inferencesh.egg-info → inferencesh-0.2.23}/PKG-INFO +1 -1
  2. {inferencesh-0.2.21 → inferencesh-0.2.23}/pyproject.toml +1 -1
  3. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/models/llm.py +157 -145
  4. {inferencesh-0.2.21 → inferencesh-0.2.23/src/inferencesh.egg-info}/PKG-INFO +1 -1
  5. {inferencesh-0.2.21 → inferencesh-0.2.23}/LICENSE +0 -0
  6. {inferencesh-0.2.21 → inferencesh-0.2.23}/README.md +0 -0
  7. {inferencesh-0.2.21 → inferencesh-0.2.23}/setup.cfg +0 -0
  8. {inferencesh-0.2.21 → inferencesh-0.2.23}/setup.py +0 -0
  9. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/__init__.py +0 -0
  10. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/models/__init__.py +0 -0
  11. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/models/base.py +0 -0
  12. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/models/file.py +0 -0
  13. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/utils/__init__.py +0 -0
  14. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/utils/download.py +0 -0
  15. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh/utils/storage.py +0 -0
  16. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/SOURCES.txt +0 -0
  17. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/dependency_links.txt +0 -0
  18. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/entry_points.txt +0 -0
  19. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/requires.txt +0 -0
  20. {inferencesh-0.2.21 → inferencesh-0.2.23}/src/inferencesh.egg-info/top_level.txt +0 -0
  21. {inferencesh-0.2.21 → inferencesh-0.2.23}/tests/test_sdk.py +0 -0
src/inferencesh.egg-info/PKG-INFO → PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.21
+Version: 0.2.23
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "inferencesh"
-version = "0.2.21"
+version = "0.2.23"
 description = "inference.sh Python SDK"
 authors = [
     {name = "Inference Shell Inc.", email = "hello@inference.sh"},
src/inferencesh/models/llm.py
@@ -116,7 +116,7 @@ def timing_context():
 class TimingInfo:
     def __init__(self):
         self.start_time = time.time()
-        self.first_token_time = None
+        self.first_token_time = 0
         self.reasoning_start_time = None
         self.total_reasoning_time = 0.0
         self.reasoning_tokens = 0
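Note on the sentinel change above: both the old value (None) and the new one (0) are falsy, so the guard `if not timing.first_token_time:` used later in this diff behaves identically until mark_first_token() records a real timestamp. A minimal check:

    # Both sentinels are falsy, so the first-token guard fires for either
    # until mark_first_token() stores a nonzero time.time() timestamp.
    for sentinel in (None, 0):
        assert not sentinel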
@@ -209,6 +209,115 @@ def build_messages(
     return messages
 
 
+class StreamResponse:
+    """Holds a single chunk of streamed response."""
+    def __init__(self):
+        self.content = ""
+        self.tool_calls = None  # Changed from [] to None
+        self.finish_reason = None
+        self.timing_stats = {
+            "time_to_first_token": 0.0,
+            "generation_time": 0.0,
+            "reasoning_time": 0.0,
+            "reasoning_tokens": 0,
+            "tokens_per_second": 0.0
+        }
+        self.usage_stats = {
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "total_tokens": 0,
+            "stop_reason": ""
+        }
+
+    def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
+        """Update response state from a chunk."""
+        # Update usage stats if present
+        if "usage" in chunk and chunk["usage"] is not None:
+            self.usage_stats.update(chunk["usage"])
+
+        # Get the delta from the chunk
+        delta = chunk.get("choices", [{}])[0]
+
+        # Extract content and tool calls from either message or delta
+        if "message" in delta:
+            message = delta["message"]
+            self.content = message.get("content", "")
+            if message.get("tool_calls"):
+                self._update_tool_calls(message["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+        elif "delta" in delta:
+            delta_content = delta["delta"]
+            self.content = delta_content.get("content", "")
+            if delta_content.get("tool_calls"):
+                self._update_tool_calls(delta_content["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+
+        # Update timing stats while preserving tokens_per_second
+        timing_stats = timing.stats
+        generation_time = timing_stats["generation_time"]
+        completion_tokens = self.usage_stats.get("completion_tokens", 0)
+        tokens_per_second = (completion_tokens / generation_time) if generation_time > 0 and completion_tokens > 0 else 0.0
+
+        self.timing_stats.update({
+            **timing_stats,
+            "tokens_per_second": tokens_per_second
+        })
+
+    def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
+        """Update tool calls, handling both full and partial updates."""
+        if self.tool_calls is None:
+            self.tool_calls = []
+
+        for tool_delta in new_tool_calls:
+            tool_id = tool_delta.get("id")
+            if not tool_id:
+                continue
+
+            # Find or create tool call
+            current_tool = next((t for t in self.tool_calls if t["id"] == tool_id), None)
+            if not current_tool:
+                current_tool = {
+                    "id": tool_id,
+                    "type": tool_delta.get("type", "function"),
+                    "function": {"name": "", "arguments": ""}
+                }
+                self.tool_calls.append(current_tool)
+
+            # Update tool call
+            if "function" in tool_delta:
+                func_delta = tool_delta["function"]
+                if "name" in func_delta:
+                    current_tool["function"]["name"] = func_delta["name"]
+                if "arguments" in func_delta:
+                    current_tool["function"]["arguments"] += func_delta["arguments"]
+
+    def has_updates(self) -> bool:
+        """Check if this response has any content or tool call updates."""
+        return bool(self.content) or bool(self.tool_calls)
+
+    def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
+        """Convert current state to LLMOutput."""
+        buffer, output, _ = transformer(self.content, buffer)
+
+        # Add tool calls if present
+        if self.tool_calls:
+            output.tool_calls = self.tool_calls
+
+        # Add usage stats if this is final
+        if self.finish_reason:
+            output.usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"],
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
+
+        return output, buffer
+
 class ResponseState:
     """Holds the state of response transformation."""
     def __init__(self):
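To make the new accumulator concrete, here is a small sketch of how StreamResponse._update_tool_calls merges streamed tool-call deltas by id, appending arguments fragments rather than replacing them. The delta shapes and the get_weather name are illustrative assumptions inferred from the dict accesses in the hunk above, not from package documentation:

    from inferencesh.models.llm import StreamResponse  # added in 0.2.23, per the hunk above

    resp = StreamResponse()
    # First delta names the function and opens the JSON arguments string.
    resp._update_tool_calls([
        {"id": "call_0", "type": "function",
         "function": {"name": "get_weather", "arguments": '{"city": '}},
    ])
    # Second delta with the same id: the arguments fragment is appended, not replaced.
    resp._update_tool_calls([
        {"id": "call_0", "function": {"arguments": '"Paris"}'}},
    ])
    assert resp.tool_calls[0]["function"]["arguments"] == '{"city": "Paris"}'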
@@ -216,7 +325,7 @@ class ResponseState:
         self.response = ""
         self.reasoning = None
         self.function_calls = None  # For future function calling support
-        self.tool_calls = []  # List to accumulate tool calls
+        self.tool_calls = None  # List to accumulate tool calls
         self.current_tool_call = None  # Track current tool call being built
         self.state_changes = {
             "reasoning_started": False,
@@ -243,6 +352,9 @@ class ResponseTransformer:
         Returns:
             Cleaned text with common and model-specific tokens removed
         """
+        if text is None:
+            return ""
+
         # Common token cleaning across most models
         cleaned = (text.replace("<|im_end|>", "")
                    .replace("<|im_start|>", "")
@@ -366,159 +478,59 @@ class ResponseTransformer:
 def stream_generate(
     model: Any,
     messages: List[Dict[str, Any]],
-    tools: List[Dict[str, Any]],
-    tool_choice: Dict[str, Any],
-    transformer: ResponseTransformer,
+    transformer: ResponseTransformer = ResponseTransformer(),
+    tools: Optional[List[Dict[str, Any]]] = None,
+    tool_choice: Optional[Dict[str, Any]] = None,
     temperature: float = 0.7,
     top_p: float = 0.95,
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
+    verbose: bool = False,
 ) -> Generator[LLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
-    response_queue: Queue[Optional[tuple[str, dict, Optional[List[Dict[str, Any]]]]]] = Queue()
-    thread_exception = None
-    usage_stats = {
-        "prompt_tokens": 0,
-        "completion_tokens": 0,
-        "total_tokens": 0,
-        "stop_reason": ""
-    }
-
     with timing_context() as timing:
         transformer.timing = timing
 
-        def generation_thread():
-            nonlocal thread_exception, usage_stats
-            try:
-                completion = model.create_chat_completion(
-                    messages=messages,
-                    tools=tools,
-                    tool_choice=tool_choice,
-                    stream=True,
-                    temperature=temperature,
-                    top_p=top_p,
-                    max_tokens=max_tokens,
-                    stop=stop
-                )
-
-                tool_calls = []
-                current_tool = None
-
-                for chunk in completion:
-                    if "usage" in chunk and chunk["usage"] is not None:
-                        usage_stats.update(chunk["usage"])
-
-                    delta = chunk.get("choices", [{}])[0]
-                    content = ""
-                    finish_reason = None
-
-                    # Extract delta content from either message or delta
-                    if "message" in delta:
-                        message = delta["message"]
-                        content = message.get("content", "")
-                        if message.get("tool_calls"):
-                            for tool in message["tool_calls"]:
-                                if tool.get("id") not in {t.get("id") for t in tool_calls}:
-                                    tool_calls.append(tool)
-                        finish_reason = delta.get("finish_reason")
-                    elif "delta" in delta:
-                        delta_content = delta["delta"]
-                        content = delta_content.get("content", "")
-
-                        # Handle streaming tool calls
-                        if delta_content.get("tool_calls"):
-                            for tool_delta in delta_content["tool_calls"]:
-                                tool_id = tool_delta.get("id")
-
-                                # Find or create tool call
-                                if tool_id:
-                                    current_tool = next((t for t in tool_calls if t["id"] == tool_id), None)
-                                    if not current_tool:
-                                        current_tool = {
-                                            "id": tool_id,
-                                            "type": tool_delta.get("type", "function"),
-                                            "function": {"name": "", "arguments": ""}
-                                        }
-                                        tool_calls.append(current_tool)
-
-                                # Update tool call
-                                if current_tool and "function" in tool_delta:
-                                    func_delta = tool_delta["function"]
-                                    if "name" in func_delta:
-                                        current_tool["function"]["name"] = func_delta["name"]
-                                    if "arguments" in func_delta:
-                                        current_tool["function"]["arguments"] += func_delta["arguments"]
-
-                        finish_reason = delta.get("finish_reason")
-
-                    has_update = bool(content)
-                    has_tool_update = bool(
-                        (delta.get("message", {}) or {}).get("tool_calls") or
-                        (delta.get("delta", {}) or {}).get("tool_calls")
-                    )
-
-                    if has_update or has_tool_update:
-                        if not timing.first_token_time:
-                            timing.mark_first_token()
-                        response_queue.put((content, {}, tool_calls[:] if tool_calls else None))
-
-                    if finish_reason:
-                        usage_stats["stop_reason"] = finish_reason
-
-            except Exception as e:
-                thread_exception = e
-            finally:
-                timing_stats = timing.stats
-                generation_time = timing_stats["generation_time"]
-                tokens_per_second = (usage_stats["completion_tokens"] / generation_time) if generation_time > 0 else 0
-                response_queue.put((None, {
-                    "time_to_first_token": timing_stats["time_to_first_token"],
-                    "tokens_per_second": tokens_per_second,
-                    "reasoning_time": timing_stats["reasoning_time"],
-                    "reasoning_tokens": timing_stats["reasoning_tokens"]
-                }, tool_calls if tool_calls else None))
-
-        thread = Thread(target=generation_thread, daemon=True)
-        thread.start()
-
+        # Build completion kwargs
+        completion_kwargs = {
+            "messages": messages,
+            "stream": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "max_tokens": max_tokens,
+            "stop": stop
+        }
+        if tools is not None:
+            completion_kwargs["tools"] = tools
+        if tool_choice is not None:
+            completion_kwargs["tool_choice"] = tool_choice
+
+        # Initialize response state
+        response = StreamResponse()
         buffer = ""
+
         try:
-            while True:
-                try:
-                    result = response_queue.get(timeout=30.0)
-                    if thread_exception:
-                        raise thread_exception
-
-                    piece, timing_stats, tool_calls = result
-                    if piece is None:
-                        # Final yield with complete usage stats
-                        usage = LLMUsage(
-                            stop_reason=usage_stats["stop_reason"],
-                            time_to_first_token=timing_stats["time_to_first_token"],
-                            tokens_per_second=timing_stats["tokens_per_second"],
-                            prompt_tokens=usage_stats["prompt_tokens"],
-                            completion_tokens=usage_stats["completion_tokens"],
-                            total_tokens=usage_stats["total_tokens"],
-                            reasoning_time=timing_stats["reasoning_time"],
-                            reasoning_tokens=timing_stats["reasoning_tokens"]
-                        )
-
-                        buffer, output, _ = transformer(piece or "", buffer)
-                        output.usage = usage
-                        if tool_calls:
-                            output.tool_calls = tool_calls
-                        yield output
-                        break
-
-                    buffer, output, _ = transformer(piece, buffer)
-                    if tool_calls:
-                        output.tool_calls = tool_calls
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                # Mark first token time as soon as we get any response
+                if not timing.first_token_time:
+                    timing.mark_first_token()
+
+                # Update response state from chunk
+                response.update_from_chunk(chunk, timing)
+
+                # Yield output if we have updates
+                if response.has_updates():
+                    output, buffer = response.to_output(buffer, transformer)
                     yield output
-
-                except Exception as e:
-                    if thread_exception and isinstance(e, thread_exception.__class__):
-                        raise thread_exception
+
+                # Break if we're done
+                if response.finish_reason:
                     break
-        finally:
-            if thread and thread.is_alive():
-                thread.join(timeout=2.0)
+
+        except Exception as e:
+            # Ensure any error is properly propagated
+            raise e
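For orientation, a hedged usage sketch of the reworked, single-threaded stream_generate. The llama_cpp backend is an assumption based on the docstring's LLaMA.cpp reference, the model path is a placeholder, and only the usage and tool_calls attributes on yielded outputs are confirmed by this diff:

    from llama_cpp import Llama  # assumed backend; any object with create_chat_completion works
    from inferencesh.models.llm import stream_generate

    model = Llama(model_path="model.gguf")  # placeholder path

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello."},
    ]

    # tools/tool_choice are now optional and only forwarded when provided;
    # verbose=True prints each raw chunk for debugging.
    for output in stream_generate(model, messages, max_tokens=64, verbose=True):
        usage = getattr(output, "usage", None)
        if usage:  # attached to the final chunk, once finish_reason is set
            print("stop:", usage.stop_reason, "tokens/sec:", usage.tokens_per_second)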
PKG-INFO → src/inferencesh.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.21
+Version: 0.2.23
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>