inferencesh-0.2.22-py3-none-any.whl → inferencesh-0.2.24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of inferencesh might be problematic.

inferencesh/models/llm.py CHANGED
@@ -10,7 +10,6 @@ import base64
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
 
-
 class ContextMessageRole(str, Enum):
     USER = "user"
     ASSISTANT = "assistant"
@@ -140,12 +139,17 @@ def timing_context():
 
     @property
     def stats(self):
-        end_time = time.time()
+        current_time = time.time()
         if self.first_token_time is None:
-            self.first_token_time = end_time
+            return {
+                "time_to_first_token": 0.0,
+                "generation_time": 0.0,
+                "reasoning_time": self.total_reasoning_time,
+                "reasoning_tokens": self.reasoning_tokens
+            }
 
         time_to_first = self.first_token_time - self.start_time
-        generation_time = end_time - self.first_token_time
+        generation_time = current_time - self.first_token_time
 
         return {
             "time_to_first_token": time_to_first,
@@ -209,6 +213,151 @@ def build_messages(
     return messages
 
 
+class StreamResponse:
+    """Holds a single chunk of streamed response."""
+    def __init__(self):
+        self.content = ""
+        self.tool_calls = None # Changed from [] to None
+        self.finish_reason = None
+        self.timing_stats = {
+            "time_to_first_token": None, # Changed from 0.0 to None
+            "generation_time": 0.0,
+            "reasoning_time": 0.0,
+            "reasoning_tokens": 0,
+            "tokens_per_second": 0.0
+        }
+        self.usage_stats = {
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "total_tokens": 0,
+            "stop_reason": ""
+        }
+
+    def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
+        """Update response state from a chunk."""
+        print("DEBUG: Entering update_from_chunk")
+        print(f"DEBUG: Current usage stats: {self.usage_stats}")
+        print(f"DEBUG: Chunk: {chunk}")
+
+        # Update usage stats if present
+        if "usage" in chunk:
+            usage = chunk["usage"]
+            if usage is not None:
+                print(f"DEBUG: Updating usage stats with: {usage}")
+                # Update usage stats preserving existing values if not provided
+                self.usage_stats.update({
+                    "prompt_tokens": usage.get("prompt_tokens", self.usage_stats["prompt_tokens"]),
+                    "completion_tokens": usage.get("completion_tokens", self.usage_stats["completion_tokens"]),
+                    "total_tokens": usage.get("total_tokens", self.usage_stats["total_tokens"])
+                })
+                print(f"DEBUG: Updated usage stats: {self.usage_stats}")
+
+        # Get the delta from the chunk
+        delta = chunk.get("choices", [{}])[0]
+
+        # Extract content and tool calls from either message or delta
+        if "message" in delta:
+            message = delta["message"]
+            self.content = message.get("content", "")
+            if message.get("tool_calls"):
+                self._update_tool_calls(message["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
+        elif "delta" in delta:
+            delta_content = delta["delta"]
+            self.content = delta_content.get("content", "")
+            if delta_content.get("tool_calls"):
+                self._update_tool_calls(delta_content["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
+
+        # Update timing stats
+        timing_stats = timing.stats
+        if self.timing_stats["time_to_first_token"] is None:
+            self.timing_stats["time_to_first_token"] = timing_stats["time_to_first_token"]
+
+        self.timing_stats.update({
+            "generation_time": timing_stats["generation_time"],
+            "reasoning_time": timing_stats["reasoning_time"],
+            "reasoning_tokens": timing_stats["reasoning_tokens"]
+        })
+
+        # Calculate tokens per second only if we have valid completion tokens and generation time
+        if self.usage_stats["completion_tokens"] > 0 and timing_stats["generation_time"] > 0:
+            self.timing_stats["tokens_per_second"] = (
+                self.usage_stats["completion_tokens"] / timing_stats["generation_time"]
+            )
+
+        print(f"DEBUG: Final usage stats in update_from_chunk: {self.usage_stats}")
+
+    def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
+        """Update tool calls, handling both full and partial updates."""
+        if self.tool_calls is None:
+            self.tool_calls = []
+
+        for tool_delta in new_tool_calls:
+            tool_id = tool_delta.get("id")
+            if not tool_id:
+                continue
+
+            # Find or create tool call
+            current_tool = next((t for t in self.tool_calls if t["id"] == tool_id), None)
+            if not current_tool:
+                current_tool = {
+                    "id": tool_id,
+                    "type": tool_delta.get("type", "function"),
+                    "function": {"name": "", "arguments": ""}
+                }
+                self.tool_calls.append(current_tool)
+
+            # Update tool call
+            if "function" in tool_delta:
+                func_delta = tool_delta["function"]
+                if "name" in func_delta:
+                    current_tool["function"]["name"] = func_delta["name"]
+                if "arguments" in func_delta:
+                    current_tool["function"]["arguments"] += func_delta["arguments"]
+
+    def has_updates(self) -> bool:
+        """Check if this response has any content, tool call, or usage updates."""
+        has_content = bool(self.content)
+        has_tool_calls = bool(self.tool_calls)
+        has_usage = self.usage_stats["prompt_tokens"] > 0 or self.usage_stats["completion_tokens"] > 0
+        has_finish = bool(self.finish_reason)
+
+        print(f"DEBUG: has_updates check - content: {has_content}, tool_calls: {has_tool_calls}, usage: {has_usage}, finish: {has_finish}")
+
+        return has_content or has_tool_calls or has_usage or has_finish
+
+    def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
+        """Convert current state to LLMOutput."""
+        print("DEBUG: Entering to_output")
+        print(f"DEBUG: Usage stats before conversion: {self.usage_stats}")
+
+        buffer, output, _ = transformer(self.content, buffer)
+
+        # Add tool calls if present
+        if self.tool_calls:
+            output.tool_calls = self.tool_calls
+
+        # Add usage stats
+        print(f"DEBUG: Creating LLMUsage with stats: {self.usage_stats}")
+        output.usage = LLMUsage(
+            stop_reason=self.usage_stats["stop_reason"],
+            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+            tokens_per_second=self.timing_stats["tokens_per_second"],
+            prompt_tokens=self.usage_stats["prompt_tokens"],
+            completion_tokens=self.usage_stats["completion_tokens"],
+            total_tokens=self.usage_stats["total_tokens"],
+            reasoning_time=self.timing_stats["reasoning_time"],
+            reasoning_tokens=self.timing_stats["reasoning_tokens"]
+        )
+        print(f"DEBUG: Created output usage: {output.usage}")
+
+        return output, buffer
+
 class ResponseState:
     """Holds the state of response transformation."""
     def __init__(self):
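
Note: the new StreamResponse consumes the OpenAI-style chunk dictionaries that llama-cpp-python emits while streaming (choices[0]["delta"] or choices[0]["message"], plus an optional usage block), which is the shape update_from_chunk above parses. A rough, self-contained sketch of feeding two such chunks through it; the _FakeTiming stub and the literal values are illustrative only, not part of the package:

from inferencesh.models.llm import StreamResponse

class _FakeTiming:
    # Stand-in for the object yielded by timing_context(); StreamResponse
    # only reads its .stats dictionary.
    @property
    def stats(self):
        return {
            "time_to_first_token": 0.1,
            "generation_time": 0.5,
            "reasoning_time": 0.0,
            "reasoning_tokens": 0,
        }

response = StreamResponse()

# A delta-style chunk, as emitted while tokens are streaming.
response.update_from_chunk(
    {"choices": [{"delta": {"content": "Hello"}, "finish_reason": None}]},
    _FakeTiming(),
)

# A final chunk carrying usage numbers and the stop reason.
response.update_from_chunk(
    {
        "choices": [{"delta": {}, "finish_reason": "stop"}],
        "usage": {"prompt_tokens": 12, "completion_tokens": 3, "total_tokens": 15},
    },
    _FakeTiming(),
)

print(response.has_updates())  # True: usage and finish_reason were captured
print(response.usage_stats)    # stop_reason == "stop", token counts filled in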
@@ -216,7 +365,7 @@ class ResponseState:
         self.response = ""
         self.reasoning = None
         self.function_calls = None # For future function calling support
-        self.tool_calls = [] # List to accumulate tool calls
+        self.tool_calls = None # List to accumulate tool calls
         self.current_tool_call = None # Track current tool call being built
         self.state_changes = {
             "reasoning_started": False,
@@ -243,6 +392,9 @@ class ResponseTransformer:
         Returns:
             Cleaned text with common and model-specific tokens removed
         """
+        if text is None:
+            return ""
+
         # Common token cleaning across most models
         cleaned = (text.replace("<|im_end|>", "")
                    .replace("<|im_start|>", "")
@@ -367,158 +519,58 @@ def stream_generate(
     model: Any,
     messages: List[Dict[str, Any]],
     transformer: ResponseTransformer = ResponseTransformer(),
-    tools: List[Dict[str, Any]] = [],
-    tool_choice: Dict[str, Any] = {},
+    tools: Optional[List[Dict[str, Any]]] = None,
+    tool_choice: Optional[Dict[str, Any]] = None,
     temperature: float = 0.7,
     top_p: float = 0.95,
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
+    verbose: bool = False,
 ) -> Generator[LLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
-    response_queue: Queue[Optional[tuple[str, dict, Optional[List[Dict[str, Any]]]]]] = Queue()
-    thread_exception = None
-    usage_stats = {
-        "prompt_tokens": 0,
-        "completion_tokens": 0,
-        "total_tokens": 0,
-        "stop_reason": ""
-    }
-
     with timing_context() as timing:
         transformer.timing = timing
 
-        def generation_thread():
-            nonlocal thread_exception, usage_stats
-            try:
-                completion = model.create_chat_completion(
-                    messages=messages,
-                    tools=tools,
-                    tool_choice=tool_choice,
-                    stream=True,
-                    temperature=temperature,
-                    top_p=top_p,
-                    max_tokens=max_tokens,
-                    stop=stop
-                )
-
-                tool_calls = []
-                current_tool = None
-
-                for chunk in completion:
-                    if "usage" in chunk and chunk["usage"] is not None:
-                        usage_stats.update(chunk["usage"])
-
-                    delta = chunk.get("choices", [{}])[0]
-                    content = ""
-                    finish_reason = None
-
-                    # Extract delta content from either message or delta
-                    if "message" in delta:
-                        message = delta["message"]
-                        content = message.get("content", "")
-                        if message.get("tool_calls"):
-                            for tool in message["tool_calls"]:
-                                if tool.get("id") not in {t.get("id") for t in tool_calls}:
-                                    tool_calls.append(tool)
-                        finish_reason = delta.get("finish_reason")
-                    elif "delta" in delta:
-                        delta_content = delta["delta"]
-                        content = delta_content.get("content", "")
-
-                        # Handle streaming tool calls
-                        if delta_content.get("tool_calls"):
-                            for tool_delta in delta_content["tool_calls"]:
-                                tool_id = tool_delta.get("id")
-
-                                # Find or create tool call
-                                if tool_id:
-                                    current_tool = next((t for t in tool_calls if t["id"] == tool_id), None)
-                                    if not current_tool:
-                                        current_tool = {
-                                            "id": tool_id,
-                                            "type": tool_delta.get("type", "function"),
-                                            "function": {"name": "", "arguments": ""}
-                                        }
-                                        tool_calls.append(current_tool)
-
-                                # Update tool call
-                                if current_tool and "function" in tool_delta:
-                                    func_delta = tool_delta["function"]
-                                    if "name" in func_delta:
-                                        current_tool["function"]["name"] = func_delta["name"]
-                                    if "arguments" in func_delta:
-                                        current_tool["function"]["arguments"] += func_delta["arguments"]
-
-                        finish_reason = delta.get("finish_reason")
-
-                    has_update = bool(content)
-                    has_tool_update = bool(
-                        (delta.get("message", {}) or {}).get("tool_calls") or
-                        (delta.get("delta", {}) or {}).get("tool_calls")
-                    )
-
-                    if has_update or has_tool_update:
-                        if not timing.first_token_time:
-                            timing.mark_first_token()
-                        response_queue.put((content, {}, tool_calls[:] if tool_calls else None))
-
-                    if finish_reason:
-                        usage_stats["stop_reason"] = finish_reason
-
-            except Exception as e:
-                thread_exception = e
-            finally:
-                timing_stats = timing.stats
-                generation_time = timing_stats["generation_time"]
-                tokens_per_second = (usage_stats["completion_tokens"] / generation_time) if generation_time > 0 else 0
-                response_queue.put((None, {
-                    "time_to_first_token": timing_stats["time_to_first_token"],
-                    "tokens_per_second": tokens_per_second,
-                    "reasoning_time": timing_stats["reasoning_time"],
-                    "reasoning_tokens": timing_stats["reasoning_tokens"]
-                }, tool_calls if tool_calls else None))
-
-        thread = Thread(target=generation_thread, daemon=True)
-        thread.start()
-
+        # Build completion kwargs
+        completion_kwargs = {
+            "messages": messages,
+            "stream": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "max_tokens": max_tokens,
+            "stop": stop
+        }
+        if tools is not None:
+            completion_kwargs["tools"] = tools
+        if tool_choice is not None:
+            completion_kwargs["tool_choice"] = tool_choice
+
+        # Initialize response state
+        response = StreamResponse()
         buffer = ""
+
         try:
-            while True:
-                try:
-                    result = response_queue.get(timeout=30.0)
-                    if thread_exception:
-                        raise thread_exception
-
-                    piece, timing_stats, tool_calls = result
-                    if piece is None:
-                        # Final yield with complete usage stats
-                        usage = LLMUsage(
-                            stop_reason=usage_stats["stop_reason"],
-                            time_to_first_token=timing_stats["time_to_first_token"],
-                            tokens_per_second=timing_stats["tokens_per_second"],
-                            prompt_tokens=usage_stats["prompt_tokens"],
-                            completion_tokens=usage_stats["completion_tokens"],
-                            total_tokens=usage_stats["total_tokens"],
-                            reasoning_time=timing_stats["reasoning_time"],
-                            reasoning_tokens=timing_stats["reasoning_tokens"]
-                        )
-
-                        buffer, output, _ = transformer(piece or "", buffer)
-                        output.usage = usage
-                        if tool_calls:
-                            output.tool_calls = tool_calls
-                        yield output
-                        break
-
-                    buffer, output, _ = transformer(piece, buffer)
-                    if tool_calls:
-                        output.tool_calls = tool_calls
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                # Mark first token time as soon as we get any response
+                if not timing.first_token_time:
+                    timing.mark_first_token()
+
+                # Update response state from chunk
+                response.update_from_chunk(chunk, timing)
+
+                # Yield output if we have updates
+                if response.has_updates():
+                    output, buffer = response.to_output(buffer, transformer)
                     yield output
-
-                except Exception as e:
-                    if thread_exception and isinstance(e, thread_exception.__class__):
-                        raise thread_exception
+
+                # Break if we're done
+                if response.finish_reason:
                     break
-        finally:
-            if thread and thread.is_alive():
-                thread.join(timeout=2.0)
+
+        except Exception as e:
+            # Ensure any error is properly propagated
+            raise e
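
Note: taken together, these llm.py changes replace the queue-and-thread streaming pipeline with a single loop that pushes each chunk through StreamResponse. A minimal usage sketch under these assumptions: the model is a llama-cpp-python Llama instance (or any object whose create_chat_completion(stream=True, ...) yields chunks in the format shown above), and the model path is a placeholder:

from llama_cpp import Llama

from inferencesh.models.llm import stream_generate

model = Llama(model_path="model.gguf")  # placeholder path/settings
messages = [{"role": "user", "content": "Say hello."}]

# tools / tool_choice can now be omitted entirely (they default to None);
# verbose=True prints each raw chunk as it arrives.
for output in stream_generate(model, messages, verbose=True):
    # Each yielded LLMOutput carries the transformed text plus timing and
    # usage stats accumulated so far.
    print(output)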
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.22
+Version: 0.2.24
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=XVHsHANGHXhB54aXAS-YcQNcgM673Q_b90xa10gorbA,21729
+inferencesh/models/llm.py,sha256=ycg20sSx3UJevjoTVukBZXwRyXY06tFZKAmlVp0MBzQ,23168
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.22.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
-inferencesh-0.2.22.dist-info/METADATA,sha256=o78bpkWPq1MqQH_qgT3VTK1hJLKrZRHUX8e5PVuS_4M,2757
-inferencesh-0.2.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-inferencesh-0.2.22.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
-inferencesh-0.2.22.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
-inferencesh-0.2.22.dist-info/RECORD,,
+inferencesh-0.2.24.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.24.dist-info/METADATA,sha256=kQq9qN65EU9DS-SAQHm3Sw73yzz-FZVQX6ueHSgktW8,2757
+inferencesh-0.2.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.24.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.24.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.24.dist-info/RECORD,,