inferencesh 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of inferencesh might be problematic.
- inferencesh/models/llm.py +157 -145
- {inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/METADATA +1 -1
- {inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/RECORD +7 -7
- {inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/WHEEL +0 -0
- {inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/entry_points.txt +0 -0
- {inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/licenses/LICENSE +0 -0
- {inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/top_level.txt +0 -0
inferencesh/models/llm.py
CHANGED
@@ -116,7 +116,7 @@ def timing_context():
 class TimingInfo:
     def __init__(self):
         self.start_time = time.time()
-        self.first_token_time =
+        self.first_token_time = 0
         self.reasoning_start_time = None
         self.total_reasoning_time = 0.0
         self.reasoning_tokens = 0
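The removed initializer's value is truncated in the diff viewer; the replacement seeds first_token_time with 0, which is just as falsy as None for the `if not timing.first_token_time` guard used later in stream_generate. A minimal sketch of a compatible timing object, assuming mark_first_token records a wall-clock timestamp (the real TimingInfo also tracks reasoning time and tokens):

import time

class TimingInfoSketch:
    """Hypothetical, reduced stand-in for TimingInfo (illustration only)."""
    def __init__(self):
        self.start_time = time.time()
        # 0 is falsy, so `if not timing.first_token_time` still fires
        # exactly once, same as a None default.
        self.first_token_time = 0

    def mark_first_token(self):
        # Record when the first streamed chunk arrived.
        self.first_token_time = time.time()

    @property
    def stats(self):
        now = time.time()
        if not self.first_token_time:
            return {"time_to_first_token": 0.0, "generation_time": 0.0}
        return {
            "time_to_first_token": self.first_token_time - self.start_time,
            # Time spent producing tokens after the first one appeared.
            "generation_time": now - self.first_token_time,
        }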
@@ -209,6 +209,115 @@ def build_messages(
     return messages


+class StreamResponse:
+    """Holds a single chunk of streamed response."""
+    def __init__(self):
+        self.content = ""
+        self.tool_calls = None  # Changed from [] to None
+        self.finish_reason = None
+        self.timing_stats = {
+            "time_to_first_token": 0.0,
+            "generation_time": 0.0,
+            "reasoning_time": 0.0,
+            "reasoning_tokens": 0,
+            "tokens_per_second": 0.0
+        }
+        self.usage_stats = {
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "total_tokens": 0,
+            "stop_reason": ""
+        }
+
+    def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
+        """Update response state from a chunk."""
+        # Update usage stats if present
+        if "usage" in chunk and chunk["usage"] is not None:
+            self.usage_stats.update(chunk["usage"])
+
+        # Get the delta from the chunk
+        delta = chunk.get("choices", [{}])[0]
+
+        # Extract content and tool calls from either message or delta
+        if "message" in delta:
+            message = delta["message"]
+            self.content = message.get("content", "")
+            if message.get("tool_calls"):
+                self._update_tool_calls(message["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+        elif "delta" in delta:
+            delta_content = delta["delta"]
+            self.content = delta_content.get("content", "")
+            if delta_content.get("tool_calls"):
+                self._update_tool_calls(delta_content["tool_calls"])
+            self.finish_reason = delta.get("finish_reason")
+
+        # Update timing stats while preserving tokens_per_second
+        timing_stats = timing.stats
+        generation_time = timing_stats["generation_time"]
+        completion_tokens = self.usage_stats.get("completion_tokens", 0)
+        tokens_per_second = (completion_tokens / generation_time) if generation_time > 0 and completion_tokens > 0 else 0.0
+
+        self.timing_stats.update({
+            **timing_stats,
+            "tokens_per_second": tokens_per_second
+        })
+
+    def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
+        """Update tool calls, handling both full and partial updates."""
+        if self.tool_calls is None:
+            self.tool_calls = []
+
+        for tool_delta in new_tool_calls:
+            tool_id = tool_delta.get("id")
+            if not tool_id:
+                continue
+
+            # Find or create tool call
+            current_tool = next((t for t in self.tool_calls if t["id"] == tool_id), None)
+            if not current_tool:
+                current_tool = {
+                    "id": tool_id,
+                    "type": tool_delta.get("type", "function"),
+                    "function": {"name": "", "arguments": ""}
+                }
+                self.tool_calls.append(current_tool)
+
+            # Update tool call
+            if "function" in tool_delta:
+                func_delta = tool_delta["function"]
+                if "name" in func_delta:
+                    current_tool["function"]["name"] = func_delta["name"]
+                if "arguments" in func_delta:
+                    current_tool["function"]["arguments"] += func_delta["arguments"]
+
+    def has_updates(self) -> bool:
+        """Check if this response has any content or tool call updates."""
+        return bool(self.content) or bool(self.tool_calls)
+
+    def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
+        """Convert current state to LLMOutput."""
+        buffer, output, _ = transformer(self.content, buffer)
+
+        # Add tool calls if present
+        if self.tool_calls:
+            output.tool_calls = self.tool_calls
+
+        # Add usage stats if this is final
+        if self.finish_reason:
+            output.usage = LLMUsage(
+                stop_reason=self.usage_stats["stop_reason"],
+                time_to_first_token=self.timing_stats["time_to_first_token"],
+                tokens_per_second=self.timing_stats["tokens_per_second"],
+                prompt_tokens=self.usage_stats["prompt_tokens"],
+                completion_tokens=self.usage_stats["completion_tokens"],
+                total_tokens=self.usage_stats["total_tokens"],
+                reasoning_time=self.timing_stats["reasoning_time"],
+                reasoning_tokens=self.timing_stats["reasoning_tokens"]
+            )
+
+        return output, buffer
+
 class ResponseState:
     """Holds the state of response transformation."""
     def __init__(self):
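To make the accumulation concrete, here is an illustrative sketch of two streamed tool-call deltas being merged by _update_tool_calls; the chunk shapes and the get_weather name are hypothetical, not taken from the package:

# Two OpenAI-style deltas for the same tool call id (hypothetical shapes):
# the first names the function, the second appends more JSON arguments.
first_delta = [{
    "id": "call_1",
    "type": "function",
    "function": {"name": "get_weather", "arguments": '{"city": '},
}]
second_delta = [{
    "id": "call_1",
    "function": {"arguments": '"Paris"}'},
}]

resp = StreamResponse()
resp._update_tool_calls(first_delta)   # creates the entry for "call_1"
resp._update_tool_calls(second_delta)  # appends to its arguments string

assert resp.tool_calls[0]["function"]["name"] == "get_weather"
assert resp.tool_calls[0]["function"]["arguments"] == '{"city": "Paris"}'
assert resp.has_updates()  # tool calls alone count as an update

Names are overwritten while arguments are concatenated, which matches how OpenAI-style streaming servers emit function arguments as JSON fragments.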
@@ -216,7 +325,7 @@ class ResponseState:
         self.response = ""
         self.reasoning = None
         self.function_calls = None  # For future function calling support
-        self.tool_calls =
+        self.tool_calls = None  # List to accumulate tool calls
         self.current_tool_call = None  # Track current tool call being built
         self.state_changes = {
             "reasoning_started": False,
@@ -243,6 +352,9 @@ class ResponseTransformer:
         Returns:
             Cleaned text with common and model-specific tokens removed
         """
+        if text is None:
+            return ""
+
         # Common token cleaning across most models
         cleaned = (text.replace("<|im_end|>", "")
                   .replace("<|im_start|>", "")
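The new guard makes cleaning total over None inputs, which streaming deltas routinely produce for chunks that carry only tool calls or a finish reason. A self-contained sketch of the same pattern, using only the two chat-template tokens visible in this hunk:

def clean_text_sketch(text):
    # Mirror the guard: None (e.g. a content-less delta) becomes ""
    # instead of raising AttributeError on str.replace.
    if text is None:
        return ""
    return text.replace("<|im_end|>", "").replace("<|im_start|>", "")

assert clean_text_sketch(None) == ""
assert clean_text_sketch("hello<|im_end|>") == "hello"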
@@ -366,159 +478,59 @@ class ResponseTransformer:
 def stream_generate(
     model: Any,
     messages: List[Dict[str, Any]],
-
-
-
+    transformer: ResponseTransformer = ResponseTransformer(),
+    tools: Optional[List[Dict[str, Any]]] = None,
+    tool_choice: Optional[Dict[str, Any]] = None,
     temperature: float = 0.7,
     top_p: float = 0.95,
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
+    verbose: bool = False,
 ) -> Generator[LLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""
-    response_queue: Queue[Optional[tuple[str, dict, Optional[List[Dict[str, Any]]]]]] = Queue()
-    thread_exception = None
-    usage_stats = {
-        "prompt_tokens": 0,
-        "completion_tokens": 0,
-        "total_tokens": 0,
-        "stop_reason": ""
-    }
-
     with timing_context() as timing:
         transformer.timing = timing

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            for chunk in completion:
-                if "usage" in chunk and chunk["usage"] is not None:
-                    usage_stats.update(chunk["usage"])
-
-                delta = chunk.get("choices", [{}])[0]
-                content = ""
-                finish_reason = None
-
-                # Extract delta content from either message or delta
-                if "message" in delta:
-                    message = delta["message"]
-                    content = message.get("content", "")
-                    if message.get("tool_calls"):
-                        for tool in message["tool_calls"]:
-                            if tool.get("id") not in {t.get("id") for t in tool_calls}:
-                                tool_calls.append(tool)
-                    finish_reason = delta.get("finish_reason")
-                elif "delta" in delta:
-                    delta_content = delta["delta"]
-                    content = delta_content.get("content", "")
-
-                    # Handle streaming tool calls
-                    if delta_content.get("tool_calls"):
-                        for tool_delta in delta_content["tool_calls"]:
-                            tool_id = tool_delta.get("id")
-
-                            # Find or create tool call
-                            if tool_id:
-                                current_tool = next((t for t in tool_calls if t["id"] == tool_id), None)
-                                if not current_tool:
-                                    current_tool = {
-                                        "id": tool_id,
-                                        "type": tool_delta.get("type", "function"),
-                                        "function": {"name": "", "arguments": ""}
-                                    }
-                                    tool_calls.append(current_tool)
-
-                            # Update tool call
-                            if current_tool and "function" in tool_delta:
-                                func_delta = tool_delta["function"]
-                                if "name" in func_delta:
-                                    current_tool["function"]["name"] = func_delta["name"]
-                                if "arguments" in func_delta:
-                                    current_tool["function"]["arguments"] += func_delta["arguments"]
-
-                    finish_reason = delta.get("finish_reason")
-
-                has_update = bool(content)
-                has_tool_update = bool(
-                    (delta.get("message", {}) or {}).get("tool_calls") or
-                    (delta.get("delta", {}) or {}).get("tool_calls")
-                )
-
-                if has_update or has_tool_update:
-                    if not timing.first_token_time:
-                        timing.mark_first_token()
-                    response_queue.put((content, {}, tool_calls[:] if tool_calls else None))
-
-                if finish_reason:
-                    usage_stats["stop_reason"] = finish_reason
-
-        except Exception as e:
-            thread_exception = e
-        finally:
-            timing_stats = timing.stats
-            generation_time = timing_stats["generation_time"]
-            tokens_per_second = (usage_stats["completion_tokens"] / generation_time) if generation_time > 0 else 0
-            response_queue.put((None, {
-                "time_to_first_token": timing_stats["time_to_first_token"],
-                "tokens_per_second": tokens_per_second,
-                "reasoning_time": timing_stats["reasoning_time"],
-                "reasoning_tokens": timing_stats["reasoning_tokens"]
-            }, tool_calls if tool_calls else None))
-
-        thread = Thread(target=generation_thread, daemon=True)
-        thread.start()
-
+        # Build completion kwargs
+        completion_kwargs = {
+            "messages": messages,
+            "stream": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "max_tokens": max_tokens,
+            "stop": stop
+        }
+        if tools is not None:
+            completion_kwargs["tools"] = tools
+        if tool_choice is not None:
+            completion_kwargs["tool_choice"] = tool_choice
+
+        # Initialize response state
+        response = StreamResponse()
         buffer = ""
+
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    total_tokens=usage_stats["total_tokens"],
-                    reasoning_time=timing_stats["reasoning_time"],
-                    reasoning_tokens=timing_stats["reasoning_tokens"]
-                )
-
-                buffer, output, _ = transformer(piece or "", buffer)
-                output.usage = usage
-                if tool_calls:
-                    output.tool_calls = tool_calls
-                yield output
-                break
-
-            buffer, output, _ = transformer(piece, buffer)
-            if tool_calls:
-                output.tool_calls = tool_calls
+            completion = model.create_chat_completion(**completion_kwargs)
+
+            for chunk in completion:
+                if verbose:
+                    print(chunk)
+                # Mark first token time as soon as we get any response
+                if not timing.first_token_time:
+                    timing.mark_first_token()
+
+                # Update response state from chunk
+                response.update_from_chunk(chunk, timing)
+
+                # Yield output if we have updates
+                if response.has_updates():
+                    output, buffer = response.to_output(buffer, transformer)
                     yield output
-
-
-
-            raise thread_exception
+
+                # Break if we're done
+                if response.finish_reason:
                     break
-
-
-
+
+        except Exception as e:
+            # Ensure any error is properly propagated
+            raise e
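The rewrite drops the old queue-and-thread pipeline and iterates model.create_chat_completion(..., stream=True) directly, with StreamResponse doing the per-chunk bookkeeping. A hedged usage sketch of the new signature; the model path and the get_weather tool schema are illustrative, and Llama is llama-cpp-python's entry point, which the docstring implies but this diff does not show:

from llama_cpp import Llama  # assumption: llama.cpp backend via llama-cpp-python

from inferencesh.models.llm import stream_generate

model = Llama(model_path="model.gguf")  # hypothetical local model file

# Illustrative OpenAI-style tool schema (not shipped with the package).
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
        },
    },
}]

messages = [{"role": "user", "content": "What's the weather in Paris?"}]

for output in stream_generate(model, messages, tools=tools):
    if output.tool_calls:                # accumulated across deltas by StreamResponse
        print(output.tool_calls)
    if getattr(output, "usage", None):   # LLMUsage, set only on the final chunk
        print(output.usage.tokens_per_second)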
{inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/RECORD
CHANGED
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=
+inferencesh/models/llm.py,sha256=knvwpKECQb67rG8VIt-VmZu0aDVpABzQiifrytAfv9s,20932
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
+inferencesh-0.2.23.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.23.dist-info/METADATA,sha256=w5AOt2foy30CdqgfcivGhBflpWOvtm1B7tHEJo_ipVE,2757
+inferencesh-0.2.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.23.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.23.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.23.dist-info/RECORD,,
{inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/WHEEL: file without changes
{inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/entry_points.txt: file without changes
{inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/licenses/LICENSE: file without changes
{inferencesh-0.2.21.dist-info → inferencesh-0.2.23.dist-info}/top_level.txt: file without changes