inferencesh 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of inferencesh might be problematic.
- inferencesh/models/llm.py +46 -20
- {inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/METADATA +1 -1
- {inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/RECORD +7 -7
- {inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/WHEEL +0 -0
- {inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/entry_points.txt +0 -0
- {inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/licenses/LICENSE +0 -0
- {inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/top_level.txt +0 -0
inferencesh/models/llm.py
CHANGED
@@ -216,7 +216,8 @@ class ResponseState:
         self.response = ""
         self.reasoning = None
         self.function_calls = None  # For future function calling support
-        self.tool_calls =
+        self.tool_calls = []  # List to accumulate tool calls
+        self.current_tool_call = None  # Track current tool call being built
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
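For context, the list that `self.tool_calls` accumulates holds OpenAI-style tool-call dicts, the same shape the streaming loop later in this diff builds up. A minimal sketch of one accumulated entry (the id, name, and arguments values here are hypothetical):

    # Hypothetical example of one entry in ResponseState.tool_calls after a
    # complete tool call has streamed in; the field layout matches the
    # construction in the stream_generate hunk below.
    accumulated_tool_call = {
        "id": "call_abc123",                   # hypothetical id assigned by the model
        "type": "function",
        "function": {
            "name": "get_weather",             # hypothetical function name
            "arguments": '{"city": "Paris"}',  # JSON string, concatenated chunk by chunk
        },
    }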
@@ -373,17 +374,7 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
 ) -> Generator[LLMOutput, None, None]:
-    """Stream generate from LLaMA.cpp model with timing and usage tracking.
-
-    Args:
-        model: The LLaMA.cpp model instance
-        messages: List of messages to send to the model
-        transformer: ResponseTransformer instance to use for processing output
-        temperature: Sampling temperature
-        top_p: Top-p sampling threshold
-        max_tokens: Maximum tokens to generate
-        stop: Optional list of stop sequences
-    """
+    """Stream generate from LLaMA.cpp model with timing and usage tracking."""
     response_queue: Queue[Optional[tuple[str, dict, Optional[List[Dict[str, Any]]]]]] = Queue()
     thread_exception = None
     usage_stats = {
@@ -394,7 +385,6 @@ def stream_generate(
     }
 
     with timing_context() as timing:
-        # Set timing context in transformer
         transformer.timing = timing
 
        def generation_thread():
@@ -411,30 +401,66 @@ def stream_generate(
                stop=stop
            )
 
+           tool_calls = []
+           current_tool = None
+
            for chunk in completion:
                if "usage" in chunk and chunk["usage"] is not None:
                    usage_stats.update(chunk["usage"])
 
                delta = chunk.get("choices", [{}])[0]
-               content =
+               content = ""
                finish_reason = None
-               tool_calls = None
 
+               # Extract delta content from either message or delta
                if "message" in delta:
                    message = delta["message"]
                    content = message.get("content", "")
-
+                   if message.get("tool_calls"):
+                       for tool in message["tool_calls"]:
+                           if tool.get("id") not in {t.get("id") for t in tool_calls}:
+                               tool_calls.append(tool)
                    finish_reason = delta.get("finish_reason")
                elif "delta" in delta:
                    delta_content = delta["delta"]
                    content = delta_content.get("content", "")
-
+
+                   # Handle streaming tool calls
+                   if delta_content.get("tool_calls"):
+                       for tool_delta in delta_content["tool_calls"]:
+                           tool_id = tool_delta.get("id")
+
+                           # Find or create tool call
+                           if tool_id:
+                               current_tool = next((t for t in tool_calls if t["id"] == tool_id), None)
+                               if not current_tool:
+                                   current_tool = {
+                                       "id": tool_id,
+                                       "type": tool_delta.get("type", "function"),
+                                       "function": {"name": "", "arguments": ""}
+                                   }
+                                   tool_calls.append(current_tool)
+
+                           # Update tool call
+                           if current_tool and "function" in tool_delta:
+                               func_delta = tool_delta["function"]
+                               if "name" in func_delta:
+                                   current_tool["function"]["name"] = func_delta["name"]
+                               if "arguments" in func_delta:
+                                   current_tool["function"]["arguments"] += func_delta["arguments"]
+
                    finish_reason = delta.get("finish_reason")
 
-
+               has_update = bool(content)
+               has_tool_update = bool(
+                   (delta.get("message", {}) or {}).get("tool_calls") or
+                   (delta.get("delta", {}) or {}).get("tool_calls")
+               )
+
+               if has_update or has_tool_update:
                    if not timing.first_token_time:
                        timing.mark_first_token()
-                   response_queue.put((content
+                   response_queue.put((content, {}, tool_calls[:] if tool_calls else None))
 
                if finish_reason:
                    usage_stats["stop_reason"] = finish_reason
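To make the new accumulation logic concrete, here is a self-contained replay of the find-or-create-by-id loop from the hunk above, run over a few hypothetical streamed tool-call fragments (the fragment shapes mirror the `delta_content["tool_calls"]` entries handled above; the id and argument values are made up):

    # Replays the tool-call accumulation from the diff: fragments sharing an
    # id are merged into one entry, and "arguments" strings are concatenated.
    deltas = [
        [{"id": "call_1", "type": "function",
          "function": {"name": "get_weather", "arguments": ""}}],
        [{"id": "call_1", "function": {"arguments": '{"city": "'}}],
        [{"id": "call_1", "function": {"arguments": 'Paris"}'}}],
    ]

    tool_calls = []
    current_tool = None
    for chunk_tool_calls in deltas:
        for tool_delta in chunk_tool_calls:
            tool_id = tool_delta.get("id")
            if tool_id:
                # Find or create the entry for this id
                current_tool = next((t for t in tool_calls if t["id"] == tool_id), None)
                if not current_tool:
                    current_tool = {"id": tool_id,
                                    "type": tool_delta.get("type", "function"),
                                    "function": {"name": "", "arguments": ""}}
                    tool_calls.append(current_tool)
            # Merge this fragment into the current entry
            if current_tool and "function" in tool_delta:
                func_delta = tool_delta["function"]
                if "name" in func_delta:
                    current_tool["function"]["name"] = func_delta["name"]
                if "arguments" in func_delta:
                    current_tool["function"]["arguments"] += func_delta["arguments"]

    print(tool_calls)
    # [{'id': 'call_1', 'type': 'function',
    #   'function': {'name': 'get_weather', 'arguments': '{"city": "Paris"}'}}]

Note that later fragments may omit the id entirely; the loop then falls through to `current_tool`, which still points at the entry created by the first fragment.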
@@ -450,7 +476,7 @@ def stream_generate(
                "tokens_per_second": tokens_per_second,
                "reasoning_time": timing_stats["reasoning_time"],
                "reasoning_tokens": timing_stats["reasoning_tokens"]
-           }, None))
+           }, tool_calls if tool_calls else None))
 
    thread = Thread(target=generation_thread, daemon=True)
    thread.start()
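Per the `Queue` annotation earlier in the diff, each queued item is a `(content, stats, tool_calls)` tuple. A hypothetical consumer loop, for illustration only (treating `None` as the end-of-stream sentinel is an assumption, since the sentinel handling sits outside the shown hunks):

    # Hypothetical drain of response_queue; the tuple shape follows the Queue
    # type annotation in the diff. None-as-sentinel is an assumption about
    # code outside these hunks.
    while True:
        item = response_queue.get()
        if item is None:
            break
        content, stats, tool_calls = item
        if content:
            print(content, end="")
        if tool_calls:
            for call in tool_calls:
                print(f'\n[tool] {call["function"]["name"]}({call["function"]["arguments"]})')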
{inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/RECORD
CHANGED
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=
+inferencesh/models/llm.py,sha256=jzTpOp65DtZSqQUtnwNF-_OBQVqCQHX3GOhOvSqkmbc,21695
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
+inferencesh-0.2.21.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.21.dist-info/METADATA,sha256=qMs9bH6l5e194tUwq6egASxXTbEehhPZF_4QsQlQZrA,2757
+inferencesh-0.2.21.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.21.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.21.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.21.dist-info/RECORD,,
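For reference, each RECORD row is `path,sha256=<digest>,size`, where the digest is the URL-safe base64 encoding of the file's SHA-256 hash with the `=` padding stripped, per the wheel RECORD format. A small sketch of how an entry could be verified (the file path is illustrative):

    import base64
    import hashlib

    def record_digest(path: str) -> str:
        """Return the sha256=... token RECORD uses: URL-safe base64, unpadded."""
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # e.g. record_digest("inferencesh/models/llm.py") should equal
    # "sha256=jzTpOp65DtZSqQUtnwNF-_OBQVqCQHX3GOhOvSqkmbc" for the 0.2.21 wheel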
{inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/WHEEL
File without changes
{inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/entry_points.txt
File without changes
{inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/licenses/LICENSE
File without changes
{inferencesh-0.2.19.dist-info → inferencesh-0.2.21.dist-info}/top_level.txt
File without changes