inferencesh 0.2.18__tar.gz → 0.2.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of inferencesh has been flagged as possibly problematic.
- {inferencesh-0.2.18/src/inferencesh.egg-info → inferencesh-0.2.20}/PKG-INFO +1 -1
- {inferencesh-0.2.18 → inferencesh-0.2.20}/pyproject.toml +1 -1
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh/models/llm.py +41 -21
- {inferencesh-0.2.18 → inferencesh-0.2.20/src/inferencesh.egg-info}/PKG-INFO +1 -1
- {inferencesh-0.2.18 → inferencesh-0.2.20}/LICENSE +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/README.md +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/setup.cfg +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/setup.py +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh/__init__.py +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh/models/__init__.py +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh/models/base.py +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh/models/file.py +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh/utils/__init__.py +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh/utils/download.py +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh/utils/storage.py +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh.egg-info/SOURCES.txt +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh.egg-info/dependency_links.txt +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh.egg-info/entry_points.txt +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh.egg-info/requires.txt +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/src/inferencesh.egg-info/top_level.txt +0 -0
- {inferencesh-0.2.18 → inferencesh-0.2.20}/tests/test_sdk.py +0 -0
src/inferencesh/models/llm.py:

@@ -192,7 +192,7 @@ def build_messages(
                 raise ValueError("Image content requires multipart support")
 
     multipart = any(m.image for m in input_data.context) or input_data.image is not None
-    messages = [{"role": "system", "content": input_data.system_prompt}]
+    messages = [{"role": "system", "content": input_data.system_prompt}] if input_data.system_prompt is not None and input_data.system_prompt != "" else []
 
     for msg in input_data.context:
         messages.append({
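The one-line change above makes `build_messages` skip the leading system message when `system_prompt` is unset or empty, instead of always emitting one with possibly empty content. A minimal standalone sketch of the new behavior (the `SimpleNamespace` stand-in for the package's input model is hypothetical):

```python
from types import SimpleNamespace

def head_messages(input_data):
    # Mirrors the new conditional: seed a system message only when a
    # non-empty system prompt is present.
    if input_data.system_prompt is not None and input_data.system_prompt != "":
        return [{"role": "system", "content": input_data.system_prompt}]
    return []

print(head_messages(SimpleNamespace(system_prompt="You are helpful.")))
# -> [{'role': 'system', 'content': 'You are helpful.'}]
print(head_messages(SimpleNamespace(system_prompt="")))    # -> []
print(head_messages(SimpleNamespace(system_prompt=None)))  # -> []
```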
@@ -216,7 +216,8 @@ class ResponseState:
         self.response = ""
         self.reasoning = None
         self.function_calls = None  # For future function calling support
-        self.tool_calls =
+        self.tool_calls = []  # List to accumulate tool calls
+        self.current_tool_call = None  # Track current tool call being built
         self.state_changes = {
             "reasoning_started": False,
             "reasoning_ended": False,
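`ResponseState` gains a list for completed tool calls and a slot for the call currently being assembled. The list form matters because, in streamed responses, a function call's JSON arguments typically arrive as string fragments spread across many chunks and have to be concatenated before they can be parsed. A self-contained illustration (the fragment values are invented):

```python
import json

# Hypothetical argument fragments as they might arrive over several chunks.
fragments = ['{"loc', 'ation": "Par', 'is", "unit": "celsius"}']

arguments = ""
for piece in fragments:
    arguments += piece  # the same accumulation the new state fields support

print(json.loads(arguments))
# -> {'location': 'Paris', 'unit': 'celsius'}
```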
@@ -373,17 +374,7 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
 ) -> Generator[LLMOutput, None, None]:
-    """Stream generate from LLaMA.cpp model with timing and usage tracking.
-
-    Args:
-        model: The LLaMA.cpp model instance
-        messages: List of messages to send to the model
-        transformer: ResponseTransformer instance to use for processing output
-        temperature: Sampling temperature
-        top_p: Top-p sampling threshold
-        max_tokens: Maximum tokens to generate
-        stop: Optional list of stop sequences
-    """
+    """Stream generate from LLaMA.cpp model with timing and usage tracking."""
     response_queue: Queue[Optional[tuple[str, dict, Optional[List[Dict[str, Any]]]]]] = Queue()
     thread_exception = None
     usage_stats = {
@@ -394,7 +385,6 @@ def stream_generate(
     }
 
     with timing_context() as timing:
-        # Set timing context in transformer
         transformer.timing = timing
 
         def generation_thread():
@@ -411,30 +401,60 @@ def stream_generate(
                 stop=stop
             )
 
+            tool_calls = []
+            current_tool = None
+
             for chunk in completion:
                 if "usage" in chunk and chunk["usage"] is not None:
                     usage_stats.update(chunk["usage"])
 
                 delta = chunk.get("choices", [{}])[0]
-                content =
+                content = ""
                 finish_reason = None
-                tool_calls = None
 
+                # Extract delta content from either message or delta
                 if "message" in delta:
                     message = delta["message"]
                     content = message.get("content", "")
-                    tool_calls
+                    if "tool_calls" in message:
+                        for tool in message["tool_calls"]:
+                            if tool.get("id") not in {t.get("id") for t in tool_calls}:
+                                tool_calls.append(tool)
                     finish_reason = delta.get("finish_reason")
                 elif "delta" in delta:
                     delta_content = delta["delta"]
                     content = delta_content.get("content", "")
-
+
+                    # Handle streaming tool calls
+                    if "tool_calls" in delta_content:
+                        for tool_delta in delta_content["tool_calls"]:
+                            tool_id = tool_delta.get("id")
+
+                            # Find or create tool call
+                            if tool_id:
+                                current_tool = next((t for t in tool_calls if t["id"] == tool_id), None)
+                                if not current_tool:
+                                    current_tool = {
+                                        "id": tool_id,
+                                        "type": tool_delta.get("type", "function"),
+                                        "function": {"name": "", "arguments": ""}
+                                    }
+                                    tool_calls.append(current_tool)
+
+                            # Update tool call
+                            if current_tool and "function" in tool_delta:
+                                func_delta = tool_delta["function"]
+                                if "name" in func_delta:
+                                    current_tool["function"]["name"] = func_delta["name"]
+                                if "arguments" in func_delta:
+                                    current_tool["function"]["arguments"] += func_delta["arguments"]
+
                     finish_reason = delta.get("finish_reason")
 
-                if content or tool_calls:
+                if content or "tool_calls" in (delta.get("message", {}) or delta.get("delta", {})):
                     if not timing.first_token_time:
                         timing.mark_first_token()
-                    response_queue.put((content
+                    response_queue.put((content, {}, tool_calls[:] if tool_calls else None))
 
                 if finish_reason:
                     usage_stats["stop_reason"] = finish_reason
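The expanded streaming branch assembles tool calls incrementally: a delta that carries an `id` starts (or re-selects) a call, and subsequent fragments extend its `function.name` and `function.arguments`. A minimal re-implementation of that merge, runnable against fabricated deltas (the payloads below are invented for illustration, not real llama.cpp output):

```python
from typing import Any, Dict, List, Optional

def merge_tool_deltas(deltas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fold streamed tool-call fragments into complete tool-call dicts."""
    tool_calls: List[Dict[str, Any]] = []
    current: Optional[Dict[str, Any]] = None
    for d in deltas:
        tool_id = d.get("id")
        if tool_id:  # a new id starts (or re-selects) a tool call
            current = next((t for t in tool_calls if t["id"] == tool_id), None)
            if not current:
                current = {"id": tool_id,
                           "type": d.get("type", "function"),
                           "function": {"name": "", "arguments": ""}}
                tool_calls.append(current)
        if current and "function" in d:  # extend the call being built
            f = d["function"]
            if "name" in f:
                current["function"]["name"] = f["name"]
            if "arguments" in f:
                current["function"]["arguments"] += f["arguments"]
    return tool_calls

deltas = [
    {"id": "call_0", "type": "function",
     "function": {"name": "get_weather", "arguments": ""}},
    {"function": {"arguments": '{"city": '}},
    {"function": {"arguments": '"Paris"}'}},
]
print(merge_tool_deltas(deltas))
# -> [{'id': 'call_0', 'type': 'function',
#      'function': {'name': 'get_weather', 'arguments': '{"city": "Paris"}'}}]
```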
@@ -450,7 +470,7 @@ def stream_generate(
                 "tokens_per_second": tokens_per_second,
                 "reasoning_time": timing_stats["reasoning_time"],
                 "reasoning_tokens": timing_stats["reasoning_tokens"]
-            }, None))
+            }, tool_calls if tool_calls else None))
 
     thread = Thread(target=generation_thread, daemon=True)
     thread.start()
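With the final hunk, the closing queue item carries the accumulated tool calls next to the usage stats instead of a hard-coded `None`. A hedged sketch of what a consumer of the `(content, stats, tool_calls)` tuples might look like, using a `None` sentinel consistent with the queue's `Optional[...]` annotation (the items are fabricated; the real consumer lives elsewhere in llm.py):

```python
from queue import Queue

q: Queue = Queue()
# Fabricated items in the shape the generation thread now emits.
q.put(("Hello", {}, None))
q.put(("", {}, [{"id": "call_0", "type": "function",
                 "function": {"name": "get_weather", "arguments": "{}"}}]))
q.put(("", {"stop_reason": "tool_calls"},
       [{"id": "call_0", "type": "function",
         "function": {"name": "get_weather", "arguments": "{}"}}]))
q.put(None)  # sentinel: generation finished

while (item := q.get()) is not None:
    content, stats, tool_calls = item
    if content:
        print("text:", content)
    if tool_calls:
        print("tools so far:", [t["function"]["name"] for t in tool_calls])
    if stats:
        print("final stats:", stats)
```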