inferencesh 0.2.15-py3-none-any.whl → 0.2.17-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of inferencesh might be problematic.
- inferencesh/models/llm.py +192 -63
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.17.dist-info}/METADATA +1 -1
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.17.dist-info}/RECORD +7 -7
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.17.dist-info}/WHEEL +0 -0
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.17.dist-info}/entry_points.txt +0 -0
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.17.dist-info}/top_level.txt +0 -0
inferencesh/models/llm.py
CHANGED
@@ -88,7 +88,7 @@ class LLMInput(BaseAppInput):
     context_size: int = Field(default=4096)

     # Model specific flags
-
+    reasoning: bool = Field(default=False)

 class LLMUsage(BaseAppOutput):
     stop_reason: str = ""
@@ -97,11 +97,13 @@ class LLMUsage(BaseAppOutput):
     prompt_tokens: int = 0
     completion_tokens: int = 0
     total_tokens: int = 0
+    reasoning_tokens: int = 0
+    reasoning_time: float = 0.0


 class LLMOutput(BaseAppOutput):
     response: str
-
+    reasoning: Optional[str] = None
     usage: Optional[LLMUsage] = None

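Taken together, the two hunks above extend the output schema: LLMUsage gains reasoning_tokens and reasoning_time, and LLMOutput gains an optional reasoning string alongside the existing response and usage fields. A purely illustrative sketch of the new shape (field values are invented, and it assumes LLMOutput and LLMUsage are importable from inferencesh.models.llm):

from inferencesh.models.llm import LLMOutput, LLMUsage

# Illustrative values only; remaining LLMUsage fields keep their defaults.
usage = LLMUsage(
    stop_reason="stop",
    time_to_first_token=0.21,
    tokens_per_second=35.0,
    prompt_tokens=42,
    completion_tokens=128,
    total_tokens=170,
    reasoning_tokens=37,   # new in 0.2.17
    reasoning_time=1.84,   # new in 0.2.17: seconds spent inside <think>...</think>
)

output = LLMOutput(
    response="The answer is 4.",
    reasoning="2 + 2 = 4, so the answer is 4.",  # new in 0.2.17
    usage=usage,
)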
@@ -112,11 +114,27 @@ def timing_context():
        def __init__(self):
            self.start_time = time.time()
            self.first_token_time = None
+            self.reasoning_start_time = None
+            self.total_reasoning_time = 0.0
+            self.reasoning_tokens = 0
+            self.in_reasoning = False

        def mark_first_token(self):
            if self.first_token_time is None:
                self.first_token_time = time.time()

+        def start_reasoning(self):
+            if not self.in_reasoning:
+                self.reasoning_start_time = time.time()
+                self.in_reasoning = True
+
+        def end_reasoning(self, token_count: int = 0):
+            if self.in_reasoning and self.reasoning_start_time:
+                self.total_reasoning_time += time.time() - self.reasoning_start_time
+                self.reasoning_tokens += token_count
+                self.reasoning_start_time = None
+                self.in_reasoning = False
+
        @property
        def stats(self):
            end_time = time.time()
@@ -128,7 +146,9 @@ def timing_context():

            return {
                "time_to_first_token": time_to_first,
-                "generation_time": generation_time
+                "generation_time": generation_time,
+                "reasoning_time": self.total_reasoning_time,
+                "reasoning_tokens": self.reasoning_tokens
            }

    timing = TimingInfo()
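The timing additions amount to a small reasoning stopwatch inside the timing_context() helper: start_reasoning() stamps the moment a <think> block opens, end_reasoning() folds the elapsed time and an estimated token count into running totals, and stats now reports both. A minimal standalone sketch of the same pattern (the class name and the sleep are illustrative, not part of the package):

import time

class ReasoningStopwatch:
    """Accumulates wall-clock time spent inside <think>...</think> spans."""
    def __init__(self):
        self.total_reasoning_time = 0.0
        self.reasoning_tokens = 0
        self._start = None

    def start_reasoning(self):
        if self._start is None:        # ignore repeated <think> sightings
            self._start = time.time()

    def end_reasoning(self, token_count: int = 0):
        if self._start is not None:    # only close an open span
            self.total_reasoning_time += time.time() - self._start
            self.reasoning_tokens += token_count
            self._start = None

watch = ReasoningStopwatch()
watch.start_reasoning()
time.sleep(0.05)                       # stands in for streamed reasoning tokens
watch.end_reasoning(token_count=12)
print(f"{watch.total_reasoning_time:.2f}s over {watch.reasoning_tokens} tokens")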
@@ -186,29 +206,178 @@ def build_messages(
     return messages


+class ResponseState:
+    """Holds the state of response transformation."""
+    def __init__(self):
+        self.buffer = ""
+        self.response = ""
+        self.reasoning = None
+        self.function_calls = None  # For future function calling support
+        self.tool_calls = None  # For future tool calling support
+        self.state_changes = {
+            "reasoning_started": False,
+            "reasoning_ended": False,
+            "function_call_started": False,
+            "function_call_ended": False,
+            "tool_call_started": False,
+            "tool_call_ended": False
+        }
+
+class ResponseTransformer:
+    """Base class for transforming model responses."""
+    def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
+        self.state = ResponseState()
+        self.output_cls = output_cls
+        self.timing = None  # Will be set by stream_generate
+
+    def clean_text(self, text: str) -> str:
+        """Clean common tokens from the text and apply model-specific cleaning.
+
+        Args:
+            text: Raw text to clean
+
+        Returns:
+            Cleaned text with common and model-specific tokens removed
+        """
+        # Common token cleaning across most models
+        cleaned = (text.replace("<|im_end|>", "")
+                   .replace("<|im_start|>", "")
+                   .replace("<start_of_turn>", "")
+                   .replace("<end_of_turn>", "")
+                   .replace("<eos>", ""))
+        return self.additional_cleaning(cleaned)
+
+    def additional_cleaning(self, text: str) -> str:
+        """Apply model-specific token cleaning.
+
+        Args:
+            text: Text that has had common tokens removed
+
+        Returns:
+            Text with model-specific tokens removed
+        """
+        return text
+
+    def handle_reasoning(self, text: str) -> None:
+        """Handle reasoning/thinking detection and extraction.
+
+        Args:
+            text: Cleaned text to process for reasoning
+        """
+        # Default implementation for <think> style reasoning
+        if "<think>" in text and not self.state.state_changes["reasoning_started"]:
+            self.state.state_changes["reasoning_started"] = True
+            if self.timing:
+                self.timing.start_reasoning()
+
+        if "</think>" in text and not self.state.state_changes["reasoning_ended"]:
+            self.state.state_changes["reasoning_ended"] = True
+            if self.timing:
+                # Estimate token count from character count (rough approximation)
+                token_count = len(self.state.buffer.split("<think>")[1].split("</think>")[0]) // 4
+                self.timing.end_reasoning(token_count)
+
+        if "<think>" in self.state.buffer:
+            parts = self.state.buffer.split("</think>", 1)
+            if len(parts) > 1:
+                self.state.reasoning = parts[0].split("<think>", 1)[1].strip()
+                self.state.response = parts[1].strip()
+            else:
+                self.state.reasoning = self.state.buffer.split("<think>", 1)[1].strip()
+                self.state.response = ""
+        else:
+            self.state.response = self.state.buffer
+
+    def handle_function_calls(self, text: str) -> None:
+        """Handle function call detection and extraction.
+
+        Args:
+            text: Cleaned text to process for function calls
+        """
+        # Default no-op implementation
+        # Models can override this to implement function call handling
+        pass
+
+    def handle_tool_calls(self, text: str) -> None:
+        """Handle tool call detection and extraction.
+
+        Args:
+            text: Cleaned text to process for tool calls
+        """
+        # Default no-op implementation
+        # Models can override this to implement tool call handling
+        pass
+
+    def transform_chunk(self, chunk: str) -> None:
+        """Transform a single chunk of model output.
+
+        This method orchestrates the transformation process by:
+        1. Cleaning the text
+        2. Updating the buffer
+        3. Processing various capabilities (reasoning, function calls, etc)
+
+        Args:
+            chunk: Raw text chunk from the model
+        """
+        cleaned = self.clean_text(chunk)
+        self.state.buffer += cleaned
+
+        # Process different capabilities
+        self.handle_reasoning(cleaned)
+        self.handle_function_calls(cleaned)
+        self.handle_tool_calls(cleaned)
+
+    def build_output(self) -> tuple[str, LLMOutput, dict]:
+        """Build the final output tuple.
+
+        Returns:
+            Tuple of (buffer, LLMOutput, state_changes)
+        """
+        return (
+            self.state.buffer,
+            self.output_cls(
+                response=self.state.response.strip(),
+                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
+                function_calls=self.state.function_calls,
+                tool_calls=self.state.tool_calls
+            ),
+            self.state.state_changes
+        )
+
+    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+        """Transform a piece of text and return the result.
+
+        Args:
+            piece: New piece of text to transform
+            buffer: Existing buffer content
+
+        Returns:
+            Tuple of (new_buffer, output, state_changes)
+        """
+        self.state.buffer = buffer
+        self.transform_chunk(piece)
+        return self.build_output()
+
+
 def stream_generate(
     model: Any,
     messages: List[Dict[str, Any]],
-
+    transformer: ResponseTransformer,
     temperature: float = 0.7,
     top_p: float = 0.95,
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
-    handle_thinking: bool = False,
-    transform_response: Optional[Callable[[str, str], tuple[str, LLMOutput]]] = None,
 ) -> Generator[LLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking.

     Args:
         model: The LLaMA.cpp model instance
         messages: List of messages to send to the model
-
+        transformer: ResponseTransformer instance to use for processing output
         temperature: Sampling temperature
         top_p: Top-p sampling threshold
         max_tokens: Maximum tokens to generate
         stop: Optional list of stop sequences
-        handle_thinking: Whether to handle thinking tags
-        transform_response: Optional function to transform responses, takes (piece, buffer) and returns (new_buffer, output)
     """
     response_queue: Queue[Optional[tuple[str, dict]]] = Queue()
     thread_exception = None
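In short, this hunk replaces the old handle_thinking/transform_response parameters with a ResponseTransformer object that owns token cleanup (clean_text/additional_cleaning), <think> extraction (handle_reasoning), and placeholder function/tool-call hooks, and that is now a required argument to stream_generate. A hedged sketch of how a caller might adopt the new API (the subclass, the extra token it strips, and the build_messages call site are assumptions for illustration, not part of this diff):

from inferencesh.models.llm import ResponseTransformer, build_messages, stream_generate

class MyModelTransformer(ResponseTransformer):
    """Example subclass: layer model-specific cleanup on the shared defaults."""
    def additional_cleaning(self, text: str) -> str:
        return text.replace("<custom_eot>", "")  # hypothetical extra stop token

def generate(model, user_text: str) -> None:
    messages = build_messages(user_text)          # signature assumed
    transformer = MyModelTransformer()
    final = None
    for final in stream_generate(model, messages, transformer, temperature=0.7):
        pass                                      # each yield carries the full text so far
    if final is not None:
        if final.reasoning:
            print("reasoning:", final.reasoning)
        print("response:", final.response)
        if final.usage:                           # usage is attached to the last yield
            print(f"{final.usage.tokens_per_second:.1f} tok/s, "
                  f"{final.usage.reasoning_tokens} reasoning tokens "
                  f"in {final.usage.reasoning_time:.2f}s")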
@@ -220,6 +389,9 @@ def stream_generate(
     }

     with timing_context() as timing:
+        # Set timing context in transformer
+        transformer.timing = timing
+
        def generation_thread():
            nonlocal thread_exception, usage_stats
            try:
@@ -233,11 +405,9 @@ def stream_generate(
                )

                for chunk in completion:
-                    # Get usage from root level if present
                    if "usage" in chunk and chunk["usage"] is not None:
                        usage_stats.update(chunk["usage"])

-                    # Get content from choices
                    delta = chunk.get("choices", [{}])[0]
                    content = None
                    finish_reason = None
@@ -265,15 +435,15 @@ def stream_generate(
                tokens_per_second = (usage_stats["completion_tokens"] / generation_time) if generation_time > 0 else 0
                response_queue.put((None, {
                    "time_to_first_token": timing_stats["time_to_first_token"],
-                    "tokens_per_second": tokens_per_second
+                    "tokens_per_second": tokens_per_second,
+                    "reasoning_time": timing_stats["reasoning_time"],
+                    "reasoning_tokens": timing_stats["reasoning_tokens"]
                }))

        thread = Thread(target=generation_thread, daemon=True)
        thread.start()

        buffer = ""
-        thinking_content = "" if handle_thinking else None
-        in_thinking = handle_thinking
        try:
            while True:
                try:
@@ -290,59 +460,18 @@ def stream_generate(
                            tokens_per_second=timing_stats["tokens_per_second"],
                            prompt_tokens=usage_stats["prompt_tokens"],
                            completion_tokens=usage_stats["completion_tokens"],
-                            total_tokens=usage_stats["total_tokens"]
+                            total_tokens=usage_stats["total_tokens"],
+                            reasoning_time=timing_stats["reasoning_time"],
+                            reasoning_tokens=timing_stats["reasoning_tokens"]
                        )

-
-
-                        output.usage = usage
-                        yield output
-                    else:
-                        # Handle thinking vs response content if enabled
-                        if handle_thinking and "</think>" in piece:
-                            parts = piece.split("</think>")
-                            if in_thinking:
-                                thinking_content += parts[0].replace("<think>", "")
-                                buffer = parts[1] if len(parts) > 1 else ""
-                                in_thinking = False
-                            else:
-                                buffer += piece
-                        else:
-                            if in_thinking:
-                                thinking_content += piece.replace("<think>", "")
-                            else:
-                                buffer += piece
-
-                        yield output_cls(
-                            response=buffer.strip(),
-                            thinking_content=thinking_content.strip() if thinking_content else None,
-                            usage=usage
-                        )
-                        break
-
-                    if transform_response:
-                        buffer, output = transform_response(piece, buffer)
+                        buffer, output, _ = transformer(piece or "", buffer)
+                        output.usage = usage
                        yield output
-
-                    # Handle thinking vs response content if enabled
-                    if handle_thinking and "</think>" in piece:
-                        parts = piece.split("</think>")
-                        if in_thinking:
-                            thinking_content += parts[0].replace("<think>", "")
-                            buffer = parts[1] if len(parts) > 1 else ""
-                            in_thinking = False
-                        else:
-                            buffer += piece
-                    else:
-                        if in_thinking:
-                            thinking_content += piece.replace("<think>", "")
-                        else:
-                            buffer += piece
+                        break

-
-
-                        thinking_content=thinking_content.strip() if thinking_content else None
-                    )
+                    buffer, output, _ = transformer(piece, buffer)
+                    yield output

            except Exception as e:
                if thread_exception and isinstance(e, thread_exception.__class__):
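The consumer loop now treats the transformer as a callable with the protocol (piece, buffer) -> (new_buffer, output, state_changes): interim pieces are folded into the buffer and yielded immediately, while the final piece additionally gets the usage object attached before the loop breaks. The default handle_reasoning behaviour behind those yields is simply a buffer split on the think tags; a plain-Python sketch of that splitting (no package imports, names illustrative):

def split_reasoning(buffer: str):
    """Mirror of the default <think> handling: returns (reasoning, response)."""
    if "<think>" not in buffer:
        return None, buffer
    parts = buffer.split("</think>", 1)
    if len(parts) > 1:                       # reasoning block is closed
        reasoning = parts[0].split("<think>", 1)[1].strip()
        return reasoning, parts[1].strip()
    # still inside the reasoning block
    return buffer.split("<think>", 1)[1].strip(), ""

buffer = ""
for piece in ["<think>add the ", "numbers</think>", "The sum is 7."]:
    buffer += piece
    reasoning, response = split_reasoning(buffer)
    print(repr(reasoning), repr(response))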
{inferencesh-0.2.15.dist-info → inferencesh-0.2.17.dist-info}/RECORD
CHANGED
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=
+inferencesh/models/llm.py,sha256=ndRFWbnAD1-BQFTTEb2YShGP-gjVd7w80ItT4XFV1_U,18983
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
+inferencesh-0.2.17.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.17.dist-info/METADATA,sha256=gOHSF2qhb_XT_tXlHqp5HAVblDDZzOjM9kGcbp99JQ4,2757
+inferencesh-0.2.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.17.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.17.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.17.dist-info/RECORD,,
Files without changes: WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt.