inferencesh 0.2.15__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of inferencesh might be problematic.
- inferencesh/models/llm.py +181 -63
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.16.dist-info}/METADATA +1 -1
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.16.dist-info}/RECORD +7 -7
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.16.dist-info}/WHEEL +0 -0
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.16.dist-info}/entry_points.txt +0 -0
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.16.dist-info}/licenses/LICENSE +0 -0
- {inferencesh-0.2.15.dist-info → inferencesh-0.2.16.dist-info}/top_level.txt +0 -0
inferencesh/models/llm.py
CHANGED
@@ -88,7 +88,7 @@ class LLMInput(BaseAppInput):
     context_size: int = Field(default=4096)
 
     # Model specific flags
-
+    reasoning: bool = Field(default=False)
 
 class LLMUsage(BaseAppOutput):
     stop_reason: str = ""
@@ -97,11 +97,13 @@ class LLMUsage(BaseAppOutput):
     prompt_tokens: int = 0
     completion_tokens: int = 0
     total_tokens: int = 0
+    reasoning_tokens: int = 0
+    reasoning_time: float = 0.0
 
 
 class LLMOutput(BaseAppOutput):
     response: str
-
+    reasoning: Optional[str] = None
     usage: Optional[LLMUsage] = None
 
 
@@ -112,11 +114,27 @@ def timing_context():
         def __init__(self):
             self.start_time = time.time()
             self.first_token_time = None
+            self.reasoning_start_time = None
+            self.total_reasoning_time = 0.0
+            self.reasoning_tokens = 0
+            self.in_reasoning = False
 
         def mark_first_token(self):
             if self.first_token_time is None:
                 self.first_token_time = time.time()
 
+        def start_reasoning(self):
+            if not self.in_reasoning:
+                self.reasoning_start_time = time.time()
+                self.in_reasoning = True
+
+        def end_reasoning(self, token_count: int = 0):
+            if self.in_reasoning and self.reasoning_start_time:
+                self.total_reasoning_time += time.time() - self.reasoning_start_time
+                self.reasoning_tokens += token_count
+                self.reasoning_start_time = None
+                self.in_reasoning = False
+
         @property
         def stats(self):
             end_time = time.time()
@@ -128,7 +146,9 @@ def timing_context():
 
             return {
                 "time_to_first_token": time_to_first,
-                "generation_time": generation_time
+                "generation_time": generation_time,
+                "reasoning_time": self.total_reasoning_time,
+                "reasoning_tokens": self.reasoning_tokens
             }
 
     timing = TimingInfo()
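For orientation, here is a minimal sketch of how the new reasoning timers are meant to be driven from a streaming loop (not part of the diff). It assumes `timing` is the TimingInfo instance created inside timing_context(); the streamed pieces and the token counting below are placeholders invented for the example.

    # Illustrative sketch: `timing` stands in for the TimingInfo object from timing_context();
    # the chunk list and token counts are fake placeholders.
    reasoning_tokens_seen = 0
    for piece in ["<think>", "plan the answer", "</think>", "final answer"]:
        timing.mark_first_token()                        # records time_to_first_token only once
        if "<think>" in piece:
            timing.start_reasoning()                     # opens the reasoning timer
        if timing.in_reasoning:
            reasoning_tokens_seen += 1                   # stand-in for a real token count
        if "</think>" in piece:
            timing.end_reasoning(reasoning_tokens_seen)  # folds elapsed time and tokens into stats
    print(timing.stats)  # now also reports "reasoning_time" and "reasoning_tokens"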
@@ -186,29 +206,170 @@ def build_messages(
     return messages
 
 
+class ResponseState:
+    """Holds the state of response transformation."""
+    def __init__(self):
+        self.buffer = ""
+        self.response = ""
+        self.reasoning = None
+        self.function_calls = None  # For future function calling support
+        self.tool_calls = None  # For future tool calling support
+        self.state_changes = {
+            "reasoning_started": False,
+            "reasoning_ended": False,
+            "function_call_started": False,
+            "function_call_ended": False,
+            "tool_call_started": False,
+            "tool_call_ended": False
+        }
+
+class ResponseTransformer:
+    """Base class for transforming model responses."""
+    def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
+        self.state = ResponseState()
+        self.output_cls = output_cls
+
+    def clean_text(self, text: str) -> str:
+        """Clean common tokens from the text and apply model-specific cleaning.
+
+        Args:
+            text: Raw text to clean
+
+        Returns:
+            Cleaned text with common and model-specific tokens removed
+        """
+        # Common token cleaning across most models
+        cleaned = (text.replace("<|im_end|>", "")
+                   .replace("<|im_start|>", "")
+                   .replace("<start_of_turn>", "")
+                   .replace("<end_of_turn>", "")
+                   .replace("<eos>", ""))
+        return self.additional_cleaning(cleaned)
+
+    def additional_cleaning(self, text: str) -> str:
+        """Apply model-specific token cleaning.
+
+        Args:
+            text: Text that has had common tokens removed
+
+        Returns:
+            Text with model-specific tokens removed
+        """
+        return text
+
+    def handle_reasoning(self, text: str) -> None:
+        """Handle reasoning/thinking detection and extraction.
+
+        Args:
+            text: Cleaned text to process for reasoning
+        """
+        # Default implementation for <think> style reasoning
+        if "<think>" in text:
+            self.state.state_changes["reasoning_started"] = True
+        if "</think>" in text:
+            self.state.state_changes["reasoning_ended"] = True
+
+        if "<think>" in self.state.buffer:
+            parts = self.state.buffer.split("</think>", 1)
+            if len(parts) > 1:
+                self.state.reasoning = parts[0].split("<think>", 1)[1].strip()
+                self.state.response = parts[1].strip()
+            else:
+                self.state.reasoning = self.state.buffer.split("<think>", 1)[1].strip()
+                self.state.response = ""
+        else:
+            self.state.response = self.state.buffer
+
+    def handle_function_calls(self, text: str) -> None:
+        """Handle function call detection and extraction.
+
+        Args:
+            text: Cleaned text to process for function calls
+        """
+        # Default no-op implementation
+        # Models can override this to implement function call handling
+        pass
+
+    def handle_tool_calls(self, text: str) -> None:
+        """Handle tool call detection and extraction.
+
+        Args:
+            text: Cleaned text to process for tool calls
+        """
+        # Default no-op implementation
+        # Models can override this to implement tool call handling
+        pass
+
+    def transform_chunk(self, chunk: str) -> None:
+        """Transform a single chunk of model output.
+
+        This method orchestrates the transformation process by:
+        1. Cleaning the text
+        2. Updating the buffer
+        3. Processing various capabilities (reasoning, function calls, etc)
+
+        Args:
+            chunk: Raw text chunk from the model
+        """
+        cleaned = self.clean_text(chunk)
+        self.state.buffer += cleaned
+
+        # Process different capabilities
+        self.handle_reasoning(cleaned)
+        self.handle_function_calls(cleaned)
+        self.handle_tool_calls(cleaned)
+
+    def build_output(self) -> tuple[str, LLMOutput, dict]:
+        """Build the final output tuple.
+
+        Returns:
+            Tuple of (buffer, LLMOutput, state_changes)
+        """
+        return (
+            self.state.buffer,
+            self.output_cls(
+                response=self.state.response.strip(),
+                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
+                function_calls=self.state.function_calls,
+                tool_calls=self.state.tool_calls
+            ),
+            self.state.state_changes
+        )
+
+    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+        """Transform a piece of text and return the result.
+
+        Args:
+            piece: New piece of text to transform
+            buffer: Existing buffer content
+
+        Returns:
+            Tuple of (new_buffer, output, state_changes)
+        """
+        self.state.buffer = buffer
+        self.transform_chunk(piece)
+        return self.build_output()
+
+
 def stream_generate(
     model: Any,
     messages: List[Dict[str, Any]],
-
+    transformer: ResponseTransformer,
     temperature: float = 0.7,
     top_p: float = 0.95,
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
-    handle_thinking: bool = False,
-    transform_response: Optional[Callable[[str, str], tuple[str, LLMOutput]]] = None,
 ) -> Generator[LLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking.
 
     Args:
         model: The LLaMA.cpp model instance
         messages: List of messages to send to the model
-
+        transformer: ResponseTransformer instance to use for processing output
         temperature: Sampling temperature
         top_p: Top-p sampling threshold
         max_tokens: Maximum tokens to generate
         stop: Optional list of stop sequences
-        handle_thinking: Whether to handle thinking tags
-        transform_response: Optional function to transform responses, takes (piece, buffer) and returns (new_buffer, output)
     """
     response_queue: Queue[Optional[tuple[str, dict]]] = Queue()
     thread_exception = None
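The ResponseTransformer added above is the new extension point that replaces the removed handle_thinking/transform_response parameters of stream_generate. As a hedged sketch of how a model-specific subclass might look (the subclass name, the extra sentinel token, and the <reasoning> delimiters are invented for illustration; only ResponseTransformer, additional_cleaning, handle_reasoning, and __call__ come from the diff itself):

    from inferencesh.models.llm import ResponseTransformer

    class ExampleReasoningTransformer(ResponseTransformer):  # hypothetical subclass
        def additional_cleaning(self, text: str) -> str:
            # strip one extra, model-specific sentinel on top of the common cleaning
            return text.replace("<|assistant|>", "")

        def handle_reasoning(self, text: str) -> None:
            # normalize invented <reasoning>...</reasoning> delimiters to the
            # <think>/</think> tags the base implementation already parses
            self.state.buffer = (self.state.buffer
                                 .replace("<reasoning>", "<think>")
                                 .replace("</reasoning>", "</think>"))
            text = text.replace("<reasoning>", "<think>").replace("</reasoning>", "</think>")
            super().handle_reasoning(text)

Per the __call__ signature above, each streamed piece is fed in as transformer(piece, buffer), returning (new_buffer, LLMOutput, state_changes).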
@@ -233,11 +394,9 @@ def stream_generate(
         )
 
         for chunk in completion:
-            # Get usage from root level if present
             if "usage" in chunk and chunk["usage"] is not None:
                 usage_stats.update(chunk["usage"])
 
-            # Get content from choices
             delta = chunk.get("choices", [{}])[0]
             content = None
             finish_reason = None
@@ -265,15 +424,15 @@ def stream_generate(
         tokens_per_second = (usage_stats["completion_tokens"] / generation_time) if generation_time > 0 else 0
         response_queue.put((None, {
             "time_to_first_token": timing_stats["time_to_first_token"],
-            "tokens_per_second": tokens_per_second
+            "tokens_per_second": tokens_per_second,
+            "reasoning_time": timing_stats["reasoning_time"],
+            "reasoning_tokens": timing_stats["reasoning_tokens"]
         }))
 
     thread = Thread(target=generation_thread, daemon=True)
     thread.start()
 
     buffer = ""
-    thinking_content = "" if handle_thinking else None
-    in_thinking = handle_thinking
     try:
         while True:
             try:
@@ -290,59 +449,18 @@ def stream_generate(
                         tokens_per_second=timing_stats["tokens_per_second"],
                         prompt_tokens=usage_stats["prompt_tokens"],
                         completion_tokens=usage_stats["completion_tokens"],
-                        total_tokens=usage_stats["total_tokens"]
+                        total_tokens=usage_stats["total_tokens"],
+                        reasoning_time=timing_stats["reasoning_time"],
+                        reasoning_tokens=timing_stats["reasoning_tokens"]
                     )
 
-
-
-                    output.usage = usage
-                    yield output
-                else:
-                    # Handle thinking vs response content if enabled
-                    if handle_thinking and "</think>" in piece:
-                        parts = piece.split("</think>")
-                        if in_thinking:
-                            thinking_content += parts[0].replace("<think>", "")
-                            buffer = parts[1] if len(parts) > 1 else ""
-                            in_thinking = False
-                        else:
-                            buffer += piece
-                    else:
-                        if in_thinking:
-                            thinking_content += piece.replace("<think>", "")
-                        else:
-                            buffer += piece
-
-                    yield output_cls(
-                        response=buffer.strip(),
-                        thinking_content=thinking_content.strip() if thinking_content else None,
-                        usage=usage
-                    )
-                    break
-
-                if transform_response:
-                    buffer, output = transform_response(piece, buffer)
+                    buffer, output, _ = transformer(piece or "", buffer)
+                    output.usage = usage
                     yield output
-
-                # Handle thinking vs response content if enabled
-                if handle_thinking and "</think>" in piece:
-                    parts = piece.split("</think>")
-                    if in_thinking:
-                        thinking_content += parts[0].replace("<think>", "")
-                        buffer = parts[1] if len(parts) > 1 else ""
-                        in_thinking = False
-                    else:
-                        buffer += piece
-                else:
-                    if in_thinking:
-                        thinking_content += piece.replace("<think>", "")
-                    else:
-                        buffer += piece
+                    break
 
-
-
-                        thinking_content=thinking_content.strip() if thinking_content else None
-                    )
+                buffer, output, _ = transformer(piece, buffer)
+                yield output
 
             except Exception as e:
                 if thread_exception and isinstance(e, thread_exception.__class__):
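Taken together, the new calling convention for stream_generate looks roughly like the sketch below (not part of the diff). The model and message setup are placeholders; the transformer argument, the per-chunk LLMOutput fields, and the reasoning_* usage fields on the final chunk are the parts taken from this release.

    from inferencesh.models.llm import ResponseTransformer, stream_generate

    model = ...   # placeholder: a llama.cpp model instance loaded elsewhere
    messages = [  # placeholder: typically built via build_messages(...)
        {"role": "user", "content": "Think it through, then answer."},
    ]

    transformer = ResponseTransformer()  # or a model-specific subclass as sketched earlier
    for output in stream_generate(model, messages, transformer, max_tokens=512):
        print(output.response, end="", flush=True)   # cleaned response text so far
        # output.reasoning holds any <think>...</think> content split out by the transformer
        if output.usage:                             # populated on the final chunk
            print("\nreasoning tokens:", output.usage.reasoning_tokens,
                  "reasoning time:", output.usage.reasoning_time)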
{inferencesh-0.2.15.dist-info → inferencesh-0.2.16.dist-info}/RECORD
CHANGED
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=
+inferencesh/models/llm.py,sha256=8wZATOPTYNgtzl34vKeQaCXDbpyTMCzA4jOGrxbo5L4,18380
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
+inferencesh-0.2.16.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.16.dist-info/METADATA,sha256=wCILA3L4dmdzeKDEc3oNKH1etYwwYZjsCC7bfzepCa0,2757
+inferencesh-0.2.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.16.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.16.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.16.dist-info/RECORD,,