inferencesh 0.2.23-py3-none-any.whl → 0.2.25-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of inferencesh might be problematic.
- inferencesh/models/llm.py +55 -29
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.25.dist-info}/METADATA +1 -1
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.25.dist-info}/RECORD +7 -7
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.25.dist-info}/WHEEL +0 -0
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.25.dist-info}/entry_points.txt +0 -0
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.25.dist-info}/licenses/LICENSE +0 -0
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.25.dist-info}/top_level.txt +0 -0
inferencesh/models/llm.py
CHANGED
@@ -10,7 +10,6 @@ import base64
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
 
-
 class ContextMessageRole(str, Enum):
     USER = "user"
     ASSISTANT = "assistant"
@@ -116,7 +115,7 @@ def timing_context():
 class TimingInfo:
     def __init__(self):
         self.start_time = time.time()
-        self.first_token_time =
+        self.first_token_time = None
         self.reasoning_start_time = None
         self.total_reasoning_time = 0.0
         self.reasoning_tokens = 0
@@ -140,12 +139,17 @@ def timing_context():
 
     @property
     def stats(self):
-
+        current_time = time.time()
         if self.first_token_time is None:
-
+            return {
+                "time_to_first_token": 0.0,
+                "generation_time": 0.0,
+                "reasoning_time": self.total_reasoning_time,
+                "reasoning_tokens": self.reasoning_tokens
+            }
 
         time_to_first = self.first_token_time - self.start_time
-        generation_time =
+        generation_time = current_time - self.first_token_time
 
         return {
             "time_to_first_token": time_to_first,
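
The rewritten stats property guards the pre-first-token case: it returns zeroed timings instead of doing arithmetic on a None first_token_time. A minimal standalone sketch of that behavior (TimingSketch is an illustrative stand-in, not the packaged TimingInfo class):

    import time

    class TimingSketch:
        """Illustrative stand-in for TimingInfo's new stats logic."""
        def __init__(self):
            self.start_time = time.time()
            self.first_token_time = None  # 0.2.25 starts this at None
            self.total_reasoning_time = 0.0
            self.reasoning_tokens = 0

        @property
        def stats(self):
            current_time = time.time()
            if self.first_token_time is None:
                # No token yet: report zeros instead of subtracting from None
                return {"time_to_first_token": 0.0, "generation_time": 0.0,
                        "reasoning_time": self.total_reasoning_time,
                        "reasoning_tokens": self.reasoning_tokens}
            return {"time_to_first_token": self.first_token_time - self.start_time,
                    "generation_time": current_time - self.first_token_time,
                    "reasoning_time": self.total_reasoning_time,
                    "reasoning_tokens": self.reasoning_tokens}

    timer = TimingSketch()
    print(timer.stats["generation_time"])   # 0.0 before the first token
    timer.first_token_time = time.time()    # simulate the first token arriving
    print(timer.stats["time_to_first_token"] >= 0.0)  # True
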
@@ -216,7 +220,7 @@ class StreamResponse:
         self.tool_calls = None  # Changed from [] to None
         self.finish_reason = None
         self.timing_stats = {
-            "time_to_first_token": 0.0
+            "time_to_first_token": None,  # Changed from 0.0 to None
             "generation_time": 0.0,
             "reasoning_time": 0.0,
             "reasoning_tokens": 0,
@@ -232,8 +236,15 @@ class StreamResponse:
     def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
         """Update response state from a chunk."""
         # Update usage stats if present
-        if "usage" in chunk
-
+        if "usage" in chunk:
+            usage = chunk["usage"]
+            if usage is not None:
+                # Update usage stats preserving existing values if not provided
+                self.usage_stats.update({
+                    "prompt_tokens": usage.get("prompt_tokens", self.usage_stats["prompt_tokens"]),
+                    "completion_tokens": usage.get("completion_tokens", self.usage_stats["completion_tokens"]),
+                    "total_tokens": usage.get("total_tokens", self.usage_stats["total_tokens"])
+                })
 
         # Get the delta from the chunk
         delta = chunk.get("choices", [{}])[0]
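
Streaming APIs commonly send "usage": null on every chunk except the last, so checking `"usage" in chunk` alone is not enough; the added `is not None` guard plus `.get(..., current_value)` fallbacks keep earlier counts when a later chunk omits a field. A small hedged example of the pattern (the chunk payloads below are made up):

    usage_stats = {"prompt_tokens": 12, "completion_tokens": 4, "total_tokens": 16}

    for chunk in [{"usage": None},                       # mid-stream chunk: no counts
                  {"usage": {"completion_tokens": 9}}]:  # final chunk: partial counts
        usage = chunk.get("usage")
        if usage is not None:
            usage_stats.update({
                "prompt_tokens": usage.get("prompt_tokens", usage_stats["prompt_tokens"]),
                "completion_tokens": usage.get("completion_tokens", usage_stats["completion_tokens"]),
                "total_tokens": usage.get("total_tokens", usage_stats["total_tokens"]),
            })

    print(usage_stats)  # {'prompt_tokens': 12, 'completion_tokens': 9, 'total_tokens': 16}
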
@@ -245,23 +256,34 @@ class StreamResponse:
             if message.get("tool_calls"):
                 self._update_tool_calls(message["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
         elif "delta" in delta:
             delta_content = delta["delta"]
             self.content = delta_content.get("content", "")
             if delta_content.get("tool_calls"):
                 self._update_tool_calls(delta_content["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
 
-        # Update timing stats
+        # Update timing stats
         timing_stats = timing.stats
-
-
-        tokens_per_second = (completion_tokens / generation_time) if generation_time > 0 and completion_tokens > 0 else 0.0
+        if self.timing_stats["time_to_first_token"] is None:
+            self.timing_stats["time_to_first_token"] = timing_stats["time_to_first_token"]
 
         self.timing_stats.update({
-
-            "
+            "generation_time": timing_stats["generation_time"],
+            "reasoning_time": timing_stats["reasoning_time"],
+            "reasoning_tokens": timing_stats["reasoning_tokens"]
        })
+
+        # Calculate tokens per second only if we have valid completion tokens and generation time
+        if self.usage_stats["completion_tokens"] > 0 and timing_stats["generation_time"] > 0:
+            self.timing_stats["tokens_per_second"] = (
+                self.usage_stats["completion_tokens"] / timing_stats["generation_time"]
+            )
+
 
     def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
         """Update tool calls, handling both full and partial updates."""
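
Tokens per second is now derived from the accumulated completion_tokens count rather than per-chunk locals, and only when both operands are positive, which avoids a ZeroDivisionError before the first token lands. A worked example of the guarded calculation (the numbers are illustrative, not from the package):

    completion_tokens = 48   # illustrative count
    generation_time = 1.6    # seconds since the first token

    if completion_tokens > 0 and generation_time > 0:
        tokens_per_second = completion_tokens / generation_time
    else:
        tokens_per_second = 0.0  # nothing generated yet

    print(tokens_per_second)  # 30.0
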
@@ -292,29 +314,33 @@ class StreamResponse:
                 current_tool["function"]["arguments"] += func_delta["arguments"]
 
     def has_updates(self) -> bool:
-        """Check if this response has any content
-
+        """Check if this response has any content, tool call, or usage updates."""
+        has_content = bool(self.content)
+        has_tool_calls = bool(self.tool_calls)
+        has_usage = self.usage_stats["prompt_tokens"] > 0 or self.usage_stats["completion_tokens"] > 0
+        has_finish = bool(self.finish_reason)
+
+        return has_content or has_tool_calls or has_usage or has_finish
 
     def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
-        """Convert current state to LLMOutput."""
+        """Convert current state to LLMOutput."""
         buffer, output, _ = transformer(self.content, buffer)
 
         # Add tool calls if present
         if self.tool_calls:
             output.tool_calls = self.tool_calls
 
-        # Add usage stats
-
-
-
-
-
-
-
-
-
-
-        )
+        # Add usage stats
+        output.usage = LLMUsage(
+            stop_reason=self.usage_stats["stop_reason"],
+            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+            tokens_per_second=self.timing_stats["tokens_per_second"],
+            prompt_tokens=self.usage_stats["prompt_tokens"],
+            completion_tokens=self.usage_stats["completion_tokens"],
+            total_tokens=self.usage_stats["total_tokens"],
+            reasoning_time=self.timing_stats["reasoning_time"],
+            reasoning_tokens=self.timing_stats["reasoning_tokens"]
+        )
 
         return output, buffer
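
The fleshed-out has_updates() treats usage counts and a finish reason as updates in their own right, so a final chunk that carries only usage still reaches the consumer. A minimal sketch of the predicate on bare values (a standalone illustration, not the method itself):

    content = ""            # no text in this chunk
    tool_calls = None       # no tool calls either
    usage_stats = {"prompt_tokens": 12, "completion_tokens": 0}
    finish_reason = None

    has_updates = (bool(content)
                   or bool(tool_calls)
                   or usage_stats["prompt_tokens"] > 0
                   or usage_stats["completion_tokens"] > 0
                   or bool(finish_reason))
    print(has_updates)  # True: usage alone now counts as an update
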
{inferencesh-0.2.23.dist-info → inferencesh-0.2.25.dist-info}/RECORD
CHANGED
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=
+inferencesh/models/llm.py,sha256=9d4JOlieJ-2bvwZQEAA69Qkmqk045gwibyzMcgkreFE,22362
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
+inferencesh-0.2.25.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.25.dist-info/METADATA,sha256=6PsBcetzAtbzXTGHBHVNkzoNVkRB9W49oaXJ0WYhBzY,2757
+inferencesh-0.2.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.25.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.25.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.25.dist-info/RECORD,,