inferencesh 0.2.23__py3-none-any.whl → 0.2.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of inferencesh might be problematic.
- inferencesh/models/llm.py +68 -28
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.24.dist-info}/METADATA +1 -1
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.24.dist-info}/RECORD +7 -7
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.24.dist-info}/WHEEL +0 -0
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.24.dist-info}/entry_points.txt +0 -0
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.24.dist-info}/licenses/LICENSE +0 -0
- {inferencesh-0.2.23.dist-info → inferencesh-0.2.24.dist-info}/top_level.txt +0 -0
inferencesh/models/llm.py
CHANGED
@@ -10,7 +10,6 @@ import base64
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
 
-
 class ContextMessageRole(str, Enum):
     USER = "user"
     ASSISTANT = "assistant"
@@ -116,7 +115,7 @@ def timing_context():
 class TimingInfo:
     def __init__(self):
         self.start_time = time.time()
-        self.first_token_time =
+        self.first_token_time = None
         self.reasoning_start_time = None
         self.total_reasoning_time = 0.0
         self.reasoning_tokens = 0
@@ -140,12 +139,17 @@ def timing_context():
 
     @property
     def stats(self):
-
+        current_time = time.time()
         if self.first_token_time is None:
-
+            return {
+                "time_to_first_token": 0.0,
+                "generation_time": 0.0,
+                "reasoning_time": self.total_reasoning_time,
+                "reasoning_tokens": self.reasoning_tokens
+            }
 
         time_to_first = self.first_token_time - self.start_time
-        generation_time =
+        generation_time = current_time - self.first_token_time
 
         return {
             "time_to_first_token": time_to_first,
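For readers tracking the TimingInfo change above: the stats property now returns zeroed timings until a first token has been recorded, instead of computing time deltas from an unset first_token_time. Below is a minimal, self-contained sketch of that behaviour; the class is trimmed to the fields visible in this hunk, and the usage at the end is illustrative only.

import time

class TimingInfo:
    """Reduced mock mirroring only the fields used by the diffed stats property."""
    def __init__(self):
        self.start_time = time.time()
        self.first_token_time = None
        self.total_reasoning_time = 0.0
        self.reasoning_tokens = 0

    @property
    def stats(self):
        current_time = time.time()
        if self.first_token_time is None:
            # No token seen yet: report zeroed timings rather than computing
            # deltas against an unset first_token_time.
            return {
                "time_to_first_token": 0.0,
                "generation_time": 0.0,
                "reasoning_time": self.total_reasoning_time,
                "reasoning_tokens": self.reasoning_tokens,
            }
        return {
            "time_to_first_token": self.first_token_time - self.start_time,
            "generation_time": current_time - self.first_token_time,
            "reasoning_time": self.total_reasoning_time,
            "reasoning_tokens": self.reasoning_tokens,
        }

timing = TimingInfo()
print(timing.stats)                    # zeroed: no token has arrived yet
timing.first_token_time = time.time()  # normally recorded when the first chunk lands
print(timing.stats)                    # real time_to_first_token / generation_time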
@@ -216,7 +220,7 @@ class StreamResponse:
         self.tool_calls = None # Changed from [] to None
         self.finish_reason = None
         self.timing_stats = {
-            "time_to_first_token": 0.0
+            "time_to_first_token": None, # Changed from 0.0 to None
             "generation_time": 0.0,
             "reasoning_time": 0.0,
             "reasoning_tokens": 0,
@@ -231,9 +235,22 @@ class StreamResponse:
 
     def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
         """Update response state from a chunk."""
+        print("DEBUG: Entering update_from_chunk")
+        print(f"DEBUG: Current usage stats: {self.usage_stats}")
+        print(f"DEBUG: Chunk: {chunk}")
+
         # Update usage stats if present
-        if "usage" in chunk
-
+        if "usage" in chunk:
+            usage = chunk["usage"]
+            if usage is not None:
+                print(f"DEBUG: Updating usage stats with: {usage}")
+                # Update usage stats preserving existing values if not provided
+                self.usage_stats.update({
+                    "prompt_tokens": usage.get("prompt_tokens", self.usage_stats["prompt_tokens"]),
+                    "completion_tokens": usage.get("completion_tokens", self.usage_stats["completion_tokens"]),
+                    "total_tokens": usage.get("total_tokens", self.usage_stats["total_tokens"])
+                })
+                print(f"DEBUG: Updated usage stats: {self.usage_stats}")
 
         # Get the delta from the chunk
         delta = chunk.get("choices", [{}])[0]
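The new usage-stats block tolerates streaming chunks whose "usage" key is present but null (as OpenAI-compatible servers typically send until the final chunk) and merges partial counts without discarding earlier values. Here is a small sketch of that merge, written as a free function for illustration; merge_usage is not part of the package.

from typing import Any, Dict, Optional

def merge_usage(usage_stats: Dict[str, int], usage: Optional[Dict[str, Any]]) -> Dict[str, int]:
    """Mirror of the diffed merge: keep existing counters when a chunk omits a field."""
    if usage is not None:
        usage_stats.update({
            "prompt_tokens": usage.get("prompt_tokens", usage_stats["prompt_tokens"]),
            "completion_tokens": usage.get("completion_tokens", usage_stats["completion_tokens"]),
            "total_tokens": usage.get("total_tokens", usage_stats["total_tokens"]),
        })
    return usage_stats

stats = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
merge_usage(stats, None)                              # mid-stream chunk: usage is null, nothing changes
merge_usage(stats, {"prompt_tokens": 12})             # partial usage: other counters preserved
merge_usage(stats, {"completion_tokens": 40, "total_tokens": 52})
print(stats)  # {'prompt_tokens': 12, 'completion_tokens': 40, 'total_tokens': 52}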
@@ -245,23 +262,35 @@ class StreamResponse:
             if message.get("tool_calls"):
                 self._update_tool_calls(message["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
         elif "delta" in delta:
             delta_content = delta["delta"]
             self.content = delta_content.get("content", "")
             if delta_content.get("tool_calls"):
                 self._update_tool_calls(delta_content["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
 
-        # Update timing stats
+        # Update timing stats
         timing_stats = timing.stats
-
-
-        tokens_per_second = (completion_tokens / generation_time) if generation_time > 0 and completion_tokens > 0 else 0.0
+        if self.timing_stats["time_to_first_token"] is None:
+            self.timing_stats["time_to_first_token"] = timing_stats["time_to_first_token"]
 
         self.timing_stats.update({
-
-            "
+            "generation_time": timing_stats["generation_time"],
+            "reasoning_time": timing_stats["reasoning_time"],
+            "reasoning_tokens": timing_stats["reasoning_tokens"]
         })
+
+        # Calculate tokens per second only if we have valid completion tokens and generation time
+        if self.usage_stats["completion_tokens"] > 0 and timing_stats["generation_time"] > 0:
+            self.timing_stats["tokens_per_second"] = (
+                self.usage_stats["completion_tokens"] / timing_stats["generation_time"]
+            )
+
+        print(f"DEBUG: Final usage stats in update_from_chunk: {self.usage_stats}")
 
     def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
         """Update tool calls, handling both full and partial updates."""
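Two behavioural points in this hunk: time_to_first_token is latched only the first time it is still None, and tokens_per_second is recomputed only when both completion_tokens and generation_time are positive (otherwise it is left at its previous value). A short sketch of the throughput guard; throughput is a hypothetical helper, not a package API.

def throughput(completion_tokens: int, generation_time: float) -> float:
    """Tokens per second, computed only when both figures are positive."""
    if completion_tokens > 0 and generation_time > 0:
        return completion_tokens / generation_time
    return 0.0  # the real code simply skips the update in this case

print(throughput(0, 1.5))   # 0.0 -- no completion tokens reported yet
print(throughput(48, 1.5))  # 32.0 tokens/s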
@@ -292,29 +321,40 @@ class StreamResponse:
                     current_tool["function"]["arguments"] += func_delta["arguments"]
 
     def has_updates(self) -> bool:
-        """Check if this response has any content
-
+        """Check if this response has any content, tool call, or usage updates."""
+        has_content = bool(self.content)
+        has_tool_calls = bool(self.tool_calls)
+        has_usage = self.usage_stats["prompt_tokens"] > 0 or self.usage_stats["completion_tokens"] > 0
+        has_finish = bool(self.finish_reason)
+
+        print(f"DEBUG: has_updates check - content: {has_content}, tool_calls: {has_tool_calls}, usage: {has_usage}, finish: {has_finish}")
+
+        return has_content or has_tool_calls or has_usage or has_finish
 
     def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
         """Convert current state to LLMOutput."""
+        print("DEBUG: Entering to_output")
+        print(f"DEBUG: Usage stats before conversion: {self.usage_stats}")
+
         buffer, output, _ = transformer(self.content, buffer)
 
         # Add tool calls if present
         if self.tool_calls:
             output.tool_calls = self.tool_calls
 
-        # Add usage stats
-
-
-
-
-
-
-
-
-
-
-
+        # Add usage stats
+        print(f"DEBUG: Creating LLMUsage with stats: {self.usage_stats}")
+        output.usage = LLMUsage(
+            stop_reason=self.usage_stats["stop_reason"],
+            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+            tokens_per_second=self.timing_stats["tokens_per_second"],
+            prompt_tokens=self.usage_stats["prompt_tokens"],
+            completion_tokens=self.usage_stats["completion_tokens"],
+            total_tokens=self.usage_stats["total_tokens"],
+            reasoning_time=self.timing_stats["reasoning_time"],
+            reasoning_tokens=self.timing_stats["reasoning_tokens"]
+        )
+        print(f"DEBUG: Created output usage: {output.usage}")
 
         return output, buffer
 
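One subtlety in the LLMUsage construction above: because timing_stats["time_to_first_token"] now starts as None, the `or 0.0` coerces an unset value back to a number before it reaches LLMUsage. A one-line illustration of the coercion:

# `x or 0.0` maps both None and 0.0 to 0.0; any positive latency passes through unchanged.
for value in (None, 0.0, 0.37):
    print(value, "->", value or 0.0)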
{inferencesh-0.2.23.dist-info → inferencesh-0.2.24.dist-info}/RECORD
CHANGED
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=
+inferencesh/models/llm.py,sha256=ycg20sSx3UJevjoTVukBZXwRyXY06tFZKAmlVp0MBzQ,23168
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
-inferencesh-0.2.
+inferencesh-0.2.24.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.24.dist-info/METADATA,sha256=kQq9qN65EU9DS-SAQHm3Sw73yzz-FZVQX6ueHSgktW8,2757
+inferencesh-0.2.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.24.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.24.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.24.dist-info/RECORD,,
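To check an installed copy of inferencesh/models/llm.py against the new hash in this RECORD diff: wheel RECORD entries store the urlsafe-base64 SHA-256 digest of the file with padding stripped. A small verification sketch; record_hash and the path are illustrative, not part of the package.

import base64
import hashlib
from pathlib import Path

def record_hash(path: Path) -> str:
    """Compute a RECORD-style hash field (urlsafe base64 of SHA-256, '=' padding removed)."""
    digest = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

target = Path("inferencesh/models/llm.py")  # point this at an installed copy of the module
if target.exists():
    print(record_hash(target))
    # Expected for 0.2.24: sha256=ycg20sSx3UJevjoTVukBZXwRyXY06tFZKAmlVp0MBzQ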