inferencesh 0.2.23-py3-none-any.whl → 0.2.24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of inferencesh might be problematic.

inferencesh/models/llm.py CHANGED
@@ -10,7 +10,6 @@ import base64
 from .base import BaseAppInput, BaseAppOutput
 from .file import File
 
-
 class ContextMessageRole(str, Enum):
     USER = "user"
     ASSISTANT = "assistant"
@@ -116,7 +115,7 @@ def timing_context():
 class TimingInfo:
     def __init__(self):
         self.start_time = time.time()
-        self.first_token_time = 0
+        self.first_token_time = None
         self.reasoning_start_time = None
         self.total_reasoning_time = 0.0
         self.reasoning_tokens = 0
@@ -140,12 +139,17 @@ def timing_context():
 
     @property
     def stats(self):
-        end_time = time.time()
+        current_time = time.time()
         if self.first_token_time is None:
-            self.first_token_time = end_time
+            return {
+                "time_to_first_token": 0.0,
+                "generation_time": 0.0,
+                "reasoning_time": self.total_reasoning_time,
+                "reasoning_tokens": self.reasoning_tokens
+            }
 
         time_to_first = self.first_token_time - self.start_time
-        generation_time = end_time - self.first_token_time
+        generation_time = current_time - self.first_token_time
 
         return {
             "time_to_first_token": time_to_first,
@@ -216,7 +220,7 @@ class StreamResponse:
         self.tool_calls = None  # Changed from [] to None
         self.finish_reason = None
         self.timing_stats = {
-            "time_to_first_token": 0.0,
+            "time_to_first_token": None,  # Changed from 0.0 to None
             "generation_time": 0.0,
             "reasoning_time": 0.0,
             "reasoning_tokens": 0,
@@ -231,9 +235,22 @@ class StreamResponse:
 
     def update_from_chunk(self, chunk: Dict[str, Any], timing: Any) -> None:
         """Update response state from a chunk."""
+        print("DEBUG: Entering update_from_chunk")
+        print(f"DEBUG: Current usage stats: {self.usage_stats}")
+        print(f"DEBUG: Chunk: {chunk}")
+
         # Update usage stats if present
-        if "usage" in chunk and chunk["usage"] is not None:
-            self.usage_stats.update(chunk["usage"])
+        if "usage" in chunk:
+            usage = chunk["usage"]
+            if usage is not None:
+                print(f"DEBUG: Updating usage stats with: {usage}")
+                # Update usage stats preserving existing values if not provided
+                self.usage_stats.update({
+                    "prompt_tokens": usage.get("prompt_tokens", self.usage_stats["prompt_tokens"]),
+                    "completion_tokens": usage.get("completion_tokens", self.usage_stats["completion_tokens"]),
+                    "total_tokens": usage.get("total_tokens", self.usage_stats["total_tokens"])
+                })
+                print(f"DEBUG: Updated usage stats: {self.usage_stats}")
 
         # Get the delta from the chunk
         delta = chunk.get("choices", [{}])[0]
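
The rewritten usage handling merges partial usage payloads field by field rather than replacing the whole dict, so a chunk that reports only prompt_tokens no longer wipes out a completion_tokens value seen earlier. A rough, self-contained illustration of that merge rule (the surrounding class is elided and the numbers are made up):

usage_stats = {"prompt_tokens": 12, "completion_tokens": 40, "total_tokens": 52}

chunk = {"usage": {"prompt_tokens": 12}}  # a later chunk reporting only prompt_tokens

usage = chunk.get("usage")
if usage is not None:
    # Each field falls back to the previously accumulated value when absent.
    usage_stats.update({
        "prompt_tokens": usage.get("prompt_tokens", usage_stats["prompt_tokens"]),
        "completion_tokens": usage.get("completion_tokens", usage_stats["completion_tokens"]),
        "total_tokens": usage.get("total_tokens", usage_stats["total_tokens"]),
    })

print(usage_stats)  # completion_tokens is preserved: {'prompt_tokens': 12, 'completion_tokens': 40, 'total_tokens': 52}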
@@ -245,23 +262,35 @@ class StreamResponse:
             if message.get("tool_calls"):
                 self._update_tool_calls(message["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
         elif "delta" in delta:
             delta_content = delta["delta"]
             self.content = delta_content.get("content", "")
             if delta_content.get("tool_calls"):
                 self._update_tool_calls(delta_content["tool_calls"])
             self.finish_reason = delta.get("finish_reason")
+            if self.finish_reason:
+                self.usage_stats["stop_reason"] = self.finish_reason
 
-        # Update timing stats while preserving tokens_per_second
+        # Update timing stats
         timing_stats = timing.stats
-        generation_time = timing_stats["generation_time"]
-        completion_tokens = self.usage_stats.get("completion_tokens", 0)
-        tokens_per_second = (completion_tokens / generation_time) if generation_time > 0 and completion_tokens > 0 else 0.0
+        if self.timing_stats["time_to_first_token"] is None:
+            self.timing_stats["time_to_first_token"] = timing_stats["time_to_first_token"]
 
         self.timing_stats.update({
-            **timing_stats,
-            "tokens_per_second": tokens_per_second
+            "generation_time": timing_stats["generation_time"],
+            "reasoning_time": timing_stats["reasoning_time"],
+            "reasoning_tokens": timing_stats["reasoning_tokens"]
         })
+
+        # Calculate tokens per second only if we have valid completion tokens and generation time
+        if self.usage_stats["completion_tokens"] > 0 and timing_stats["generation_time"] > 0:
+            self.timing_stats["tokens_per_second"] = (
+                self.usage_stats["completion_tokens"] / timing_stats["generation_time"]
+            )
+
+        print(f"DEBUG: Final usage stats in update_from_chunk: {self.usage_stats}")
 
     def _update_tool_calls(self, new_tool_calls: List[Dict[str, Any]]) -> None:
         """Update tool calls, handling both full and partial updates."""
@@ -292,29 +321,40 @@ class StreamResponse:
                     current_tool["function"]["arguments"] += func_delta["arguments"]
 
     def has_updates(self) -> bool:
-        """Check if this response has any content or tool call updates."""
-        return bool(self.content) or bool(self.tool_calls)
+        """Check if this response has any content, tool call, or usage updates."""
+        has_content = bool(self.content)
+        has_tool_calls = bool(self.tool_calls)
+        has_usage = self.usage_stats["prompt_tokens"] > 0 or self.usage_stats["completion_tokens"] > 0
+        has_finish = bool(self.finish_reason)
+
+        print(f"DEBUG: has_updates check - content: {has_content}, tool_calls: {has_tool_calls}, usage: {has_usage}, finish: {has_finish}")
+
+        return has_content or has_tool_calls or has_usage or has_finish
 
     def to_output(self, buffer: str, transformer: Any) -> LLMOutput:
         """Convert current state to LLMOutput."""
+        print("DEBUG: Entering to_output")
+        print(f"DEBUG: Usage stats before conversion: {self.usage_stats}")
+
         buffer, output, _ = transformer(self.content, buffer)
 
         # Add tool calls if present
         if self.tool_calls:
            output.tool_calls = self.tool_calls
 
-        # Add usage stats if this is final
-        if self.finish_reason:
-            output.usage = LLMUsage(
-                stop_reason=self.usage_stats["stop_reason"],
-                time_to_first_token=self.timing_stats["time_to_first_token"],
-                tokens_per_second=self.timing_stats["tokens_per_second"],
-                prompt_tokens=self.usage_stats["prompt_tokens"],
-                completion_tokens=self.usage_stats["completion_tokens"],
-                total_tokens=self.usage_stats["total_tokens"],
-                reasoning_time=self.timing_stats["reasoning_time"],
-                reasoning_tokens=self.timing_stats["reasoning_tokens"]
-            )
+        # Add usage stats
+        print(f"DEBUG: Creating LLMUsage with stats: {self.usage_stats}")
+        output.usage = LLMUsage(
+            stop_reason=self.usage_stats["stop_reason"],
+            time_to_first_token=self.timing_stats["time_to_first_token"] or 0.0,
+            tokens_per_second=self.timing_stats["tokens_per_second"],
+            prompt_tokens=self.usage_stats["prompt_tokens"],
+            completion_tokens=self.usage_stats["completion_tokens"],
+            total_tokens=self.usage_stats["total_tokens"],
+            reasoning_time=self.timing_stats["reasoning_time"],
+            reasoning_tokens=self.timing_stats["reasoning_tokens"]
+        )
+        print(f"DEBUG: Created output usage: {output.usage}")
 
         return output, buffer
 
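Downstream of these changes, has_updates now also reports usage-only or finish-only chunks, and to_output attaches an LLMUsage object to every emitted output rather than only the final one (falling back to 0.0 while time_to_first_token is still None). A hedged sketch of how a streaming consumer might rely on that, using toy stand-in objects since the actual streaming loop is not part of this diff:

from types import SimpleNamespace

def latest_usage(outputs):
    """Return the most recent usage snapshot from a stream of LLMOutput-like objects."""
    usage = None
    for output in outputs:
        # Since 0.2.24, output.usage is populated on every chunk rather than only
        # when finish_reason is set, so the last snapshot carries the final totals.
        if getattr(output, "usage", None) is not None:
            usage = output.usage
    return usage

# Toy stand-ins for streamed outputs; real LLMOutput objects come from the SDK.
chunks = [SimpleNamespace(usage=SimpleNamespace(completion_tokens=5)),
          SimpleNamespace(usage=SimpleNamespace(completion_tokens=42, stop_reason="stop"))]
print(latest_usage(chunks).completion_tokens)  # 42
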
inferencesh-0.2.23.dist-info/METADATA → inferencesh-0.2.24.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.23
+Version: 0.2.24
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
inferencesh-0.2.23.dist-info/RECORD → inferencesh-0.2.24.dist-info/RECORD
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=knvwpKECQb67rG8VIt-VmZu0aDVpABzQiifrytAfv9s,20932
+inferencesh/models/llm.py,sha256=ycg20sSx3UJevjoTVukBZXwRyXY06tFZKAmlVp0MBzQ,23168
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.23.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
-inferencesh-0.2.23.dist-info/METADATA,sha256=w5AOt2foy30CdqgfcivGhBflpWOvtm1B7tHEJo_ipVE,2757
-inferencesh-0.2.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-inferencesh-0.2.23.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
-inferencesh-0.2.23.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
-inferencesh-0.2.23.dist-info/RECORD,,
+inferencesh-0.2.24.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.24.dist-info/METADATA,sha256=kQq9qN65EU9DS-SAQHm3Sw73yzz-FZVQX6ueHSgktW8,2757
+inferencesh-0.2.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.24.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.24.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.24.dist-info/RECORD,,