inferencesh 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff shows the contents of publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of inferencesh might be problematic.

inferencesh/models/llm.py CHANGED
@@ -88,7 +88,7 @@ class LLMInput(BaseAppInput):
     context_size: int = Field(default=4096)
 
     # Model specific flags
-    enable_thinking: bool = Field(default=False)
+    reasoning: bool = Field(default=False)
 
 class LLMUsage(BaseAppOutput):
     stop_reason: str = ""
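
The only change here is a field rename on LLMInput: the enable_thinking flag becomes reasoning. A minimal sketch of a 0.2.16 call site, assuming the remaining LLMInput fields keep their defaults:

    from inferencesh.models.llm import LLMInput

    # 0.2.14: LLMInput(text="...", enable_thinking=True)
    # 0.2.16: the flag is now called `reasoning`
    input_data = LLMInput(
        text="Summarize the design doc in three bullet points.",
        reasoning=True,  # ask the model for <think>-style reasoning
    )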
@@ -97,11 +97,13 @@ class LLMUsage(BaseAppOutput):
     prompt_tokens: int = 0
     completion_tokens: int = 0
     total_tokens: int = 0
+    reasoning_tokens: int = 0
+    reasoning_time: float = 0.0
 
 
 class LLMOutput(BaseAppOutput):
     response: str
-    thinking_content: Optional[str] = None
+    reasoning: Optional[str] = None
     usage: Optional[LLMUsage] = None
 
 
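LLMOutput.thinking_content is likewise renamed to reasoning, and LLMUsage gains reasoning_tokens and reasoning_time. A small consumer sketch; the helper name is illustrative, not part of the SDK:

    def print_output(output) -> None:  # hypothetical helper
        print(output.response)
        if output.reasoning:
            print("reasoning:", output.reasoning)
        if output.usage:
            print(f"{output.usage.reasoning_tokens} reasoning tokens "
                  f"in {output.usage.reasoning_time:.2f}s")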
@@ -112,11 +114,27 @@ def timing_context():
         def __init__(self):
             self.start_time = time.time()
             self.first_token_time = None
+            self.reasoning_start_time = None
+            self.total_reasoning_time = 0.0
+            self.reasoning_tokens = 0
+            self.in_reasoning = False
 
         def mark_first_token(self):
             if self.first_token_time is None:
                 self.first_token_time = time.time()
 
+        def start_reasoning(self):
+            if not self.in_reasoning:
+                self.reasoning_start_time = time.time()
+                self.in_reasoning = True
+
+        def end_reasoning(self, token_count: int = 0):
+            if self.in_reasoning and self.reasoning_start_time:
+                self.total_reasoning_time += time.time() - self.reasoning_start_time
+                self.reasoning_tokens += token_count
+                self.reasoning_start_time = None
+                self.in_reasoning = False
+
         @property
         def stats(self):
             end_time = time.time()
@@ -128,7 +146,9 @@ def timing_context():
 
             return {
                 "time_to_first_token": time_to_first,
-                "generation_time": generation_time
+                "generation_time": generation_time,
+                "reasoning_time": self.total_reasoning_time,
+                "reasoning_tokens": self.reasoning_tokens
             }
 
     timing = TimingInfo()
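
TimingInfo now accumulates reasoning time and token counts across one or more start_reasoning/end_reasoning spans and reports them from stats. A rough sketch of the intended call pattern; when the hooks fire and the token count passed here are assumptions, not SDK code:

    timing.mark_first_token()
    timing.start_reasoning()               # e.g. on seeing "<think>"
    # ... reasoning tokens stream here ...
    timing.end_reasoning(token_count=128)  # e.g. on seeing "</think>"; count is illustrative
    print(timing.stats["reasoning_time"], timing.stats["reasoning_tokens"])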
@@ -147,78 +167,209 @@ def build_messages(
     transform_user_message: Optional[Callable[[str], str]] = None
 ) -> List[Dict[str, Any]]:
     """Build messages for LLaMA.cpp chat completion.
-
-    Args:
-        input_data: The input data
-        transform_user_message: Optional function to transform user message text before building messages
-    """
-    messages = [
-        {
-            "role": "system",
-            "content": [{"type": "text", "text": input_data.system_prompt}],
-        }
-    ]
 
-    # Add context messages
-    for msg in input_data.context:
-        message_content = []
-        text = msg.text
-        if transform_user_message and msg.role == ContextMessageRole.USER:
-            text = transform_user_message(text)
+    If any message includes image content, builds OpenAI-style multipart format.
+    Otherwise, uses plain string-only format.
+    """
+    def render_message(msg: ContextMessage, allow_multipart: bool) -> str | List[dict]:
+        parts = []
+        text = transform_user_message(msg.text) if transform_user_message and msg.role == ContextMessageRole.USER else msg.text
         if text:
-            message_content.append({"type": "text", "text": text})
-        if hasattr(msg, 'image') and msg.image:
+            parts.append({"type": "text", "text": text})
+        if msg.image:
             if msg.image.path:
                 image_data_uri = image_to_base64_data_uri(msg.image.path)
-                message_content.append({"type": "image_url", "image_url": {"url": image_data_uri}})
+                parts.append({"type": "image_url", "image_url": {"url": image_data_uri}})
             elif msg.image.uri:
-                message_content.append({"type": "image_url", "image_url": {"url": msg.image.uri}})
+                parts.append({"type": "image_url", "image_url": {"url": msg.image.uri}})
+        if allow_multipart:
+            return parts
+        if len(parts) == 1 and parts[0]["type"] == "text":
+            return parts[0]["text"]
+        raise ValueError("Image content requires multipart support")
+
+    multipart = any(m.image for m in input_data.context) or input_data.image is not None
+    messages = [{"role": "system", "content": input_data.system_prompt}]
+
+    for msg in input_data.context:
         messages.append({
             "role": msg.role,
-            "content": message_content
+            "content": render_message(msg, allow_multipart=multipart)
         })
 
-    # Add user message
-    user_content = []
-    text = input_data.text
-    if transform_user_message:
-        text = transform_user_message(text)
-    if text:
-        user_content.append({"type": "text", "text": text})
-    if hasattr(input_data, 'image') and input_data.image:
-        if input_data.image.path:
-            image_data_uri = image_to_base64_data_uri(input_data.image.path)
-            user_content.append({"type": "image_url", "image_url": {"url": image_data_uri}})
-        elif input_data.image.uri:
-            user_content.append({"type": "image_url", "image_url": {"url": input_data.image.uri}})
-    messages.append({"role": "user", "content": user_content})
+    user_msg = ContextMessage(role=ContextMessageRole.USER, text=input_data.text, image=input_data.image)
+    messages.append({
+        "role": "user",
+        "content": render_message(user_msg, allow_multipart=multipart)
+    })
 
     return messages
 
 
+class ResponseState:
+    """Holds the state of response transformation."""
+    def __init__(self):
+        self.buffer = ""
+        self.response = ""
+        self.reasoning = None
+        self.function_calls = None  # For future function calling support
+        self.tool_calls = None  # For future tool calling support
+        self.state_changes = {
+            "reasoning_started": False,
+            "reasoning_ended": False,
+            "function_call_started": False,
+            "function_call_ended": False,
+            "tool_call_started": False,
+            "tool_call_ended": False
+        }
+
+class ResponseTransformer:
+    """Base class for transforming model responses."""
+    def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
+        self.state = ResponseState()
+        self.output_cls = output_cls
+
+    def clean_text(self, text: str) -> str:
+        """Clean common tokens from the text and apply model-specific cleaning.
+
+        Args:
+            text: Raw text to clean
+
+        Returns:
+            Cleaned text with common and model-specific tokens removed
+        """
+        # Common token cleaning across most models
+        cleaned = (text.replace("<|im_end|>", "")
+                   .replace("<|im_start|>", "")
+                   .replace("<start_of_turn>", "")
+                   .replace("<end_of_turn>", "")
+                   .replace("<eos>", ""))
+        return self.additional_cleaning(cleaned)
+
+    def additional_cleaning(self, text: str) -> str:
+        """Apply model-specific token cleaning.
+
+        Args:
+            text: Text that has had common tokens removed
+
+        Returns:
+            Text with model-specific tokens removed
+        """
+        return text
+
+    def handle_reasoning(self, text: str) -> None:
+        """Handle reasoning/thinking detection and extraction.
+
+        Args:
+            text: Cleaned text to process for reasoning
+        """
+        # Default implementation for <think> style reasoning
+        if "<think>" in text:
+            self.state.state_changes["reasoning_started"] = True
+        if "</think>" in text:
+            self.state.state_changes["reasoning_ended"] = True
+
+        if "<think>" in self.state.buffer:
+            parts = self.state.buffer.split("</think>", 1)
+            if len(parts) > 1:
+                self.state.reasoning = parts[0].split("<think>", 1)[1].strip()
+                self.state.response = parts[1].strip()
+            else:
+                self.state.reasoning = self.state.buffer.split("<think>", 1)[1].strip()
+                self.state.response = ""
+        else:
+            self.state.response = self.state.buffer
+
+    def handle_function_calls(self, text: str) -> None:
+        """Handle function call detection and extraction.
+
+        Args:
+            text: Cleaned text to process for function calls
+        """
+        # Default no-op implementation
+        # Models can override this to implement function call handling
+        pass
+
+    def handle_tool_calls(self, text: str) -> None:
+        """Handle tool call detection and extraction.
+
+        Args:
+            text: Cleaned text to process for tool calls
+        """
+        # Default no-op implementation
+        # Models can override this to implement tool call handling
+        pass
+
+    def transform_chunk(self, chunk: str) -> None:
+        """Transform a single chunk of model output.
+
+        This method orchestrates the transformation process by:
+        1. Cleaning the text
+        2. Updating the buffer
+        3. Processing various capabilities (reasoning, function calls, etc)
+
+        Args:
+            chunk: Raw text chunk from the model
+        """
+        cleaned = self.clean_text(chunk)
+        self.state.buffer += cleaned
+
+        # Process different capabilities
+        self.handle_reasoning(cleaned)
+        self.handle_function_calls(cleaned)
+        self.handle_tool_calls(cleaned)
+
+    def build_output(self) -> tuple[str, LLMOutput, dict]:
+        """Build the final output tuple.
+
+        Returns:
+            Tuple of (buffer, LLMOutput, state_changes)
+        """
+        return (
+            self.state.buffer,
+            self.output_cls(
+                response=self.state.response.strip(),
+                reasoning=self.state.reasoning.strip() if self.state.reasoning else None,
+                function_calls=self.state.function_calls,
+                tool_calls=self.state.tool_calls
+            ),
+            self.state.state_changes
+        )
+
+    def __call__(self, piece: str, buffer: str) -> tuple[str, LLMOutput, dict]:
+        """Transform a piece of text and return the result.
+
+        Args:
+            piece: New piece of text to transform
+            buffer: Existing buffer content
+
+        Returns:
+            Tuple of (new_buffer, output, state_changes)
+        """
+        self.state.buffer = buffer
+        self.transform_chunk(piece)
+        return self.build_output()
+
+
 def stream_generate(
     model: Any,
     messages: List[Dict[str, Any]],
-    output_cls: type[LLMOutput],
+    transformer: ResponseTransformer,
     temperature: float = 0.7,
     top_p: float = 0.95,
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
-    handle_thinking: bool = False,
-    transform_response: Optional[Callable[[str, str], tuple[str, LLMOutput]]] = None,
 ) -> Generator[LLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking.
 
     Args:
         model: The LLaMA.cpp model instance
         messages: List of messages to send to the model
-        output_cls: Output class type to use for responses
+        transformer: ResponseTransformer instance to use for processing output
         temperature: Sampling temperature
         top_p: Top-p sampling threshold
         max_tokens: Maximum tokens to generate
         stop: Optional list of stop sequences
-        handle_thinking: Whether to handle thinking tags
-        transform_response: Optional function to transform responses, takes (piece, buffer) and returns (new_buffer, output)
     """
     response_queue: Queue[Optional[tuple[str, dict]]] = Queue()
     thread_exception = None
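
The new ResponseTransformer replaces the old handle_thinking/transform_response parameters: apps subclass it and override only the hooks they need, while the default handle_reasoning already splits <think>…</think> content out of the buffer. A minimal subclass sketch; the extra token stripped in additional_cleaning is just an example:

    class MyModelTransformer(ResponseTransformer):
        def additional_cleaning(self, text: str) -> str:
            # strip a hypothetical model-specific marker on top of the common tokens
            return text.replace("<|assistant|>", "")

    transformer = MyModelTransformer(output_cls=LLMOutput)
    buffer, output, state_changes = transformer("<think>plan</think>Hello", "")
    # expected per the code above: output.reasoning == "plan", output.response == "Hello"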
@@ -243,11 +394,9 @@ def stream_generate(
             )
 
             for chunk in completion:
-                # Get usage from root level if present
                 if "usage" in chunk and chunk["usage"] is not None:
                     usage_stats.update(chunk["usage"])
 
-                # Get content from choices
                 delta = chunk.get("choices", [{}])[0]
                 content = None
                 finish_reason = None
@@ -275,15 +424,15 @@ def stream_generate(
         tokens_per_second = (usage_stats["completion_tokens"] / generation_time) if generation_time > 0 else 0
         response_queue.put((None, {
             "time_to_first_token": timing_stats["time_to_first_token"],
-            "tokens_per_second": tokens_per_second
+            "tokens_per_second": tokens_per_second,
+            "reasoning_time": timing_stats["reasoning_time"],
+            "reasoning_tokens": timing_stats["reasoning_tokens"]
         }))
 
     thread = Thread(target=generation_thread, daemon=True)
     thread.start()
 
     buffer = ""
-    thinking_content = "" if handle_thinking else None
-    in_thinking = handle_thinking
     try:
         while True:
             try:
@@ -300,59 +449,18 @@ def stream_generate(
                         tokens_per_second=timing_stats["tokens_per_second"],
                         prompt_tokens=usage_stats["prompt_tokens"],
                         completion_tokens=usage_stats["completion_tokens"],
-                        total_tokens=usage_stats["total_tokens"]
+                        total_tokens=usage_stats["total_tokens"],
+                        reasoning_time=timing_stats["reasoning_time"],
+                        reasoning_tokens=timing_stats["reasoning_tokens"]
                     )
 
-                    if transform_response:
-                        buffer, output = transform_response(piece or "", buffer)
-                        output.usage = usage
-                        yield output
-                    else:
-                        # Handle thinking vs response content if enabled
-                        if handle_thinking and "</think>" in piece:
-                            parts = piece.split("</think>")
-                            if in_thinking:
-                                thinking_content += parts[0].replace("<think>", "")
-                                buffer = parts[1] if len(parts) > 1 else ""
-                                in_thinking = False
-                            else:
-                                buffer += piece
-                        else:
-                            if in_thinking:
-                                thinking_content += piece.replace("<think>", "")
-                            else:
-                                buffer += piece
-
-                        yield output_cls(
-                            response=buffer.strip(),
-                            thinking_content=thinking_content.strip() if thinking_content else None,
-                            usage=usage
-                        )
-                    break
-
-                if transform_response:
-                    buffer, output = transform_response(piece, buffer)
+                    buffer, output, _ = transformer(piece or "", buffer)
+                    output.usage = usage
                     yield output
-                else:
-                    # Handle thinking vs response content if enabled
-                    if handle_thinking and "</think>" in piece:
-                        parts = piece.split("</think>")
-                        if in_thinking:
-                            thinking_content += parts[0].replace("<think>", "")
-                            buffer = parts[1] if len(parts) > 1 else ""
-                            in_thinking = False
-                        else:
-                            buffer += piece
-                    else:
-                        if in_thinking:
-                            thinking_content += piece.replace("<think>", "")
-                        else:
-                            buffer += piece
+                    break
 
-                yield output_cls(
-                    response=buffer.strip(),
-                    thinking_content=thinking_content.strip() if thinking_content else None
-                )
+                buffer, output, _ = transformer(piece, buffer)
+                yield output
 
             except Exception as e:
                 if thread_exception and isinstance(e, thread_exception.__class__):
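
Putting it together, stream_generate now takes a transformer instead of output_cls, handle_thinking, and transform_response, and each yielded LLMOutput carries the split response/reasoning; usage is attached when the final stats arrive. A sketch of the new call shape, assuming a loaded llama-cpp-python model and otherwise default arguments:

    messages = build_messages(input_data)
    for output in stream_generate(
        model,
        messages,
        transformer=ResponseTransformer(),
        temperature=0.7,
        max_tokens=1024,
    ):
        print(output.response)
        if output.usage:  # set on the final yield
            print(f"{output.usage.tokens_per_second:.1f} tok/s")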
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inferencesh
-Version: 0.2.14
+Version: 0.2.16
 Summary: inference.sh Python SDK
 Author: Inference Shell Inc.
 Author-email: "Inference Shell Inc." <hello@inference.sh>
@@ -2,13 +2,13 @@ inferencesh/__init__.py,sha256=WdADtOhfa3HDOunoE9HLFCTFlXRykYstBIH1FpyWvj8,613
 inferencesh/models/__init__.py,sha256=FDwcdtT6c4hbRitymjmN-hZMlQa8RbKSftkZZyjtUXA,536
 inferencesh/models/base.py,sha256=4gZQRi8J7y9U6PrGD9pRIehd1MJVJAqGakPQDs2AKFM,3251
 inferencesh/models/file.py,sha256=5xnpypcRahM1YcEjj64rv9g2gTimxrZb41YT4r440hU,7393
-inferencesh/models/llm.py,sha256=x5nOLnE0A0NGpveETnVjz8ut0stHi9zRI_RNGClLzmY,15153
+inferencesh/models/llm.py,sha256=8wZATOPTYNgtzl34vKeQaCXDbpyTMCzA4jOGrxbo5L4,18380
 inferencesh/utils/__init__.py,sha256=-xiD6uo2XzcrPAWFb_fUbaimmnW4KFKc-8IvBzaxNd4,148
 inferencesh/utils/download.py,sha256=7n5twvoNYDcFnKJyefImaj2YfzRI7vddQw4usZbj38c,1521
 inferencesh/utils/storage.py,sha256=E4J8emd4eFKdmdDgAqzz3TpaaDd3n0l8gYlMHuY8yIU,519
-inferencesh-0.2.14.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
-inferencesh-0.2.14.dist-info/METADATA,sha256=P7wIUeprhVrBlpUKi5tyAiIQ-NZqMpFtjgbJWauho6I,2757
-inferencesh-0.2.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-inferencesh-0.2.14.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
-inferencesh-0.2.14.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
-inferencesh-0.2.14.dist-info/RECORD,,
+inferencesh-0.2.16.dist-info/licenses/LICENSE,sha256=OsgqEWIh2el_QMj0y8O1A5Q5Dl-dxqqYbFE6fszuR4s,1086
+inferencesh-0.2.16.dist-info/METADATA,sha256=wCILA3L4dmdzeKDEc3oNKH1etYwwYZjsCC7bfzepCa0,2757
+inferencesh-0.2.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+inferencesh-0.2.16.dist-info/entry_points.txt,sha256=6IC-fyozAqW3ljsMLGCXxJ0_ui2Jb-2fLHtoH1RTnEE,45
+inferencesh-0.2.16.dist-info/top_level.txt,sha256=TSMHg3T1ThMl1HGAWmzBClwOYH1ump5neof9BfHIwaA,12
+inferencesh-0.2.16.dist-info/RECORD,,