abstractcore 2.4.5-py3-none-any.whl → 2.4.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
abstractcore/__init__.py CHANGED
@@ -44,6 +44,9 @@ except ImportError:
     from .processing import BasicSummarizer, SummaryStyle, SummaryLength, BasicExtractor
     _has_processing = True
 
+# Tools module (core functionality)
+from .tools import tool
+
 __all__ = [
     'create_llm',
     'BasicSession',
@@ -54,7 +57,8 @@ __all__ = [
     'MessageRole',
     'ModelNotFoundError',
     'ProviderAPIError',
-    'AuthenticationError'
+    'AuthenticationError',
+    'tool'
 ]
 
 if _has_embeddings:
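With these `__init__.py` changes, `tool` is importable from the package root instead of `abstractcore.tools`. A minimal sketch, assuming the decorator is applied directly to a plain function (its exact signature is not shown in this diff):

```python
# Illustrative only: the top-level export added in 2.4.7.
from abstractcore import tool

@tool
def get_weather(city: str) -> str:
    """Return a canned weather string for a city (placeholder tool)."""
    return f"Sunny in {city}"
```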
@@ -109,7 +109,7 @@
         "tokens_before": { "type": "integer" },
         "tokens_after": { "type": "integer" },
         "compression_ratio": { "type": "number" },
-        "generation_time_ms": { "type": "number" }
+        "gen_time": { "type": "number" }
       }
     }
   },
@@ -70,6 +70,8 @@ class AbstractCoreInterface(ABC):
                  max_tokens: Optional[int] = None,
                  max_input_tokens: Optional[int] = None,
                  max_output_tokens: int = 2048,
+                 temperature: float = 0.7,
+                 seed: Optional[int] = None,
                  debug: bool = False,
                  **kwargs):
         self.model = model
@@ -79,6 +81,11 @@ class AbstractCoreInterface(ABC):
         self.max_tokens = max_tokens
         self.max_input_tokens = max_input_tokens
         self.max_output_tokens = max_output_tokens
+
+        # Unified generation parameters
+        self.temperature = temperature
+        self.seed = seed
+
         self.debug = debug
 
         # Validate token parameters
@@ -32,8 +32,23 @@ class BasicSession:
                  tool_timeout: Optional[float] = None,
                  recovery_timeout: Optional[float] = None,
                  auto_compact: bool = False,
-                 auto_compact_threshold: int = 6000):
-        """Initialize basic session"""
+                 auto_compact_threshold: int = 6000,
+                 temperature: Optional[float] = None,
+                 seed: Optional[int] = None):
+        """Initialize basic session
+
+        Args:
+            provider: LLM provider instance
+            system_prompt: System prompt for the session
+            tools: List of available tools
+            timeout: HTTP request timeout
+            tool_timeout: Tool execution timeout
+            recovery_timeout: Circuit breaker recovery timeout
+            auto_compact: Enable automatic conversation compaction
+            auto_compact_threshold: Token threshold for auto-compaction
+            temperature: Default temperature for generation (0.0-1.0)
+            seed: Default seed for deterministic generation
+        """
 
         self.provider = provider
         self.id = str(uuid.uuid4())
@@ -45,6 +60,10 @@ class BasicSession:
         self.auto_compact_threshold = auto_compact_threshold
         self._original_session = None  # Track if this is a compacted session
 
+        # Store session-level generation parameters
+        self.temperature = temperature
+        self.seed = seed
+
         # Optional analytics fields
         self.summary = None
         self.assessment = None
@@ -189,6 +208,12 @@ class BasicSession:
         # Extract media parameter explicitly (fix for media parameter passing)
         media = kwargs.pop('media', None)
 
+        # Add session-level parameters if not overridden in kwargs
+        if 'temperature' not in kwargs and self.temperature is not None:
+            kwargs['temperature'] = self.temperature
+        if 'seed' not in kwargs and self.seed is not None:
+            kwargs['seed'] = self.seed
+
         # Call provider
         response = self.provider.generate(
             prompt=prompt,
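Session-level defaults only apply when the caller has not passed the same parameter explicitly. A hedged sketch of that precedence; the `create_llm` call, provider/model names, and the `generate()` method name are assumptions from the public API, while the temperature/seed handling is what this hunk implements:

```python
from abstractcore import create_llm, BasicSession

llm = create_llm("openai", model="gpt-4o-mini")          # placeholder provider/model
session = BasicSession(llm, temperature=0.2, seed=42)

session.generate("Summarize the release notes.")         # inherits temperature=0.2, seed=42
session.generate("Now be creative.", temperature=0.9)    # explicit kwarg wins; seed=42 still applied
```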
@@ -735,7 +760,7 @@ class BasicSession:
                 "tokens_before": original_tokens,
                 "tokens_after": self._estimate_tokens_for_summary(summary_result.summary),
                 "compression_ratio": self._calculate_compression_ratio(original_tokens, summary_result.summary),
-                "generation_time_ms": duration_ms
+                "gen_time": duration_ms
             }
         }
 
@@ -91,6 +91,7 @@ class GenerateResponse:
     usage: Optional[Dict[str, int]] = None
     tool_calls: Optional[List[Dict[str, Any]]] = None
     metadata: Optional[Dict[str, Any]] = None
+    gen_time: Optional[float] = None  # Generation time in milliseconds
 
     def has_tool_calls(self) -> bool:
         """Check if response contains tool calls"""
@@ -109,6 +110,29 @@ class GenerateResponse:
             parts.append(f"Model: {self.model}")
         if self.usage:
             parts.append(f"Tokens: {self.usage.get('total_tokens', 'unknown')}")
+        if self.gen_time:
+            parts.append(f"Time: {self.gen_time:.1f}ms")
         if self.tool_calls:
             parts.append(f"Tools: {len(self.tool_calls)} executed")
-        return " | ".join(parts)
+        return " | ".join(parts)
+
+    @property
+    def input_tokens(self) -> Optional[int]:
+        """Get input tokens with consistent terminology (prompt_tokens or input_tokens)."""
+        if not self.usage:
+            return None
+        return self.usage.get('input_tokens') or self.usage.get('prompt_tokens')
+
+    @property
+    def output_tokens(self) -> Optional[int]:
+        """Get output tokens with consistent terminology (completion_tokens or output_tokens)."""
+        if not self.usage:
+            return None
+        return self.usage.get('output_tokens') or self.usage.get('completion_tokens')
+
+    @property
+    def total_tokens(self) -> Optional[int]:
+        """Get total tokens."""
+        if not self.usage:
+            return None
+        return self.usage.get('total_tokens')
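The new `gen_time` field and the token properties give a provider-independent way to read response metadata. A short illustration; the `create_llm` call and the provider/model names are placeholders, while `gen_time`, `input_tokens`, `output_tokens`, and `total_tokens` come from this diff:

```python
from abstractcore import create_llm

llm = create_llm("ollama", model="llama3")        # hypothetical provider/model
response = llm.generate("Hello")
if response.gen_time is not None:
    print(f"generated in {response.gen_time:.1f} ms")
# The properties resolve either the new or the legacy usage keys.
print(response.input_tokens, response.output_tokens, response.total_tokens)
```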
@@ -47,8 +47,7 @@ class AnthropicProvider(BaseProvider):
         # Initialize tool handler
         self.tool_handler = UniversalToolHandler(model)
 
-        # Store configuration (remove duplicate max_tokens)
-        self.temperature = kwargs.get("temperature", 0.7)
+        # Store provider-specific configuration
         self.top_p = kwargs.get("top_p", 1.0)
         self.top_k = kwargs.get("top_k", None)
 
@@ -132,6 +131,19 @@ class AnthropicProvider(BaseProvider):
         if kwargs.get("top_k") or self.top_k:
             call_params["top_k"] = kwargs.get("top_k", self.top_k)
 
+        # Handle seed parameter (Anthropic doesn't support seed natively)
+        seed_value = kwargs.get("seed", self.seed)
+        if seed_value is not None:
+            import warnings
+            warnings.warn(
+                f"Seed parameter ({seed_value}) is not supported by Anthropic Claude API. "
+                f"For deterministic outputs, use temperature=0.0 which may provide more consistent results, "
+                f"though true determinism is not guaranteed.",
+                UserWarning,
+                stacklevel=3
+            )
+            self.logger.warning(f"Seed {seed_value} requested but not supported by Anthropic API")
+
         # Handle structured output using the "tool trick"
         structured_tool_name = None
         if response_model and PYDANTIC_AVAILABLE:
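Because Anthropic has no native seed support, a requested seed now surfaces as a `UserWarning` instead of being silently dropped. A hedged sketch of how a caller might observe this; the `create_llm` call and model name are placeholders, and a real run needs Anthropic credentials:

```python
import warnings

from abstractcore import create_llm

llm = create_llm("anthropic", model="claude-3-5-haiku-latest", seed=42)  # placeholder model
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    llm.generate("ping")
    # Expect a UserWarning explaining the seed is ignored and temperature=0.0 is the closest alternative.
    print([str(w.message) for w in caught if issubclass(w.category, UserWarning)])
```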
@@ -174,8 +186,14 @@ class AnthropicProvider(BaseProvider):
         if stream:
             return self._stream_response(call_params, tools)
         else:
+            # Track generation time
+            start_time = time.time()
             response = self.client.messages.create(**call_params)
+            gen_time = round((time.time() - start_time) * 1000, 1)
+
             formatted = self._format_response(response)
+            # Add generation time to response
+            formatted.gen_time = gen_time
 
             # Handle tool execution for Anthropic responses
             if tools and (formatted.has_tool_calls() or
@@ -570,8 +570,32 @@ class BaseProvider(AbstractCoreInterface, ABC):
         result_kwargs = kwargs.copy()
         result_kwargs["max_output_tokens"] = effective_max_output
 
+        # Add unified generation parameters with fallback hierarchy: kwargs → instance → defaults
+        result_kwargs["temperature"] = result_kwargs.get("temperature", self.temperature)
+        if self.seed is not None:
+            result_kwargs["seed"] = result_kwargs.get("seed", self.seed)
+
         return result_kwargs
 
+    def _extract_generation_params(self, **kwargs) -> Dict[str, Any]:
+        """
+        Extract generation parameters with consistent fallback hierarchy.
+
+        Returns:
+            Dict containing temperature, seed, and other generation parameters
+        """
+        params = {}
+
+        # Temperature (always present)
+        params["temperature"] = kwargs.get("temperature", self.temperature)
+
+        # Seed (only if not None)
+        seed_value = kwargs.get("seed", self.seed)
+        if seed_value is not None:
+            params["seed"] = seed_value
+
+        return params
+
     def _get_provider_max_tokens_param(self, kwargs: Dict[str, Any]) -> int:
         """
         Extract the appropriate max tokens parameter for this provider.
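The same kwargs → instance → default ordering appears in both `_prepare_generation_kwargs` and the new `_extract_generation_params`. A standalone, self-contained illustration of that precedence (not the library's code; names here are hypothetical):

```python
from typing import Any, Dict, Optional

def resolve_generation_params(call_kwargs: Dict[str, Any],
                              instance_temperature: float = 0.7,
                              instance_seed: Optional[int] = None) -> Dict[str, Any]:
    """Per-call kwargs win, then instance-level settings, then the library defaults."""
    params = {"temperature": call_kwargs.get("temperature", instance_temperature)}
    seed = call_kwargs.get("seed", instance_seed)
    if seed is not None:  # seed is only forwarded when explicitly configured
        params["seed"] = seed
    return params

assert resolve_generation_params({}) == {"temperature": 0.7}
assert resolve_generation_params({"temperature": 0.1}, instance_seed=7) == {"temperature": 0.1, "seed": 7}
```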
@@ -68,6 +68,7 @@ class HuggingFaceProvider(BaseProvider):
         # Initialize tool handler
         self.tool_handler = UniversalToolHandler(model)
 
+        # Store provider-specific configuration
         self.n_gpu_layers = n_gpu_layers
         self.model_type = None  # Will be "transformers" or "gguf"
         self.device = device
@@ -537,14 +538,15 @@ class HuggingFaceProvider(BaseProvider):
         # Generation parameters using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
         max_new_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-        temperature = kwargs.get("temperature", 0.7)
+        temperature = kwargs.get("temperature", self.temperature)
         top_p = kwargs.get("top_p", 0.9)
+        seed_value = kwargs.get("seed", self.seed)
 
         try:
             if stream:
-                return self._stream_generate_transformers_with_tools(input_text, max_new_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'))
+                return self._stream_generate_transformers_with_tools(input_text, max_new_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'), seed_value)
             else:
-                response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p)
+                response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p, seed_value)
 
                 # Handle tool execution for prompted models
                 if tools and self.tool_handler.supports_prompted and response.content:
@@ -651,11 +653,16 @@ class HuggingFaceProvider(BaseProvider):
         generation_kwargs = {
             "messages": chat_messages,
             "max_tokens": max_output_tokens,  # This is max_output_tokens for llama-cpp
-            "temperature": kwargs.get("temperature", 0.7),
+            "temperature": kwargs.get("temperature", self.temperature),
             "top_p": kwargs.get("top_p", 0.9),
             "stream": stream
         }
 
+        # Add seed if provided (GGUF/llama-cpp supports seed)
+        seed_value = kwargs.get("seed", self.seed)
+        if seed_value is not None:
+            generation_kwargs["seed"] = seed_value
+
         # Handle tools - both native and prompted support
         has_native_tools = False
         if tools:
@@ -846,9 +853,19 @@ class HuggingFaceProvider(BaseProvider):
             )
 
     def _single_generate_transformers(self, input_text: str, max_new_tokens: int,
-                                      temperature: float, top_p: float) -> GenerateResponse:
+                                      temperature: float, top_p: float, seed: Optional[int] = None) -> GenerateResponse:
         """Generate single response using transformers (original implementation)"""
         try:
+            # Set seed for deterministic generation if provided
+            if seed is not None:
+                import torch
+                torch.manual_seed(seed)
+                if torch.cuda.is_available():
+                    torch.cuda.manual_seed_all(seed)
+
+            # Track generation time
+            start_time = time.time()
+
             outputs = self.pipeline(
                 input_text,
                 max_new_tokens=max_new_tokens,
@@ -860,6 +877,8 @@ class HuggingFaceProvider(BaseProvider):
                 truncation=True,
                 return_full_text=False
             )
+
+            gen_time = round((time.time() - start_time) * 1000, 1)
 
             if outputs and len(outputs) > 0:
                 response_text = outputs[0]['generated_text'].strip()
@@ -871,42 +890,49 @@ class HuggingFaceProvider(BaseProvider):
                     content=response_text,
                     model=self.model,
                     finish_reason="stop",
-                    usage=usage
+                    usage=usage,
+                    gen_time=gen_time
                 )
             else:
                 return GenerateResponse(
                     content="",
                     model=self.model,
-                    finish_reason="stop"
+                    finish_reason="stop",
+                    gen_time=gen_time
                 )
 
         except Exception as e:
+            gen_time = round((time.time() - start_time) * 1000, 1) if 'start_time' in locals() else 0.0
             return GenerateResponse(
                 content=f"Error: {str(e)}",
                 model=self.model,
-                finish_reason="error"
+                finish_reason="error",
+                gen_time=gen_time
             )
 
     def _calculate_usage(self, prompt: str, response: str) -> Dict[str, int]:
        """Calculate token usage using centralized token utilities."""
        from ..utils.token_utils import TokenUtils
 
-       prompt_tokens = TokenUtils.estimate_tokens(prompt, self.model)
-       completion_tokens = TokenUtils.estimate_tokens(response, self.model)
-       total_tokens = prompt_tokens + completion_tokens
+       input_tokens = TokenUtils.estimate_tokens(prompt, self.model)
+       output_tokens = TokenUtils.estimate_tokens(response, self.model)
+       total_tokens = input_tokens + output_tokens
 
        return {
-           "prompt_tokens": prompt_tokens,
-           "completion_tokens": completion_tokens,
-           "total_tokens": total_tokens
+           "input_tokens": input_tokens,
+           "output_tokens": output_tokens,
+           "total_tokens": total_tokens,
+           # Keep legacy keys for backward compatibility
+           "prompt_tokens": input_tokens,
+           "completion_tokens": output_tokens
        }
 
    def _stream_generate_transformers(self, input_text: str, max_new_tokens: int,
-                                      temperature: float, top_p: float, tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
+                                      temperature: float, top_p: float, tool_call_tags: Optional[str] = None, seed: Optional[int] = None) -> Iterator[GenerateResponse]:
        """Stream response using transformers (simulated, original implementation) with tool tag rewriting support"""
        try:
            # HuggingFace doesn't have native streaming, so we simulate it
-           full_response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p)
+           full_response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p, seed)
 
            if full_response.content:
                # Apply tool tag rewriting if enabled
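The `_calculate_usage` rewrite (repeated for the MLX and Mock providers below) keeps both key spellings in the usage dict, so existing callers that read the legacy names keep working. A small illustration with made-up numbers:

```python
# The normalized usage dict carries new and legacy key names side by side.
usage = {
    "input_tokens": 12, "output_tokens": 34, "total_tokens": 46,
    "prompt_tokens": 12, "completion_tokens": 34,   # legacy aliases
}
assert usage["prompt_tokens"] == usage["input_tokens"]
assert usage["completion_tokens"] == usage["output_tokens"]
```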
@@ -1039,12 +1065,12 @@ class HuggingFaceProvider(BaseProvider):
     def _stream_generate_transformers_with_tools(self, input_text: str, max_new_tokens: int,
                                                  temperature: float, top_p: float,
                                                  tools: Optional[List[Dict[str, Any]]] = None,
-                                                 tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
+                                                 tool_call_tags: Optional[str] = None, seed: Optional[int] = None) -> Iterator[GenerateResponse]:
         """Stream generate with tool execution at the end"""
         collected_content = ""
 
         # Stream the response content
-        for chunk in self._stream_generate_transformers(input_text, max_new_tokens, temperature, top_p, tool_call_tags):
+        for chunk in self._stream_generate_transformers(input_text, max_new_tokens, temperature, top_p, tool_call_tags, seed):
             collected_content += chunk.content
             yield chunk
 
@@ -4,6 +4,7 @@ LM Studio provider implementation (OpenAI-compatible API).
 
 import httpx
 import json
+import time
 from typing import List, Dict, Any, Optional, Union, Iterator, Type
 
 try:
@@ -196,11 +197,16 @@ class LMStudioProvider(BaseProvider):
             "model": self.model,
             "messages": chat_messages,
             "stream": stream,
-            "temperature": kwargs.get("temperature", 0.7),
+            "temperature": kwargs.get("temperature", self.temperature),
             "max_tokens": max_output_tokens,  # LMStudio uses max_tokens for output tokens
             "top_p": kwargs.get("top_p", 0.9),
         }
 
+        # Add seed if provided (LMStudio supports seed via OpenAI-compatible API)
+        seed_value = kwargs.get("seed", self.seed)
+        if seed_value is not None:
+            payload["seed"] = seed_value
+
         if stream:
             # Return streaming response - BaseProvider will handle tag rewriting via UnifiedStreamProcessor
             return self._stream_generate(payload)
@@ -220,12 +226,15 @@ class LMStudioProvider(BaseProvider):
         if not hasattr(self, 'client') or self.client is None:
             raise ProviderAPIError("HTTP client not initialized")
 
+        # Track generation time
+        start_time = time.time()
         response = self.client.post(
             f"{self.base_url}/chat/completions",
             json=payload,
             headers={"Content-Type": "application/json"}
         )
         response.raise_for_status()
+        gen_time = round((time.time() - start_time) * 1000, 1)
 
         result = response.json()
 
@@ -247,10 +256,14 @@ class LMStudioProvider(BaseProvider):
             finish_reason=finish_reason,
             raw_response=result,
             usage={
+                "input_tokens": usage.get("prompt_tokens", 0),
+                "output_tokens": usage.get("completion_tokens", 0),
+                "total_tokens": usage.get("total_tokens", 0),
+                # Keep legacy keys for backward compatibility
                 "prompt_tokens": usage.get("prompt_tokens", 0),
-                "completion_tokens": usage.get("completion_tokens", 0),
-                "total_tokens": usage.get("total_tokens", 0)
-            }
+                "completion_tokens": usage.get("completion_tokens", 0)
+            },
+            gen_time=gen_time
         )
 
     except AttributeError as e:
@@ -189,14 +189,15 @@ class MLXProvider(BaseProvider):
         # MLX generation parameters using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
         max_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-        temperature = kwargs.get("temperature", 0.7)
+        temperature = kwargs.get("temperature", self.temperature)
         top_p = kwargs.get("top_p", 0.9)
+        seed_value = kwargs.get("seed", self.seed)
 
         try:
             if stream:
-                return self._stream_generate_with_tools(full_prompt, max_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'))
+                return self._stream_generate_with_tools(full_prompt, max_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'), seed_value)
             else:
-                response = self._single_generate(full_prompt, max_tokens, temperature, top_p)
+                response = self._single_generate(full_prompt, max_tokens, temperature, top_p, seed_value)
 
                 # Handle tool execution for prompted models
                 if tools and self.tool_handler.supports_prompted and response.content:
@@ -256,9 +257,18 @@ class MLXProvider(BaseProvider):
 
         return full_prompt
 
-    def _single_generate(self, prompt: str, max_tokens: int, temperature: float, top_p: float) -> GenerateResponse:
+    def _single_generate(self, prompt: str, max_tokens: int, temperature: float, top_p: float, seed: Optional[int] = None) -> GenerateResponse:
         """Generate single response"""
 
+        # Handle seed parameter (MLX supports seed via mx.random.seed)
+        if seed is not None:
+            import mlx.core as mx
+            mx.random.seed(seed)
+            self.logger.debug(f"Set MLX random seed to {seed} for deterministic generation")
+
+        # Track generation time
+        start_time = time.time()
+
         # Try different MLX API signatures
         try:
             # Try new mlx-lm API
@@ -281,6 +291,8 @@ class MLXProvider(BaseProvider):
             # Fallback to basic response
             response_text = prompt + " I am an AI assistant powered by MLX on Apple Silicon."
 
+        gen_time = round((time.time() - start_time) * 1000, 1)
+
         # Use the full response as-is - preserve all content including thinking
         generated = response_text.strip()
 
@@ -288,26 +300,36 @@ class MLXProvider(BaseProvider):
             content=generated,
             model=self.model,
             finish_reason="stop",
-            usage=self._calculate_usage(prompt, generated)
+            usage=self._calculate_usage(prompt, generated),
+            gen_time=gen_time
         )
 
     def _calculate_usage(self, prompt: str, response: str) -> Dict[str, int]:
         """Calculate token usage using centralized token utilities."""
         from ..utils.token_utils import TokenUtils
 
-        prompt_tokens = TokenUtils.estimate_tokens(prompt, self.model)
-        completion_tokens = TokenUtils.estimate_tokens(response, self.model)
-        total_tokens = prompt_tokens + completion_tokens
+        input_tokens = TokenUtils.estimate_tokens(prompt, self.model)
+        output_tokens = TokenUtils.estimate_tokens(response, self.model)
+        total_tokens = input_tokens + output_tokens
 
         return {
-            "prompt_tokens": prompt_tokens,
-            "completion_tokens": completion_tokens,
-            "total_tokens": total_tokens
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "total_tokens": total_tokens,
+            # Keep legacy keys for backward compatibility
+            "prompt_tokens": input_tokens,
+            "completion_tokens": output_tokens
         }
 
-    def _stream_generate(self, prompt: str, max_tokens: int, temperature: float, top_p: float, tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
+    def _stream_generate(self, prompt: str, max_tokens: int, temperature: float, top_p: float, tool_call_tags: Optional[str] = None, seed: Optional[int] = None) -> Iterator[GenerateResponse]:
         """Generate real streaming response using MLX stream_generate with tool tag rewriting support"""
         try:
+            # Handle seed parameter (MLX supports seed via mx.random.seed)
+            if seed is not None:
+                import mlx.core as mx
+                mx.random.seed(seed)
+                self.logger.debug(f"Set MLX random seed to {seed} for deterministic streaming generation")
+
             # Initialize tool tag rewriter if needed
             rewriter = None
             buffer = ""
@@ -366,12 +388,12 @@ class MLXProvider(BaseProvider):
     def _stream_generate_with_tools(self, full_prompt: str, max_tokens: int,
                                     temperature: float, top_p: float,
                                     tools: Optional[List[Dict[str, Any]]] = None,
-                                    tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
+                                    tool_call_tags: Optional[str] = None, seed: Optional[int] = None) -> Iterator[GenerateResponse]:
         """Stream generate with tool execution at the end"""
         collected_content = ""
 
         # Stream the response content
-        for chunk in self._stream_generate(full_prompt, max_tokens, temperature, top_p, tool_call_tags):
+        for chunk in self._stream_generate(full_prompt, max_tokens, temperature, top_p, tool_call_tags, seed):
             collected_content += chunk.content
             yield chunk
 
@@ -48,6 +48,12 @@ class MockProvider(BaseProvider):
 
     def _single_response(self, prompt: str, response_model: Optional[Type[BaseModel]] = None) -> GenerateResponse:
         """Generate single mock response"""
+        import time
+
+        # Simulate generation time (10-100ms for mock)
+        start_time = time.time()
+        time.sleep(0.01 + (len(prompt) % 10) * 0.01)  # 10-100ms based on prompt length
+        gen_time = round((time.time() - start_time) * 1000, 1)
 
         if response_model and PYDANTIC_AVAILABLE:
             # Generate valid JSON for structured output
@@ -59,21 +65,25 @@ class MockProvider(BaseProvider):
             content=content,
             model=self.model,
             finish_reason="stop",
-            usage=self._calculate_mock_usage(prompt, content)
+            usage=self._calculate_mock_usage(prompt, content),
+            gen_time=gen_time
         )
 
     def _calculate_mock_usage(self, prompt: str, response: str) -> Dict[str, int]:
         """Calculate mock token usage using centralized token utilities."""
         from ..utils.token_utils import TokenUtils
 
-        prompt_tokens = TokenUtils.estimate_tokens(prompt, self.model)
-        completion_tokens = TokenUtils.estimate_tokens(response, self.model)
-        total_tokens = prompt_tokens + completion_tokens
+        input_tokens = TokenUtils.estimate_tokens(prompt, self.model)
+        output_tokens = TokenUtils.estimate_tokens(response, self.model)
+        total_tokens = input_tokens + output_tokens
 
         return {
-            "prompt_tokens": prompt_tokens,
-            "completion_tokens": completion_tokens,
-            "total_tokens": total_tokens
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "total_tokens": total_tokens,
+            # Keep legacy keys for backward compatibility
+            "prompt_tokens": input_tokens,
+            "completion_tokens": output_tokens
         }
 
     def _stream_response(self, prompt: str) -> Iterator[GenerateResponse]:
@@ -132,11 +132,16 @@ class OllamaProvider(BaseProvider):
             "model": self.model,
             "stream": stream,
             "options": {
-                "temperature": kwargs.get("temperature", 0.7),
+                "temperature": kwargs.get("temperature", self.temperature),
                 "num_predict": max_output_tokens,  # Ollama uses num_predict for max output tokens
             }
         }
 
+        # Add seed if provided (Ollama supports seed for deterministic outputs)
+        seed_value = kwargs.get("seed", self.seed)
+        if seed_value is not None:
+            payload["options"]["seed"] = seed_value
+
         # Add structured output support (Ollama native JSON schema)
         if response_model and PYDANTIC_AVAILABLE:
             json_schema = response_model.model_json_schema()
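For Ollama the seed travels inside the `options` object of the request payload, whereas the OpenAI-compatible providers (LMStudio above) place it at the top level. An illustrative payload under this change; the model name and values are placeholders:

```python
# Shape of the Ollama request after 2.4.7 when a seed is configured.
payload = {
    "model": "llama3",                                       # placeholder model name
    "stream": False,
    "options": {"temperature": 0.2, "num_predict": 2048},
}
seed_value = 42
if seed_value is not None:
    payload["options"]["seed"] = seed_value                  # seed nested under "options"
```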
@@ -220,11 +225,14 @@ class OllamaProvider(BaseProvider):
     def _single_generate(self, endpoint: str, payload: Dict[str, Any], tools: Optional[List[Dict[str, Any]]] = None) -> GenerateResponse:
         """Generate single response"""
         try:
+            # Track generation time
+            start_time = time.time()
             response = self.client.post(
                 f"{self.base_url}{endpoint}",
                 json=payload
             )
             response.raise_for_status()
+            gen_time = round((time.time() - start_time) * 1000, 1)
 
             result = response.json()
 
@@ -241,10 +249,14 @@ class OllamaProvider(BaseProvider):
                 finish_reason="stop",
                 raw_response=result,
                 usage={
+                    "input_tokens": result.get("prompt_eval_count", 0),
+                    "output_tokens": result.get("eval_count", 0),
+                    "total_tokens": result.get("prompt_eval_count", 0) + result.get("eval_count", 0),
+                    # Keep legacy keys for backward compatibility
                     "prompt_tokens": result.get("prompt_eval_count", 0),
-                    "completion_tokens": result.get("eval_count", 0),
-                    "total_tokens": result.get("prompt_eval_count", 0) + result.get("eval_count", 0)
-                }
+                    "completion_tokens": result.get("eval_count", 0)
+                },
+                gen_time=gen_time
             )
 
             # Execute tools if enabled and tools are present