abstractcore-2.4.5-py3-none-any.whl → abstractcore-2.4.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +5 -1
- abstractcore/assets/session_schema.json +1 -1
- abstractcore/core/interface.py +7 -0
- abstractcore/core/session.py +28 -3
- abstractcore/core/types.py +25 -1
- abstractcore/providers/anthropic_provider.py +20 -2
- abstractcore/providers/base.py +24 -0
- abstractcore/providers/huggingface_provider.py +44 -18
- abstractcore/providers/lmstudio_provider.py +17 -4
- abstractcore/providers/mlx_provider.py +36 -14
- abstractcore/providers/mock_provider.py +17 -7
- abstractcore/providers/ollama_provider.py +16 -4
- abstractcore/providers/openai_provider.py +18 -5
- abstractcore/tools/common_tools.py +651 -1
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/METADATA +108 -12
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/RECORD +21 -21
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/WHEEL +0 -0
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/entry_points.txt +0 -0
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/top_level.txt +0 -0
abstractcore/__init__.py
CHANGED

@@ -44,6 +44,9 @@ except ImportError:
from .processing import BasicSummarizer, SummaryStyle, SummaryLength, BasicExtractor
_has_processing = True

+# Tools module (core functionality)
+from .tools import tool
+
__all__ = [
    'create_llm',
    'BasicSession',
@@ -54,7 +57,8 @@ __all__ = [
    'MessageRole',
    'ModelNotFoundError',
    'ProviderAPIError',
-    'AuthenticationError'
+    'AuthenticationError',
+    'tool'
]

if _has_embeddings:
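Note: with `tool` now re-exported at the package root, the decorator can be imported directly from `abstractcore`. A minimal usage sketch follows; it assumes `tool` is applied as a plain function decorator, and the decorated function is hypothetical, not taken from the package:

    from abstractcore import tool

    @tool
    def get_weather(city: str) -> str:
        """Return a short weather description for a city."""
        return f"Sunny in {city}"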
abstractcore/core/interface.py
CHANGED

@@ -70,6 +70,8 @@ class AbstractCoreInterface(ABC):
max_tokens: Optional[int] = None,
max_input_tokens: Optional[int] = None,
max_output_tokens: int = 2048,
+temperature: float = 0.7,
+seed: Optional[int] = None,
debug: bool = False,
**kwargs):
self.model = model
@@ -79,6 +81,11 @@ class AbstractCoreInterface(ABC):
self.max_tokens = max_tokens
self.max_input_tokens = max_input_tokens
self.max_output_tokens = max_output_tokens
+
+# Unified generation parameters
+self.temperature = temperature
+self.seed = seed
+
self.debug = debug

# Validate token parameters
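Note: `temperature` and `seed` are now accepted by the shared provider interface constructor, so they can be set once per provider instance. A hedged sketch, assuming `create_llm` forwards keyword arguments to this constructor (provider and model names below are placeholders):

    from abstractcore import create_llm

    llm = create_llm(
        "ollama",               # placeholder provider name
        model="llama3.1:8b",    # placeholder model id
        temperature=0.2,        # unified default for this instance
        seed=42,                # unified default; warned about where unsupported
    )
    response = llm.generate(prompt="Say hello in one word.")
    print(response.content)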
abstractcore/core/session.py
CHANGED

@@ -32,8 +32,23 @@ class BasicSession:
tool_timeout: Optional[float] = None,
recovery_timeout: Optional[float] = None,
auto_compact: bool = False,
-auto_compact_threshold: int = 6000
-
+auto_compact_threshold: int = 6000,
+temperature: Optional[float] = None,
+seed: Optional[int] = None):
+"""Initialize basic session
+
+Args:
+    provider: LLM provider instance
+    system_prompt: System prompt for the session
+    tools: List of available tools
+    timeout: HTTP request timeout
+    tool_timeout: Tool execution timeout
+    recovery_timeout: Circuit breaker recovery timeout
+    auto_compact: Enable automatic conversation compaction
+    auto_compact_threshold: Token threshold for auto-compaction
+    temperature: Default temperature for generation (0.0-1.0)
+    seed: Default seed for deterministic generation
+"""

self.provider = provider
self.id = str(uuid.uuid4())
@@ -45,6 +60,10 @@ class BasicSession:
self.auto_compact_threshold = auto_compact_threshold
self._original_session = None  # Track if this is a compacted session

+# Store session-level generation parameters
+self.temperature = temperature
+self.seed = seed
+
# Optional analytics fields
self.summary = None
self.assessment = None
@@ -189,6 +208,12 @@ class BasicSession:
# Extract media parameter explicitly (fix for media parameter passing)
media = kwargs.pop('media', None)

+# Add session-level parameters if not overridden in kwargs
+if 'temperature' not in kwargs and self.temperature is not None:
+    kwargs['temperature'] = self.temperature
+if 'seed' not in kwargs and self.seed is not None:
+    kwargs['seed'] = self.seed
+
# Call provider
response = self.provider.generate(
    prompt=prompt,
@@ -735,7 +760,7 @@ class BasicSession:
"tokens_before": original_tokens,
"tokens_after": self._estimate_tokens_for_summary(summary_result.summary),
"compression_ratio": self._calculate_compression_ratio(original_tokens, summary_result.summary),
-"
+"gen_time": duration_ms
}
}

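Note: the session-level defaults are only injected when a call does not pass its own value, so per-call kwargs keep precedence. A sketch of that behaviour, assuming the method shown in the hunk above is `BasicSession.generate` and that `create_llm` builds the provider (provider/model names are placeholders):

    from abstractcore import BasicSession, create_llm

    session = BasicSession(
        provider=create_llm("openai", model="gpt-4o-mini"),  # placeholder provider/model
        temperature=0.0,  # session default
        seed=1234,        # session default
    )

    session.generate("First question")                     # uses temperature=0.0, seed=1234
    session.generate("Second question", temperature=0.9)   # per-call override wins; seed stays 1234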
abstractcore/core/types.py
CHANGED

@@ -91,6 +91,7 @@ class GenerateResponse:
usage: Optional[Dict[str, int]] = None
tool_calls: Optional[List[Dict[str, Any]]] = None
metadata: Optional[Dict[str, Any]] = None
+gen_time: Optional[float] = None  # Generation time in milliseconds

def has_tool_calls(self) -> bool:
    """Check if response contains tool calls"""
@@ -109,6 +110,29 @@ class GenerateResponse:
parts.append(f"Model: {self.model}")
if self.usage:
    parts.append(f"Tokens: {self.usage.get('total_tokens', 'unknown')}")
+if self.gen_time:
+    parts.append(f"Time: {self.gen_time:.1f}ms")
if self.tool_calls:
    parts.append(f"Tools: {len(self.tool_calls)} executed")
-return " | ".join(parts)
+return " | ".join(parts)
+
+@property
+def input_tokens(self) -> Optional[int]:
+    """Get input tokens with consistent terminology (prompt_tokens or input_tokens)."""
+    if not self.usage:
+        return None
+    return self.usage.get('input_tokens') or self.usage.get('prompt_tokens')
+
+@property
+def output_tokens(self) -> Optional[int]:
+    """Get output tokens with consistent terminology (completion_tokens or output_tokens)."""
+    if not self.usage:
+        return None
+    return self.usage.get('output_tokens') or self.usage.get('completion_tokens')
+
+@property
+def total_tokens(self) -> Optional[int]:
+    """Get total tokens."""
+    if not self.usage:
+        return None
+    return self.usage.get('total_tokens')
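Note: `gen_time` plus the `input_tokens`/`output_tokens`/`total_tokens` properties give one consistent way to read timing and usage, regardless of whether a provider reports `prompt_tokens`/`completion_tokens` or `input_tokens`/`output_tokens`. A reading sketch, where `resp` is any `GenerateResponse` from a 2.4.7 provider and `llm` is the instance from the earlier sketch:

    resp = llm.generate(prompt="Summarize this in one sentence: ...")

    if resp.gen_time is not None:
        print(f"generation took {resp.gen_time:.1f} ms")

    # Normalized accessors over provider-specific usage keys
    print(resp.input_tokens, resp.output_tokens, resp.total_tokens)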
abstractcore/providers/anthropic_provider.py
CHANGED

@@ -47,8 +47,7 @@ class AnthropicProvider(BaseProvider):
# Initialize tool handler
self.tool_handler = UniversalToolHandler(model)

-# Store configuration
-self.temperature = kwargs.get("temperature", 0.7)
+# Store provider-specific configuration
self.top_p = kwargs.get("top_p", 1.0)
self.top_k = kwargs.get("top_k", None)

@@ -132,6 +131,19 @@ class AnthropicProvider(BaseProvider):
if kwargs.get("top_k") or self.top_k:
    call_params["top_k"] = kwargs.get("top_k", self.top_k)

+# Handle seed parameter (Anthropic doesn't support seed natively)
+seed_value = kwargs.get("seed", self.seed)
+if seed_value is not None:
+    import warnings
+    warnings.warn(
+        f"Seed parameter ({seed_value}) is not supported by Anthropic Claude API. "
+        f"For deterministic outputs, use temperature=0.0 which may provide more consistent results, "
+        f"though true determinism is not guaranteed.",
+        UserWarning,
+        stacklevel=3
+    )
+    self.logger.warning(f"Seed {seed_value} requested but not supported by Anthropic API")
+
# Handle structured output using the "tool trick"
structured_tool_name = None
if response_model and PYDANTIC_AVAILABLE:
@@ -174,8 +186,14 @@ class AnthropicProvider(BaseProvider):
if stream:
    return self._stream_response(call_params, tools)
else:
+    # Track generation time
+    start_time = time.time()
    response = self.client.messages.create(**call_params)
+    gen_time = round((time.time() - start_time) * 1000, 1)
+
    formatted = self._format_response(response)
+    # Add generation time to response
+    formatted.gen_time = gen_time

# Handle tool execution for Anthropic responses
if tools and (formatted.has_tool_calls() or
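Note: Anthropic's API has no seed parameter, so a requested seed is dropped with a `UserWarning` rather than sent. A sketch of how a caller could surface that warning (the `claude` instance and its `generate` call are placeholders):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        resp = claude.generate(prompt="Pick a random number.", seed=7)

    for w in caught:
        if issubclass(w.category, UserWarning):
            print("provider note:", w.message)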
abstractcore/providers/base.py
CHANGED

@@ -570,8 +570,32 @@ class BaseProvider(AbstractCoreInterface, ABC):
result_kwargs = kwargs.copy()
result_kwargs["max_output_tokens"] = effective_max_output

+# Add unified generation parameters with fallback hierarchy: kwargs → instance → defaults
+result_kwargs["temperature"] = result_kwargs.get("temperature", self.temperature)
+if self.seed is not None:
+    result_kwargs["seed"] = result_kwargs.get("seed", self.seed)
+
return result_kwargs

+def _extract_generation_params(self, **kwargs) -> Dict[str, Any]:
+    """
+    Extract generation parameters with consistent fallback hierarchy.
+
+    Returns:
+        Dict containing temperature, seed, and other generation parameters
+    """
+    params = {}
+
+    # Temperature (always present)
+    params["temperature"] = kwargs.get("temperature", self.temperature)
+
+    # Seed (only if not None)
+    seed_value = kwargs.get("seed", self.seed)
+    if seed_value is not None:
+        params["seed"] = seed_value
+
+    return params
+
def _get_provider_max_tokens_param(self, kwargs: Dict[str, Any]) -> int:
    """
    Extract the appropriate max tokens parameter for this provider.
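Note: both helpers implement the same fallback order: an explicit per-call kwarg wins, then the instance-level value from the constructor, and seed is only emitted when a non-None value is available. A standalone sketch of that logic, outside `BaseProvider`:

    from typing import Any, Dict, Optional

    def extract_generation_params(call_kwargs: Dict[str, Any],
                                  instance_temperature: float = 0.7,
                                  instance_seed: Optional[int] = None) -> Dict[str, Any]:
        params: Dict[str, Any] = {}
        # Per-call value wins, otherwise the instance default.
        params["temperature"] = call_kwargs.get("temperature", instance_temperature)
        # Seed is only included when a non-None value is available.
        seed_value = call_kwargs.get("seed", instance_seed)
        if seed_value is not None:
            params["seed"] = seed_value
        return params

    assert extract_generation_params({}) == {"temperature": 0.7}
    assert extract_generation_params({"seed": 3}, instance_seed=42) == {"temperature": 0.7, "seed": 3}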
abstractcore/providers/huggingface_provider.py
CHANGED

@@ -68,6 +68,7 @@ class HuggingFaceProvider(BaseProvider):
# Initialize tool handler
self.tool_handler = UniversalToolHandler(model)

+# Store provider-specific configuration
self.n_gpu_layers = n_gpu_layers
self.model_type = None  # Will be "transformers" or "gguf"
self.device = device
@@ -537,14 +538,15 @@
# Generation parameters using unified system
generation_kwargs = self._prepare_generation_kwargs(**kwargs)
max_new_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-temperature = kwargs.get("temperature",
+temperature = kwargs.get("temperature", self.temperature)
top_p = kwargs.get("top_p", 0.9)
+seed_value = kwargs.get("seed", self.seed)

try:
    if stream:
-        return self._stream_generate_transformers_with_tools(input_text, max_new_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'))
+        return self._stream_generate_transformers_with_tools(input_text, max_new_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'), seed_value)
    else:
-        response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p)
+        response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p, seed_value)

# Handle tool execution for prompted models
if tools and self.tool_handler.supports_prompted and response.content:
@@ -651,11 +653,16 @@
generation_kwargs = {
    "messages": chat_messages,
    "max_tokens": max_output_tokens,  # This is max_output_tokens for llama-cpp
-    "temperature": kwargs.get("temperature",
+    "temperature": kwargs.get("temperature", self.temperature),
    "top_p": kwargs.get("top_p", 0.9),
    "stream": stream
}

+# Add seed if provided (GGUF/llama-cpp supports seed)
+seed_value = kwargs.get("seed", self.seed)
+if seed_value is not None:
+    generation_kwargs["seed"] = seed_value
+
# Handle tools - both native and prompted support
has_native_tools = False
if tools:
@@ -846,9 +853,19 @@
)

def _single_generate_transformers(self, input_text: str, max_new_tokens: int,
-                                  temperature: float, top_p: float) -> GenerateResponse:
+                                  temperature: float, top_p: float, seed: Optional[int] = None) -> GenerateResponse:
    """Generate single response using transformers (original implementation)"""
    try:
+        # Set seed for deterministic generation if provided
+        if seed is not None:
+            import torch
+            torch.manual_seed(seed)
+            if torch.cuda.is_available():
+                torch.cuda.manual_seed_all(seed)
+
+        # Track generation time
+        start_time = time.time()
+
        outputs = self.pipeline(
            input_text,
            max_new_tokens=max_new_tokens,
@@ -860,6 +877,8 @@
            truncation=True,
            return_full_text=False
        )
+
+        gen_time = round((time.time() - start_time) * 1000, 1)

        if outputs and len(outputs) > 0:
            response_text = outputs[0]['generated_text'].strip()
@@ -871,42 +890,49 @@
            content=response_text,
            model=self.model,
            finish_reason="stop",
-            usage=usage
+            usage=usage,
+            gen_time=gen_time
        )
    else:
        return GenerateResponse(
            content="",
            model=self.model,
-            finish_reason="stop"
+            finish_reason="stop",
+            gen_time=gen_time
        )

except Exception as e:
+    gen_time = round((time.time() - start_time) * 1000, 1) if 'start_time' in locals() else 0.0
    return GenerateResponse(
        content=f"Error: {str(e)}",
        model=self.model,
-        finish_reason="error"
+        finish_reason="error",
+        gen_time=gen_time
    )

def _calculate_usage(self, prompt: str, response: str) -> Dict[str, int]:
    """Calculate token usage using centralized token utilities."""
    from ..utils.token_utils import TokenUtils

-
-
-    total_tokens =
+    input_tokens = TokenUtils.estimate_tokens(prompt, self.model)
+    output_tokens = TokenUtils.estimate_tokens(response, self.model)
+    total_tokens = input_tokens + output_tokens

    return {
-        "
-        "
-        "total_tokens": total_tokens
+        "input_tokens": input_tokens,
+        "output_tokens": output_tokens,
+        "total_tokens": total_tokens,
+        # Keep legacy keys for backward compatibility
+        "prompt_tokens": input_tokens,
+        "completion_tokens": output_tokens
    }

def _stream_generate_transformers(self, input_text: str, max_new_tokens: int,
-                                  temperature: float, top_p: float, tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
+                                  temperature: float, top_p: float, tool_call_tags: Optional[str] = None, seed: Optional[int] = None) -> Iterator[GenerateResponse]:
    """Stream response using transformers (simulated, original implementation) with tool tag rewriting support"""
    try:
        # HuggingFace doesn't have native streaming, so we simulate it
-        full_response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p)
+        full_response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p, seed)

        if full_response.content:
            # Apply tool tag rewriting if enabled
@@ -1039,12 +1065,12 @@
def _stream_generate_transformers_with_tools(self, input_text: str, max_new_tokens: int,
                                             temperature: float, top_p: float,
                                             tools: Optional[List[Dict[str, Any]]] = None,
-                                             tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
+                                             tool_call_tags: Optional[str] = None, seed: Optional[int] = None) -> Iterator[GenerateResponse]:
    """Stream generate with tool execution at the end"""
    collected_content = ""

    # Stream the response content
-    for chunk in self._stream_generate_transformers(input_text, max_new_tokens, temperature, top_p, tool_call_tags):
+    for chunk in self._stream_generate_transformers(input_text, max_new_tokens, temperature, top_p, tool_call_tags, seed):
        collected_content += chunk.content
        yield chunk

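Note: for the transformers path, determinism comes from seeding torch's global RNGs before sampling; results repeat on the same hardware and library versions but are not guaranteed across setups. The pattern in isolation:

    import torch

    def seed_torch(seed: int) -> None:
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    seed_torch(42)
    # run the pipeline ... repeating with seed_torch(42) and identical parameters
    # should reproduce the same sampled tokens on the same setup.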
abstractcore/providers/lmstudio_provider.py
CHANGED

@@ -4,6 +4,7 @@ LM Studio provider implementation (OpenAI-compatible API).

import httpx
import json
+import time
from typing import List, Dict, Any, Optional, Union, Iterator, Type

try:
@@ -196,11 +197,16 @@
"model": self.model,
"messages": chat_messages,
"stream": stream,
-"temperature": kwargs.get("temperature",
+"temperature": kwargs.get("temperature", self.temperature),
"max_tokens": max_output_tokens,  # LMStudio uses max_tokens for output tokens
"top_p": kwargs.get("top_p", 0.9),
}

+# Add seed if provided (LMStudio supports seed via OpenAI-compatible API)
+seed_value = kwargs.get("seed", self.seed)
+if seed_value is not None:
+    payload["seed"] = seed_value
+
if stream:
    # Return streaming response - BaseProvider will handle tag rewriting via UnifiedStreamProcessor
    return self._stream_generate(payload)
@@ -220,12 +226,15 @@
if not hasattr(self, 'client') or self.client is None:
    raise ProviderAPIError("HTTP client not initialized")

+# Track generation time
+start_time = time.time()
response = self.client.post(
    f"{self.base_url}/chat/completions",
    json=payload,
    headers={"Content-Type": "application/json"}
)
response.raise_for_status()
+gen_time = round((time.time() - start_time) * 1000, 1)

result = response.json()

@@ -247,10 +256,14 @@
finish_reason=finish_reason,
raw_response=result,
usage={
+    "input_tokens": usage.get("prompt_tokens", 0),
+    "output_tokens": usage.get("completion_tokens", 0),
+    "total_tokens": usage.get("total_tokens", 0),
+    # Keep legacy keys for backward compatibility
    "prompt_tokens": usage.get("prompt_tokens", 0),
-    "completion_tokens": usage.get("completion_tokens", 0)
-
-
+    "completion_tokens": usage.get("completion_tokens", 0)
+},
+gen_time=gen_time
)

except AttributeError as e:
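Note: because LM Studio exposes an OpenAI-compatible API, the seed simply becomes a top-level field of the chat-completions payload. A sketch of the resulting request (base URL, port, and model id are assumptions, not taken from the diff):

    import httpx

    payload = {
        "model": "qwen2.5-7b-instruct",   # placeholder model id
        "messages": [{"role": "user", "content": "Hello"}],
        "temperature": 0.2,
        "max_tokens": 256,
        "seed": 42,                       # only present when a seed was provided
    }
    r = httpx.post("http://localhost:1234/v1/chat/completions", json=payload, timeout=60.0)
    r.raise_for_status()
    print(r.json()["choices"][0]["message"]["content"])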
abstractcore/providers/mlx_provider.py
CHANGED

@@ -189,14 +189,15 @@ class MLXProvider(BaseProvider):
# MLX generation parameters using unified system
generation_kwargs = self._prepare_generation_kwargs(**kwargs)
max_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-temperature = kwargs.get("temperature",
+temperature = kwargs.get("temperature", self.temperature)
top_p = kwargs.get("top_p", 0.9)
+seed_value = kwargs.get("seed", self.seed)

try:
    if stream:
-        return self._stream_generate_with_tools(full_prompt, max_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'))
+        return self._stream_generate_with_tools(full_prompt, max_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'), seed_value)
    else:
-        response = self._single_generate(full_prompt, max_tokens, temperature, top_p)
+        response = self._single_generate(full_prompt, max_tokens, temperature, top_p, seed_value)

# Handle tool execution for prompted models
if tools and self.tool_handler.supports_prompted and response.content:
@@ -256,9 +257,18 @@

return full_prompt

-def _single_generate(self, prompt: str, max_tokens: int, temperature: float, top_p: float) -> GenerateResponse:
+def _single_generate(self, prompt: str, max_tokens: int, temperature: float, top_p: float, seed: Optional[int] = None) -> GenerateResponse:
    """Generate single response"""

+    # Handle seed parameter (MLX supports seed via mx.random.seed)
+    if seed is not None:
+        import mlx.core as mx
+        mx.random.seed(seed)
+        self.logger.debug(f"Set MLX random seed to {seed} for deterministic generation")
+
+    # Track generation time
+    start_time = time.time()
+
    # Try different MLX API signatures
    try:
        # Try new mlx-lm API
@@ -281,6 +291,8 @@
# Fallback to basic response
response_text = prompt + " I am an AI assistant powered by MLX on Apple Silicon."

+gen_time = round((time.time() - start_time) * 1000, 1)
+
# Use the full response as-is - preserve all content including thinking
generated = response_text.strip()

@@ -288,26 +300,36 @@
content=generated,
model=self.model,
finish_reason="stop",
-usage=self._calculate_usage(prompt, generated)
+usage=self._calculate_usage(prompt, generated),
+gen_time=gen_time
)

def _calculate_usage(self, prompt: str, response: str) -> Dict[str, int]:
    """Calculate token usage using centralized token utilities."""
    from ..utils.token_utils import TokenUtils

-
-
-    total_tokens =
+    input_tokens = TokenUtils.estimate_tokens(prompt, self.model)
+    output_tokens = TokenUtils.estimate_tokens(response, self.model)
+    total_tokens = input_tokens + output_tokens

    return {
-        "
-        "
-        "total_tokens": total_tokens
+        "input_tokens": input_tokens,
+        "output_tokens": output_tokens,
+        "total_tokens": total_tokens,
+        # Keep legacy keys for backward compatibility
+        "prompt_tokens": input_tokens,
+        "completion_tokens": output_tokens
    }

-def _stream_generate(self, prompt: str, max_tokens: int, temperature: float, top_p: float, tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
+def _stream_generate(self, prompt: str, max_tokens: int, temperature: float, top_p: float, tool_call_tags: Optional[str] = None, seed: Optional[int] = None) -> Iterator[GenerateResponse]:
    """Generate real streaming response using MLX stream_generate with tool tag rewriting support"""
    try:
+        # Handle seed parameter (MLX supports seed via mx.random.seed)
+        if seed is not None:
+            import mlx.core as mx
+            mx.random.seed(seed)
+            self.logger.debug(f"Set MLX random seed to {seed} for deterministic streaming generation")
+
        # Initialize tool tag rewriter if needed
        rewriter = None
        buffer = ""
@@ -366,12 +388,12 @@
def _stream_generate_with_tools(self, full_prompt: str, max_tokens: int,
                                temperature: float, top_p: float,
                                tools: Optional[List[Dict[str, Any]]] = None,
-                                tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
+                                tool_call_tags: Optional[str] = None, seed: Optional[int] = None) -> Iterator[GenerateResponse]:
    """Stream generate with tool execution at the end"""
    collected_content = ""

    # Stream the response content
-    for chunk in self._stream_generate(full_prompt, max_tokens, temperature, top_p, tool_call_tags):
+    for chunk in self._stream_generate(full_prompt, max_tokens, temperature, top_p, tool_call_tags, seed):
        collected_content += chunk.content
        yield chunk

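Note: MLX seeds a global PRNG via `mx.random.seed`, so two generations with the same seed, prompt, and sampling parameters should match. A minimal check of that property:

    import mlx.core as mx

    mx.random.seed(7)
    a = mx.random.uniform(shape=(3,))
    mx.random.seed(7)
    b = mx.random.uniform(shape=(3,))
    assert mx.array_equal(a, b)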
abstractcore/providers/mock_provider.py
CHANGED

@@ -48,6 +48,12 @@ class MockProvider(BaseProvider):

def _single_response(self, prompt: str, response_model: Optional[Type[BaseModel]] = None) -> GenerateResponse:
    """Generate single mock response"""
+    import time
+
+    # Simulate generation time (10-100ms for mock)
+    start_time = time.time()
+    time.sleep(0.01 + (len(prompt) % 10) * 0.01)  # 10-100ms based on prompt length
+    gen_time = round((time.time() - start_time) * 1000, 1)

    if response_model and PYDANTIC_AVAILABLE:
        # Generate valid JSON for structured output
@@ -59,21 +65,25 @@
content=content,
model=self.model,
finish_reason="stop",
-usage=self._calculate_mock_usage(prompt, content)
+usage=self._calculate_mock_usage(prompt, content),
+gen_time=gen_time
)

def _calculate_mock_usage(self, prompt: str, response: str) -> Dict[str, int]:
    """Calculate mock token usage using centralized token utilities."""
    from ..utils.token_utils import TokenUtils

-
-
-    total_tokens =
+    input_tokens = TokenUtils.estimate_tokens(prompt, self.model)
+    output_tokens = TokenUtils.estimate_tokens(response, self.model)
+    total_tokens = input_tokens + output_tokens

    return {
-        "
-        "
-        "total_tokens": total_tokens
+        "input_tokens": input_tokens,
+        "output_tokens": output_tokens,
+        "total_tokens": total_tokens,
+        # Keep legacy keys for backward compatibility
+        "prompt_tokens": input_tokens,
+        "completion_tokens": output_tokens
    }

def _stream_response(self, prompt: str) -> Iterator[GenerateResponse]:
abstractcore/providers/ollama_provider.py
CHANGED

@@ -132,11 +132,16 @@ class OllamaProvider(BaseProvider):
"model": self.model,
"stream": stream,
"options": {
-    "temperature": kwargs.get("temperature",
+    "temperature": kwargs.get("temperature", self.temperature),
    "num_predict": max_output_tokens,  # Ollama uses num_predict for max output tokens
}
}

+# Add seed if provided (Ollama supports seed for deterministic outputs)
+seed_value = kwargs.get("seed", self.seed)
+if seed_value is not None:
+    payload["options"]["seed"] = seed_value
+
# Add structured output support (Ollama native JSON schema)
if response_model and PYDANTIC_AVAILABLE:
    json_schema = response_model.model_json_schema()
@@ -220,11 +225,14 @@
def _single_generate(self, endpoint: str, payload: Dict[str, Any], tools: Optional[List[Dict[str, Any]]] = None) -> GenerateResponse:
    """Generate single response"""
    try:
+        # Track generation time
+        start_time = time.time()
        response = self.client.post(
            f"{self.base_url}{endpoint}",
            json=payload
        )
        response.raise_for_status()
+        gen_time = round((time.time() - start_time) * 1000, 1)

        result = response.json()

@@ -241,10 +249,14 @@
finish_reason="stop",
raw_response=result,
usage={
+    "input_tokens": result.get("prompt_eval_count", 0),
+    "output_tokens": result.get("eval_count", 0),
+    "total_tokens": result.get("prompt_eval_count", 0) + result.get("eval_count", 0),
+    # Keep legacy keys for backward compatibility
    "prompt_tokens": result.get("prompt_eval_count", 0),
-    "completion_tokens": result.get("eval_count", 0)
-
-
+    "completion_tokens": result.get("eval_count", 0)
+},
+gen_time=gen_time
)

# Execute tools if enabled and tools are present