isa-model 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/client.py +732 -565
- isa_model/core/cache/redis_cache.py +401 -0
- isa_model/core/config/config_manager.py +53 -10
- isa_model/core/config.py +1 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/migrations.py +277 -0
- isa_model/core/database/supabase_client.py +123 -0
- isa_model/core/models/__init__.py +37 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +36 -18
- isa_model/core/models/model_repo.py +44 -38
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +101 -370
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +7 -0
- isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
- isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
- isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/core/deployment_manager.py +6 -4
- isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
- isa_model/eval/benchmarks/__init__.py +27 -0
- isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
- isa_model/eval/benchmarks.py +244 -12
- isa_model/eval/evaluators/__init__.py +8 -2
- isa_model/eval/evaluators/audio_evaluator.py +727 -0
- isa_model/eval/evaluators/embedding_evaluator.py +742 -0
- isa_model/eval/evaluators/vision_evaluator.py +564 -0
- isa_model/eval/example_evaluation.py +395 -0
- isa_model/eval/factory.py +272 -5
- isa_model/eval/isa_benchmarks.py +700 -0
- isa_model/eval/isa_integration.py +582 -0
- isa_model/eval/metrics.py +159 -6
- isa_model/eval/tests/unit/test_basic.py +396 -0
- isa_model/inference/ai_factory.py +44 -8
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +32 -6
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/base_llm_service.py +30 -6
- isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
- isa_model/inference/services/llm/ollama_llm_service.py +2 -1
- isa_model/inference/services/llm/openai_llm_service.py +652 -55
- isa_model/inference/services/llm/yyds_llm_service.py +2 -1
- isa_model/inference/services/vision/__init__.py +5 -5
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/helpers/image_utils.py +11 -5
- isa_model/inference/services/vision/isa_vision_service.py +573 -0
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/serving/api/fastapi_server.py +88 -16
- isa_model/serving/api/middleware/auth.py +311 -0
- isa_model/serving/api/middleware/security.py +278 -0
- isa_model/serving/api/routes/analytics.py +486 -0
- isa_model/serving/api/routes/deployments.py +339 -0
- isa_model/serving/api/routes/evaluations.py +579 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/unified.py +324 -165
- isa_model/serving/api/startup.py +304 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/training/__init__.py +100 -6
- isa_model/training/core/__init__.py +4 -1
- isa_model/training/examples/intelligent_training_example.py +281 -0
- isa_model/training/intelligent/__init__.py +25 -0
- isa_model/training/intelligent/decision_engine.py +643 -0
- isa_model/training/intelligent/intelligent_factory.py +888 -0
- isa_model/training/intelligent/knowledge_base.py +751 -0
- isa_model/training/intelligent/resource_optimizer.py +839 -0
- isa_model/training/intelligent/task_classifier.py +576 -0
- isa_model/training/storage/__init__.py +24 -0
- isa_model/training/storage/core_integration.py +439 -0
- isa_model/training/storage/training_repository.py +552 -0
- isa_model/training/storage/training_storage.py +628 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
- isa_model-0.4.0.dist-info/RECORD +182 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model-0.3.9.dist-info/RECORD +0 -138
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
```diff
--- a/isa_model/inference/services/llm/openai_llm_service.py
+++ b/isa_model/inference/services/llm/openai_llm_service.py
@@ -1,6 +1,7 @@
 import logging
 import os
 import json
+import asyncio
 from typing import Dict, Any, List, Union, AsyncGenerator, Optional, Callable
 
 # Use the official OpenAI library
```
```diff
@@ -17,9 +18,16 @@ class OpenAILLMService(BaseLLMService):
     def __init__(self, model_name: str = "gpt-4o-mini", provider_name: str = "openai", **kwargs):
         super().__init__(provider_name, model_name, **kwargs)
 
+        # Check if this is an O-series reasoning model
+        self.is_reasoning_model = model_name.startswith("o4-") or model_name.startswith("o3-")
+        self.supports_deep_research = "deep-search" in model_name or "deep-research" in model_name
+
         # Get configuration from centralized config manager
         provider_config = self.get_provider_config()
 
+        # Check if reasoning summary is enabled (requires verified organization)
+        self.enable_reasoning_summary = provider_config.get("enable_reasoning_summary", False)
+
         # Initialize AsyncOpenAI client with provider configuration
         try:
             if not provider_config.get("api_key"):
```
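The constructor now derives both capability flags purely from the model name. A minimal standalone sketch of that check (the `classify_model` helper is hypothetical; the package sets the flags inline in `__init__`):

```python
# Standalone sketch mirroring the two flag checks added in __init__ above.
# `classify_model` is a hypothetical helper name used for illustration only.
def classify_model(model_name: str) -> dict:
    return {
        "is_reasoning_model": model_name.startswith("o4-") or model_name.startswith("o3-"),
        "supports_deep_research": "deep-search" in model_name or "deep-research" in model_name,
    }

print(classify_model("o4-mini"))           # reasoning model, no deep research
print(classify_model("o3-deep-research"))  # reasoning model with deep research
print(classify_model("gpt-4o-mini"))       # neither flag set
```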
```diff
@@ -40,11 +48,40 @@
         self.last_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
         self.total_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "requests_count": 0}
 
+        # For O-series models, track reasoning tokens separately
+        if self.is_reasoning_model:
+            self.last_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] = 0
+
 
     def _create_bound_copy(self) -> 'OpenAILLMService':
         """Create a copy of this service for tool binding"""
-
-        bound_service
+        # Create new instance but bypass full initialization
+        bound_service = object.__new__(OpenAILLMService)
+
+        # Copy all essential attributes from original service
+        bound_service.model_name = self.model_name
+        bound_service.provider_name = self.provider_name
+        bound_service.client = self.client  # Reuse the same OpenAI client
+        bound_service.last_token_usage = self.last_token_usage.copy()
+        bound_service.total_token_usage = self.total_token_usage.copy()
+        bound_service._bound_tools = self._bound_tools.copy() if self._bound_tools else []
+        bound_service.adapter_manager = self.adapter_manager  # Reuse adapter manager
+
+        # Copy OpenAI-specific attributes
+        bound_service.is_reasoning_model = self.is_reasoning_model
+        bound_service.supports_deep_research = self.supports_deep_research
+
+        # Copy base class attributes
+        bound_service.streaming = self.streaming
+        bound_service.max_tokens = self.max_tokens
+        bound_service.temperature = self.temperature
+        bound_service._tool_mappings = {}
+
+        # Copy BaseService attributes that are needed
+        bound_service.config_manager = self.config_manager
+        bound_service.model_manager = self.model_manager
+
         return bound_service
 
     def bind_tools(self, tools: List[Any], **kwargs) -> 'OpenAILLMService':
```
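For reference, a hedged usage sketch of the tool-binding path above: `bind_tools()` hands back a copied service (built by `_create_bound_copy`) that shares the underlying OpenAI client but carries its own bound tool list. The weather tool and the assumption that plain callables are accepted are illustrative; an OpenAI API key must already be configured for the constructor to succeed.

```python
# Hypothetical usage sketch for bind_tools(); assumes the package is
# installed and an OpenAI API key is configured for the provider.
from isa_model.inference.services.llm.openai_llm_service import OpenAILLMService

def get_weather(city: str) -> str:
    """Toy tool: return a canned weather report."""
    return f"It is sunny in {city}."

service = OpenAILLMService(model_name="gpt-4o-mini")
bound = service.bind_tools([get_weather])

assert bound is not service            # bind_tools returns a bound copy
assert bound.client is service.client  # the AsyncOpenAI client is shared
```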
```diff
@@ -66,16 +103,133 @@
 
         return bound_service
 
-    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any]) -> AsyncGenerator[str, None]:
+    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
         """
         True streaming method - yields tokens one by one as they arrive
 
         Args:
             input_data: Same as ainvoke
+            show_reasoning: If True and model supports it, show reasoning process using Responses API
 
         Yields:
-            Individual tokens as they arrive from the API
+            Individual tokens as they arrive from the API, plus final result object with tool_calls
         """
+        try:
+            # Determine which API to use for streaming
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
+            if use_responses_api:
+                logger.info(f"Using Responses API streaming for {self.model_name}")
+                # Use Responses API streaming
+                async for chunk in self._astream_responses_api(input_data, show_reasoning):
+                    yield chunk
+            else:
+                logger.debug(f"Using Chat Completions API streaming for {self.model_name}")
+                # Use Chat Completions API streaming
+                async for chunk in self._astream_chat_completions_api(input_data):
+                    yield chunk
+
+        except Exception as e:
+            logger.error(f"Error in astream: {e}")
+            raise
+
+    async def _astream_responses_api(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Responses API for reasoning models and deep research models"""
+        try:
+            # Use adapter manager to prepare messages
+            messages = self._prepare_messages(input_data)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages,  # Responses API uses 'input' instead of 'messages'
+                "stream": True
+            }
+
+            # Responses API uses max_output_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Add reasoning configuration if needed (optional - requires verified organization)
+            if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                kwargs["reasoning"] = {"summary": "auto"}
+                logger.info("Reasoning summary enabled - using verified organization features")
+            elif show_reasoning and self.is_reasoning_model:
+                logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+            # Deep research models require web_search_preview tool
+            if self.supports_deep_research:
+                kwargs["tools"] = [{"type": "web_search_preview"}]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Stream using Responses API
+            content_chunks = []
+            reasoning_items = []
+
+            try:
+                logger.info(f"Streaming with Responses API for model {self.model_name}")
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    # Handle different event types from Responses API
+                    if event.type == 'response.output_text.delta':
+                        # Stream text content
+                        if event.delta:
+                            content_chunks.append(event.delta)
+                            yield event.delta
+
+                    elif event.type == 'response.reasoning.delta' and show_reasoning:
+                        # Stream reasoning content (if enabled)
+                        if hasattr(event, 'delta') and event.delta:
+                            yield f"[Thinking: {event.delta}]"
+
+                    elif event.type == 'response.output_item.done':
+                        # Handle completed items (reasoning, function calls, etc.)
+                        if hasattr(event, 'item'):
+                            if event.item.type == 'reasoning':
+                                reasoning_items.append(event.item)
+                            elif event.item.type == 'function_call':
+                                # Handle function call completion
+                                logger.debug(f"Function call completed: {event.item}")
+
+                # Create final response object
+                full_content = "".join(content_chunks)
+
+                # Track usage for streaming
+                self._track_streaming_usage(messages, full_content)
+
+                # Get billing info
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Format final result
+                final_result = self._format_response(full_content, input_data)
+
+                # Yield final result with metadata
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "reasoning_items": len(reasoning_items),
+                    "api_used": "responses"
+                }
+
+            except Exception as e:
+                logger.error(f"Error in Responses API streaming: {e}")
+                raise
+
+        except Exception as e:
+            logger.error(f"Error in _astream_responses_api: {e}")
+            raise
+
+    async def _astream_chat_completions_api(self, input_data: Union[str, List[Dict[str, str]], Any]) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Chat Completions API for standard models"""
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)
```
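The reworked `astream()` now yields plain string tokens followed by a single dict carrying the adapter-formatted result, billing info, and which API served the request. A minimal consumer sketch, assuming the package is installed and an OpenAI key is configured:

```python
import asyncio

from isa_model.inference.services.llm.openai_llm_service import OpenAILLMService

async def main() -> None:
    service = OpenAILLMService(model_name="gpt-4o-mini")
    async for chunk in service.astream("Summarize this release in one sentence."):
        if isinstance(chunk, str):
            # Token-by-token text as it arrives
            print(chunk, end="", flush=True)
        elif isinstance(chunk, dict) and "result" in chunk:
            # Final structured payload emitted after the stream ends
            print("\nAPI used:", chunk["api_used"])
            print("Billing:", chunk["billing"])
    await service.close()

asyncio.run(main())
```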
```diff
@@ -85,86 +239,264 @@ class OpenAILLMService(BaseLLMService):
             kwargs = {
                 "model": self.model_name,
                 "messages": messages,
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024),
                 "stream": True
             }
 
+            # O4 models only support temperature=1 (default)
+            if not self.is_reasoning_model:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.is_reasoning_model:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
                 kwargs["tool_choice"] = "auto"
 
-            # Stream tokens
+            # Stream tokens and detect tool calls
             content_chunks = []
+            tool_calls_accumulator = {}  # Track complete tool calls by ID
+            has_tool_calls = False
+
             try:
                 stream = await self.client.chat.completions.create(**kwargs)
                 async for chunk in stream:
-
-
-
-
+                    delta = chunk.choices[0].delta
+
+                    # Check for tool calls first
+                    if hasattr(delta, 'tool_calls') and delta.tool_calls:
+                        has_tool_calls = True
+                        for tool_call in delta.tool_calls:
+                            tool_index = getattr(tool_call, 'index', 0)  # OpenAI uses index for streaming
+
+                            # Use index as key since streaming tool calls use index
+                            tool_key = f"tool_{tool_index}"
+
+                            # Initialize tool call if not seen before
+                            if tool_key not in tool_calls_accumulator:
+                                tool_calls_accumulator[tool_key] = {
+                                    'id': getattr(tool_call, 'id', f"call_{tool_index}"),
+                                    'type': 'function',
+                                    'function': {
+                                        'name': '',
+                                        'arguments': ''
+                                    }
+                                }
+
+                            # Accumulate function name
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'name') and tool_call.function.name:
+                                tool_calls_accumulator[tool_key]['function']['name'] += tool_call.function.name
+
+                            # Accumulate function arguments
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'arguments'):
+                                if tool_call.function.arguments:
+                                    tool_calls_accumulator[tool_key]['function']['arguments'] += tool_call.function.arguments
+
+                    # Handle regular content - only stream if no tool calls detected
+                    elif delta.content:
+                        content_chunks.append(delta.content)
+                        if not has_tool_calls:  # Only yield content if no tool calls
+                            yield delta.content
+
+                # Always yield final result at the end
+                # - If has tool_calls: complete structured response (no prior streaming)
+                # - If no tool_calls: AIMessage after streaming content
+
+                # Create a mock message object for adapter processing
+                class MockMessage:
+                    def __init__(self):
+                        self.content = "".join(content_chunks) or ""
+                        self.tool_calls = []
+                        # Add tool_calls if any
+                        if tool_calls_accumulator:
+                            for tool_data in tool_calls_accumulator.values():
+                                mock_tool_call = type('MockToolCall', (), {
+                                    'id': tool_data['id'],
+                                    'function': type('MockFunction', (), {
+                                        'name': tool_data['function']['name'],
+                                        'arguments': tool_data['function']['arguments']
+                                    })()
+                                })()
+                                self.tool_calls.append(mock_tool_call)
+
+                mock_message = MockMessage()
+
+                logger.debug(f"Streaming complete - tool calls collected: {len(mock_message.tool_calls)}")
+                for i, tc in enumerate(mock_message.tool_calls):
+                    logger.debug(f"  Tool call {i+1}: {tc.function.name} with args: {tc.function.arguments}")
+
+                # Format response using adapter (this handles LangChain conversion)
+                final_result = self._format_response(mock_message, input_data)
+
+                logger.debug(f"Final result type after adapter: {type(final_result)}")
+                logger.debug(f"Final result has tool_calls: {hasattr(final_result, 'tool_calls')}")
 
                 # Track usage after streaming is complete
                 full_content = "".join(content_chunks)
                 self._track_streaming_usage(messages, full_content)
 
+                # Get billing info after tracking (wait a moment for billing to be recorded)
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Yield the final result with billing info
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "api_used": "chat_completions"
+                }
+
             except Exception as e:
-                logger.error(f"Error in streaming: {e}")
+                logger.error(f"Error in Chat Completions streaming: {e}")
                 raise
 
         except Exception as e:
-            logger.error(f"Error in
+            logger.error(f"Error in _astream_chat_completions_api: {e}")
             raise
 
-    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any]) -> Union[str, Any]:
-        """
+    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False) -> Union[str, Any]:
+        """
+        Unified invoke method for all input types
+
+        Args:
+            input_data: Input messages or text
+            show_reasoning: If True and model supports it, show reasoning process using Responses API
+        """
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)
 
+            # Determine which API to use
+            # Responses API is required for:
+            # 1. Reasoning models with show_reasoning=True
+            # 2. Deep research models (they only work with Responses API)
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
             # Prepare request kwargs
             provider_config = self.get_provider_config()
             kwargs = {
                 "model": self.model_name,
-                "messages": messages
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
+                "messages": messages
             }
 
+            # O4 models only support temperature=1 (default)
+            if not self.is_reasoning_model:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.is_reasoning_model:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
-
+                if not use_responses_api:  # Responses API handles tool choice differently
+                    kwargs["tool_choice"] = "auto"
 
             # Handle streaming vs non-streaming
             if self.streaming:
                 # TRUE STREAMING MODE - collect all chunks from the stream
                 content_chunks = []
-                async for token in self.astream(input_data):
-
-
+                async for token in self.astream(input_data, show_reasoning=show_reasoning):
+                    if isinstance(token, str):
+                        content_chunks.append(token)
+                    elif isinstance(token, dict) and "result" in token:
+                        # Return the final result from streaming
+                        return token["result"]
 
+                # Fallback: join collected content
+                content = "".join(content_chunks)
                 return self._format_response(content, input_data)
             else:
-                # Non-streaming mode
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Non-streaming mode - choose API based on reasoning visibility
+                if use_responses_api:
+                    logger.info(f"Using Responses API for model {self.model_name}")
+
+                    # Convert kwargs for Responses API
+                    responses_kwargs = {
+                        "model": kwargs["model"],
+                        "input": kwargs["messages"]  # Responses API uses 'input' instead of 'messages'
+                    }
+
+                    # Handle max tokens parameter
+                    if "max_completion_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_completion_tokens"]
+                    elif "max_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_tokens"]
+
+                    # Add tools if present
+                    if "tools" in kwargs:
+                        responses_kwargs["tools"] = kwargs["tools"]
+
+                    # Add reasoning configuration for reasoning models (requires verified organization)
+                    if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                        responses_kwargs["reasoning"] = {"summary": "auto"}
+                        logger.info("Reasoning summary enabled - using verified organization features")
+                    elif show_reasoning and self.is_reasoning_model:
+                        logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+                    # Deep research models require web_search_preview tool
+                    if self.supports_deep_research:
+                        if "tools" not in responses_kwargs:
+                            responses_kwargs["tools"] = []
+                        responses_kwargs["tools"].insert(0, {"type": "web_search_preview"})
+
+                    response = await self.client.responses.create(**responses_kwargs)
+
+                    # Handle Responses API format
+                    if hasattr(response, 'output_text'):
+                        # Modern Responses API format
+                        content = response.output_text
+                        usage_info = getattr(response, 'usage', None)
+                    elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                        # Legacy format
+                        content = response.body.response
+                        usage_info = getattr(response.body, 'usage', None)
+                    else:
+                        # Fallback handling
+                        content = str(response)
+                        usage_info = None
+
+                    # Update usage tracking if available
+                    if usage_info:
+                        self._update_token_usage(usage_info)
+                        await self._track_billing(usage_info)
+
+                    return self._format_response(content, input_data)
+                else:
+                    # Standard Chat Completions API
+                    response = await self.client.chat.completions.create(**kwargs)
+                    message = response.choices[0].message
+
+                    # Debug: Log the raw OpenAI response
+                    logger.debug(f"OpenAI response message: {message}")
+                    if message.tool_calls:
+                        logger.debug(f"Tool calls found: {len(message.tool_calls)}")
+                        for i, tc in enumerate(message.tool_calls):
+                            logger.debug(f"  Tool call {i+1}: id={tc.id}, function={tc.function.name}, args={tc.function.arguments}")
+
+                    # Update usage tracking
+                    if response.usage:
+                        self._update_token_usage(response.usage)
+                        await self._track_billing(response.usage)
+
+                    # Handle tool calls if present - let adapter process the complete message
+                    if message.tool_calls:
+                        # Pass the complete message object to adapter for proper tool_calls handling
+                        return self._format_response(message, input_data)
+
+                    # Return appropriate format based on input type
+                    return self._format_response(message.content or "", input_data)
 
         except Exception as e:
             logger.error(f"Error in ainvoke: {e}")
```
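A hedged invocation sketch for the reworked `ainvoke()`: on an O-series model, `show_reasoning=True` routes the call through `client.responses.create()`, while standard models stay on Chat Completions (and only they receive a `temperature`/`max_tokens` pair). Assumes the package is installed and an OpenAI key is configured.

```python
import asyncio

from isa_model.inference.services.llm.openai_llm_service import OpenAILLMService

async def main() -> None:
    # "o4-mini" starts with "o4-", so is_reasoning_model is True and the
    # request is sent with max_completion_tokens instead of max_tokens.
    reasoning_service = OpenAILLMService(model_name="o4-mini")
    answer = await reasoning_service.ainvoke(
        "How many prime numbers are there below 50?",
        show_reasoning=True,  # switches to the Responses API path
    )
    print(answer)
    await reasoning_service.close()

asyncio.run(main())
```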
```diff
@@ -210,11 +542,42 @@ class OpenAILLMService(BaseLLMService):
 
     def _update_token_usage(self, usage):
         """Update token usage statistics"""
-
-
-
-
-
+        # Handle different usage object structures (Chat Completions vs Responses API)
+        if hasattr(usage, 'prompt_tokens'):
+            # Chat Completions API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.prompt_tokens,
+                "completion_tokens": usage.completion_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        elif hasattr(usage, 'input_tokens'):
+            # Responses API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.input_tokens,
+                "completion_tokens": usage.output_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        else:
+            # Fallback for unknown usage format
+            logger.warning(f"Unknown usage format: {type(usage)}, attributes: {dir(usage)}")
+            self.last_token_usage = {
+                "prompt_tokens": 0,
+                "completion_tokens": 0,
+                "total_tokens": 0
+            }
+
+        # For O-series models, track reasoning tokens if available
+        if self.is_reasoning_model:
+            reasoning_tokens = 0
+            if hasattr(usage, 'reasoning_tokens'):
+                reasoning_tokens = usage.reasoning_tokens
+            elif hasattr(usage, 'output_tokens_details') and hasattr(usage.output_tokens_details, 'reasoning_tokens'):
+                reasoning_tokens = usage.output_tokens_details.reasoning_tokens
+
+            self.last_token_usage["reasoning_tokens"] = reasoning_tokens
+            if "reasoning_tokens" not in self.total_token_usage:
+                self.total_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] += reasoning_tokens
 
         # Update total usage
         self.total_token_usage["prompt_tokens"] += self.last_token_usage["prompt_tokens"]
```
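`_update_token_usage()` now accepts both usage object shapes. A self-contained sketch of that normalization (the `normalize_usage` helper is illustrative only; the service stores the result on `self.last_token_usage`):

```python
# Chat Completions usage exposes prompt_tokens/completion_tokens, while
# Responses API usage exposes input_tokens/output_tokens; both map onto
# one common dict shape, mirroring the hunk above.
from types import SimpleNamespace

def normalize_usage(usage) -> dict:
    if hasattr(usage, "prompt_tokens"):   # Chat Completions API format
        return {"prompt_tokens": usage.prompt_tokens,
                "completion_tokens": usage.completion_tokens,
                "total_tokens": usage.total_tokens}
    if hasattr(usage, "input_tokens"):    # Responses API format
        return {"prompt_tokens": usage.input_tokens,
                "completion_tokens": usage.output_tokens,
                "total_tokens": usage.total_tokens}
    return {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}

chat_usage = SimpleNamespace(prompt_tokens=12, completion_tokens=30, total_tokens=42)
responses_usage = SimpleNamespace(input_tokens=12, output_tokens=30, total_tokens=42)
assert normalize_usage(chat_usage) == normalize_usage(responses_usage)
```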
```diff
@@ -225,15 +588,35 @@ class OpenAILLMService(BaseLLMService):
     async def _track_billing(self, usage):
         """Track billing information"""
         provider_config = self.get_provider_config()
+
+        # Prepare metadata for tracking
+        metadata = {
+            "temperature": provider_config.get("temperature", 0.7),
+            "max_tokens": provider_config.get("max_tokens", 1024),
+            "is_reasoning_model": self.is_reasoning_model
+        }
+
+        # Add reasoning tokens if available for O-series models
+        if self.is_reasoning_model and hasattr(usage, 'reasoning_tokens'):
+            metadata["reasoning_tokens"] = usage.reasoning_tokens
+
+        # Get tokens using the same logic as _update_token_usage
+        if hasattr(usage, 'prompt_tokens'):
+            input_tokens = usage.prompt_tokens
+            output_tokens = usage.completion_tokens
+        elif hasattr(usage, 'input_tokens'):
+            input_tokens = usage.input_tokens
+            output_tokens = usage.output_tokens
+        else:
+            input_tokens = 0
+            output_tokens = 0
+
         await self._track_usage(
             service_type=ServiceType.LLM,
             operation="chat",
-            input_tokens=
-            output_tokens=
-            metadata=
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
-            }
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            metadata=metadata
         )
 
     def get_token_usage(self) -> Dict[str, Any]:
```
```diff
@@ -252,14 +635,18 @@ class OpenAILLMService(BaseLLMService):
             "max_tokens": provider_config.get("max_tokens", 1024),
             "supports_streaming": True,
             "supports_functions": True,
-            "
+            "supports_reasoning": self.is_reasoning_model,
+            "supports_deep_research": self.supports_deep_research,
+            "provider": "openai",
+            "model_type": "reasoning" if self.is_reasoning_model else "standard"
         }
 
 
     async def chat(
         self,
         input_data: Union[str, List[Dict[str, str]], Any],
-        max_tokens: Optional[int] = None
+        max_tokens: Optional[int] = None,
+        show_reasoning: bool = False
     ) -> Dict[str, Any]:
         """
         Chat method that wraps ainvoke for compatibility with base class
```
```diff
@@ -267,13 +654,14 @@ class OpenAILLMService(BaseLLMService):
         Args:
             input_data: Input messages
             max_tokens: Maximum tokens to generate
+            show_reasoning: Whether to show reasoning process (for O4 models)
 
         Returns:
             Dict containing chat response with properly formatted message object
         """
         try:
-            # Call ainvoke
-            response = await self.ainvoke(input_data)
+            # Call ainvoke with show_reasoning parameter
+            response = await self.ainvoke(input_data, show_reasoning=show_reasoning)
 
             # Return the response as-is (adapter already formatted it correctly)
             # For LangChain inputs, this will be an AIMessage object
```
```diff
@@ -284,7 +672,9 @@ class OpenAILLMService(BaseLLMService):
                 "metadata": {
                     "model": self.model_name,
                     "provider": self.provider_name,
-                    "max_tokens": max_tokens or self.max_tokens
+                    "max_tokens": max_tokens or self.max_tokens,
+                    "show_reasoning": show_reasoning,
+                    "is_reasoning_model": self.is_reasoning_model
                 }
             }
         except Exception as e:
```
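For completeness, a hedged sketch of the extended `chat()` wrapper; the metadata fields read here are the ones added in the hunk above, and the call again assumes a configured OpenAI key.

```python
import asyncio

from isa_model.inference.services.llm.openai_llm_service import OpenAILLMService

async def main() -> None:
    service = OpenAILLMService(model_name="o3-mini")
    response = await service.chat(
        "Give a one-line definition of retrieval-augmented generation.",
        show_reasoning=True,
    )
    meta = response["metadata"]
    print(meta["model"], meta["is_reasoning_model"], meta["show_reasoning"])
    await service.close()

asyncio.run(main())
```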
```diff
@@ -299,6 +689,213 @@ class OpenAILLMService(BaseLLMService):
                 }
             }
 
+    async def deep_research(
+        self,
+        input_data: Union[str, List[Dict[str, str]], Any],
+        research_type: Optional[str] = None,
+        search_enabled: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Deep research task - designed for deep research models, using the OpenAI Responses API
+
+        Args:
+            input_data: Research query or question
+            research_type: Research type (academic, market, competitive, etc.)
+            search_enabled: Whether to enable web search
+
+        Returns:
+            Dict containing research results
+        """
+        if not self.supports_deep_research:
+            # Fallback to regular chat for non-deep-research models
+            logger.info(f"Model {self.model_name} doesn't support deep research, falling back to regular chat")
+            return await self.chat(input_data)
+
+        try:
+            # Prepare messages with research context
+            messages = self._prepare_messages(input_data)
+
+            # Add research-specific system prompt if research_type is specified
+            if research_type and messages:
+                research_prompts = {
+                    "academic": "You are conducting academic research. Please provide thorough, well-sourced analysis with proper citations and methodical reasoning.",
+                    "market": "You are conducting market research. Focus on market trends, competitive analysis, and business insights.",
+                    "competitive": "You are conducting competitive analysis. Compare and contrast different approaches, solutions, or entities.",
+                    "technical": "You are conducting technical research. Provide detailed technical analysis with implementation considerations."
+                }
+
+                if research_type in research_prompts:
+                    # Insert system message at the beginning
+                    system_msg = {"role": "system", "content": research_prompts[research_type]}
+                    if messages[0].get("role") == "system":
+                        messages[0]["content"] = research_prompts[research_type] + "\n\n" + messages[0]["content"]
+                    else:
+                        messages.insert(0, system_msg)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages  # Responses API uses 'input' instead of 'messages'
+            }
+
+            # Responses API uses max_output_tokens instead of max_completion_tokens
+            max_tokens_value = provider_config.get("max_tokens", 4096)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Deep research models require web_search_preview tool when search is enabled
+            if search_enabled:
+                kwargs["tools"] = [
+                    {
+                        "type": "web_search_preview"
+                    }
+                ]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Check if streaming is enabled
+            if self.streaming:
+                # Use streaming mode for deep research
+                logger.info(f"Using Responses API streaming for deep research model {self.model_name}")
+                kwargs["stream"] = True
+
+                content_chunks = []
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    if event.type == 'response.output_text.delta':
+                        if event.delta:
+                            content_chunks.append(event.delta)
+
+                message_content = "".join(content_chunks)
+
+                # Track estimated usage for streaming
+                messages = self._prepare_messages(input_data)
+                self._track_streaming_usage(messages, message_content)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+            else:
+                # Use non-streaming mode for deep research
+                logger.info(f"Using Responses API for deep research model {self.model_name}")
+                response = await self.client.responses.create(**kwargs)
+
+                # Extract the response content from Responses API format
+                if hasattr(response, 'output_text'):
+                    # Modern Responses API format
+                    message_content = response.output_text
+                    usage_info = getattr(response, 'usage', None)
+                elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                    # Legacy Responses API format
+                    message_content = response.body.response
+                    usage_info = getattr(response.body, 'usage', None)
+                elif hasattr(response, 'choices') and response.choices:
+                    # Fallback to standard format
+                    message_content = response.choices[0].message.content
+                    usage_info = getattr(response, 'usage', None)
+                else:
+                    # Handle unexpected format
+                    message_content = str(response)
+                    usage_info = None
+
+                # Update usage tracking if available
+                if usage_info:
+                    self._update_token_usage(usage_info)
+                    await self._track_billing(usage_info)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+
+            return {
+                "result": formatted_response,
+                "research_type": research_type,
+                "search_enabled": search_enabled,
+                "success": True,
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "supports_deep_research": self.supports_deep_research,
+                    "reasoning_model": self.is_reasoning_model,
+                    "api_used": "responses"
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"Deep research failed: {e}")
+            return {
+                "result": None,
+                "success": False,
+                "error": str(e),
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "api_used": "responses"
+                }
+            }
+
     async def close(self):
         """Close the backend client"""
-        await self.client.close()
+        await self.client.close()
+
+    def _get_streaming_billing_info(self) -> Dict[str, Any]:
+        """Get billing information for streaming requests"""
+        try:
+            # Check if service has model_manager with billing_tracker
+            if hasattr(self, 'model_manager') and hasattr(self.model_manager, 'billing_tracker'):
+                billing_tracker = self.model_manager.billing_tracker
+
+                # Get the latest usage record for this model
+                model_records = [
+                    record for record in billing_tracker.usage_records
+                    if record.model_id == self.model_name
+                ]
+
+                if model_records:
+                    # Get the most recent record
+                    latest_record = max(model_records, key=lambda r: r.timestamp)
+
+                    return {
+                        "cost_usd": latest_record.cost_usd,
+                        "input_tokens": latest_record.input_tokens,
+                        "output_tokens": latest_record.output_tokens,
+                        "total_tokens": latest_record.total_tokens,
+                        "operation": latest_record.operation,
+                        "timestamp": latest_record.timestamp,
+                        "currency": "USD"
+                    }
+
+            # Fallback: use last token usage with estimated cost
+            last_usage = self.get_last_token_usage()
+            estimated_cost = 0.0
+
+            if hasattr(self, 'model_manager'):
+                estimated_cost = self.model_manager.calculate_cost(
+                    provider=self.provider_name,
+                    model_name=self.model_name,
+                    input_tokens=last_usage.get("prompt_tokens", 0),
+                    output_tokens=last_usage.get("completion_tokens", 0)
+                )
+
+            return {
+                "cost_usd": estimated_cost,
+                "input_tokens": last_usage.get("prompt_tokens", 0),
+                "output_tokens": last_usage.get("completion_tokens", 0),
+                "total_tokens": last_usage.get("total_tokens", 0),
+                "operation": "chat",
+                "timestamp": None,
+                "currency": "USD",
+                "note": "Estimated from last token usage"
+            }
+
+        except Exception as e:
+            logger.warning(f"Failed to get streaming billing info: {e}")
+            return {
+                "cost_usd": 0.0,
+                "error": str(e),
+                "currency": "USD"
+            }
```
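A hedged usage sketch for the new `deep_research()` entry point. The model name and query are illustrative; any model whose name contains "deep-research" (or "deep-search") takes the Responses API path with the `web_search_preview` tool, and any other model silently falls back to `chat()`.

```python
import asyncio

from isa_model.inference.services.llm.openai_llm_service import OpenAILLMService

async def main() -> None:
    service = OpenAILLMService(model_name="o3-deep-research")
    report = await service.deep_research(
        "Survey recent open-source table-structure recognition models.",
        research_type="technical",   # academic | market | competitive | technical
        search_enabled=True,         # adds the web_search_preview tool
    )
    if report["success"]:
        print(report["metadata"]["api_used"])  # "responses"
        print(report["result"])
    else:
        print("Research failed:", report["error"])
    await service.close()

asyncio.run(main())
```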