isa-model 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. isa_model/__init__.py +1 -1
  2. isa_model/client.py +732 -565
  3. isa_model/core/cache/redis_cache.py +401 -0
  4. isa_model/core/config/config_manager.py +53 -10
  5. isa_model/core/config.py +1 -1
  6. isa_model/core/database/__init__.py +1 -0
  7. isa_model/core/database/migrations.py +277 -0
  8. isa_model/core/database/supabase_client.py +123 -0
  9. isa_model/core/models/__init__.py +37 -0
  10. isa_model/core/models/model_billing_tracker.py +60 -88
  11. isa_model/core/models/model_manager.py +36 -18
  12. isa_model/core/models/model_repo.py +44 -38
  13. isa_model/core/models/model_statistics_tracker.py +234 -0
  14. isa_model/core/models/model_storage.py +0 -1
  15. isa_model/core/models/model_version_manager.py +959 -0
  16. isa_model/core/pricing_manager.py +2 -249
  17. isa_model/core/resilience/circuit_breaker.py +366 -0
  18. isa_model/core/security/secrets.py +358 -0
  19. isa_model/core/services/__init__.py +2 -4
  20. isa_model/core/services/intelligent_model_selector.py +101 -370
  21. isa_model/core/storage/hf_storage.py +1 -1
  22. isa_model/core/types.py +7 -0
  23. isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
  24. isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
  25. isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
  26. isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
  27. isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
  28. isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
  29. isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
  30. isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
  31. isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
  32. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
  33. isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
  34. isa_model/deployment/core/deployment_manager.py +6 -4
  35. isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
  36. isa_model/eval/benchmarks/__init__.py +27 -0
  37. isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
  38. isa_model/eval/benchmarks.py +244 -12
  39. isa_model/eval/evaluators/__init__.py +8 -2
  40. isa_model/eval/evaluators/audio_evaluator.py +727 -0
  41. isa_model/eval/evaluators/embedding_evaluator.py +742 -0
  42. isa_model/eval/evaluators/vision_evaluator.py +564 -0
  43. isa_model/eval/example_evaluation.py +395 -0
  44. isa_model/eval/factory.py +272 -5
  45. isa_model/eval/isa_benchmarks.py +700 -0
  46. isa_model/eval/isa_integration.py +582 -0
  47. isa_model/eval/metrics.py +159 -6
  48. isa_model/eval/tests/unit/test_basic.py +396 -0
  49. isa_model/inference/ai_factory.py +44 -8
  50. isa_model/inference/services/audio/__init__.py +21 -0
  51. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  52. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  53. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  54. isa_model/inference/services/audio/openai_stt_service.py +32 -6
  55. isa_model/inference/services/base_service.py +17 -1
  56. isa_model/inference/services/embedding/__init__.py +13 -0
  57. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  58. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  59. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  60. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  61. isa_model/inference/services/img/__init__.py +2 -2
  62. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  63. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  64. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  65. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  66. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  67. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  68. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  69. isa_model/inference/services/llm/base_llm_service.py +30 -6
  70. isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
  71. isa_model/inference/services/llm/ollama_llm_service.py +2 -1
  72. isa_model/inference/services/llm/openai_llm_service.py +652 -55
  73. isa_model/inference/services/llm/yyds_llm_service.py +2 -1
  74. isa_model/inference/services/vision/__init__.py +5 -5
  75. isa_model/inference/services/vision/base_vision_service.py +118 -185
  76. isa_model/inference/services/vision/helpers/image_utils.py +11 -5
  77. isa_model/inference/services/vision/isa_vision_service.py +573 -0
  78. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  79. isa_model/serving/api/fastapi_server.py +88 -16
  80. isa_model/serving/api/middleware/auth.py +311 -0
  81. isa_model/serving/api/middleware/security.py +278 -0
  82. isa_model/serving/api/routes/analytics.py +486 -0
  83. isa_model/serving/api/routes/deployments.py +339 -0
  84. isa_model/serving/api/routes/evaluations.py +579 -0
  85. isa_model/serving/api/routes/logs.py +430 -0
  86. isa_model/serving/api/routes/settings.py +582 -0
  87. isa_model/serving/api/routes/unified.py +324 -165
  88. isa_model/serving/api/startup.py +304 -0
  89. isa_model/serving/modal_proxy_server.py +249 -0
  90. isa_model/training/__init__.py +100 -6
  91. isa_model/training/core/__init__.py +4 -1
  92. isa_model/training/examples/intelligent_training_example.py +281 -0
  93. isa_model/training/intelligent/__init__.py +25 -0
  94. isa_model/training/intelligent/decision_engine.py +643 -0
  95. isa_model/training/intelligent/intelligent_factory.py +888 -0
  96. isa_model/training/intelligent/knowledge_base.py +751 -0
  97. isa_model/training/intelligent/resource_optimizer.py +839 -0
  98. isa_model/training/intelligent/task_classifier.py +576 -0
  99. isa_model/training/storage/__init__.py +24 -0
  100. isa_model/training/storage/core_integration.py +439 -0
  101. isa_model/training/storage/training_repository.py +552 -0
  102. isa_model/training/storage/training_storage.py +628 -0
  103. {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
  104. isa_model-0.4.0.dist-info/RECORD +182 -0
  105. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  106. isa_model/deployment/cloud/modal/register_models.py +0 -321
  107. isa_model/inference/adapter/unified_api.py +0 -248
  108. isa_model/inference/services/helpers/stacked_config.py +0 -148
  109. isa_model/inference/services/img/flux_professional_service.py +0 -603
  110. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  111. isa_model/inference/services/others/table_transformer_service.py +0 -61
  112. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  113. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  114. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  115. isa_model/scripts/inference_tracker.py +0 -283
  116. isa_model/scripts/mlflow_manager.py +0 -379
  117. isa_model/scripts/model_registry.py +0 -465
  118. isa_model/scripts/register_models.py +0 -370
  119. isa_model/scripts/register_models_with_embeddings.py +0 -510
  120. isa_model/scripts/start_mlflow.py +0 -95
  121. isa_model/scripts/training_tracker.py +0 -257
  122. isa_model-0.3.9.dist-info/RECORD +0 -138
  123. {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
  124. {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
 import logging
 import os
 import json
+import asyncio
 from typing import Dict, Any, List, Union, AsyncGenerator, Optional, Callable
 
 # Use the official OpenAI library
@@ -17,9 +18,16 @@ class OpenAILLMService(BaseLLMService):
     def __init__(self, model_name: str = "gpt-4o-mini", provider_name: str = "openai", **kwargs):
         super().__init__(provider_name, model_name, **kwargs)
 
+        # Check if this is an O-series reasoning model
+        self.is_reasoning_model = model_name.startswith("o4-") or model_name.startswith("o3-")
+        self.supports_deep_research = "deep-search" in model_name or "deep-research" in model_name
+
         # Get configuration from centralized config manager
         provider_config = self.get_provider_config()
 
+        # Check if reasoning summary is enabled (requires verified organization)
+        self.enable_reasoning_summary = provider_config.get("enable_reasoning_summary", False)
+
         # Initialize AsyncOpenAI client with provider configuration
         try:
             if not provider_config.get("api_key"):
@@ -40,11 +48,40 @@ class OpenAILLMService(BaseLLMService):
         self.last_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
         self.total_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "requests_count": 0}
 
+        # For O-series models, track reasoning tokens separately
+        if self.is_reasoning_model:
+            self.last_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] = 0
+
 
     def _create_bound_copy(self) -> 'OpenAILLMService':
         """Create a copy of this service for tool binding"""
-        bound_service = OpenAILLMService(self.model_name, self.provider_name)
-        bound_service._bound_tools = self._bound_tools.copy()
+        # Create new instance but bypass full initialization
+        bound_service = object.__new__(OpenAILLMService)
+
+        # Copy all essential attributes from original service
+        bound_service.model_name = self.model_name
+        bound_service.provider_name = self.provider_name
+        bound_service.client = self.client  # Reuse the same OpenAI client
+        bound_service.last_token_usage = self.last_token_usage.copy()
+        bound_service.total_token_usage = self.total_token_usage.copy()
+        bound_service._bound_tools = self._bound_tools.copy() if self._bound_tools else []
+        bound_service.adapter_manager = self.adapter_manager  # Reuse adapter manager
+
+        # Copy OpenAI-specific attributes
+        bound_service.is_reasoning_model = self.is_reasoning_model
+        bound_service.supports_deep_research = self.supports_deep_research
+
+        # Copy base class attributes
+        bound_service.streaming = self.streaming
+        bound_service.max_tokens = self.max_tokens
+        bound_service.temperature = self.temperature
+        bound_service._tool_mappings = {}
+
+        # Copy BaseService attributes that are needed
+        bound_service.config_manager = self.config_manager
+        bound_service.model_manager = self.model_manager
+
         return bound_service
 
     def bind_tools(self, tools: List[Any], **kwargs) -> 'OpenAILLMService':
@@ -66,16 +103,133 @@ class OpenAILLMService(BaseLLMService):
 
         return bound_service
 
-    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any]) -> AsyncGenerator[str, None]:
+    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
         """
         True streaming method - yields tokens one by one as they arrive
 
         Args:
             input_data: Same as ainvoke
+            show_reasoning: If True and model supports it, show reasoning process using Responses API
 
         Yields:
-            Individual tokens as they arrive from the API
+            Individual tokens as they arrive from the API, plus final result object with tool_calls
         """
+        try:
+            # Determine which API to use for streaming
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
+            if use_responses_api:
+                logger.info(f"Using Responses API streaming for {self.model_name}")
+                # Use Responses API streaming
+                async for chunk in self._astream_responses_api(input_data, show_reasoning):
+                    yield chunk
+            else:
+                logger.debug(f"Using Chat Completions API streaming for {self.model_name}")
+                # Use Chat Completions API streaming
+                async for chunk in self._astream_chat_completions_api(input_data):
+                    yield chunk
+
+        except Exception as e:
+            logger.error(f"Error in astream: {e}")
+            raise
+
+    async def _astream_responses_api(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Responses API for reasoning models and deep research models"""
+        try:
+            # Use adapter manager to prepare messages
+            messages = self._prepare_messages(input_data)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages,  # Responses API uses 'input' instead of 'messages'
+                "stream": True
+            }
+
+            # Responses API uses max_output_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Add reasoning configuration if needed (optional - requires verified organization)
+            if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                kwargs["reasoning"] = {"summary": "auto"}
+                logger.info("Reasoning summary enabled - using verified organization features")
+            elif show_reasoning and self.is_reasoning_model:
+                logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+            # Deep research models require web_search_preview tool
+            if self.supports_deep_research:
+                kwargs["tools"] = [{"type": "web_search_preview"}]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Stream using Responses API
+            content_chunks = []
+            reasoning_items = []
+
+            try:
+                logger.info(f"Streaming with Responses API for model {self.model_name}")
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    # Handle different event types from Responses API
+                    if event.type == 'response.output_text.delta':
+                        # Stream text content
+                        if event.delta:
+                            content_chunks.append(event.delta)
+                            yield event.delta
+
+                    elif event.type == 'response.reasoning.delta' and show_reasoning:
+                        # Stream reasoning content (if enabled)
+                        if hasattr(event, 'delta') and event.delta:
+                            yield f"[思考: {event.delta}]"
+
+                    elif event.type == 'response.output_item.done':
+                        # Handle completed items (reasoning, function calls, etc.)
+                        if hasattr(event, 'item'):
+                            if event.item.type == 'reasoning':
+                                reasoning_items.append(event.item)
+                            elif event.item.type == 'function_call':
+                                # Handle function call completion
+                                logger.debug(f"Function call completed: {event.item}")
+
+                # Create final response object
+                full_content = "".join(content_chunks)
+
+                # Track usage for streaming
+                self._track_streaming_usage(messages, full_content)
+
+                # Get billing info
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Format final result
+                final_result = self._format_response(full_content, input_data)
+
+                # Yield final result with metadata
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "reasoning_items": len(reasoning_items),
+                    "api_used": "responses"
+                }
+
+            except Exception as e:
+                logger.error(f"Error in Responses API streaming: {e}")
+                raise
+
+        except Exception as e:
+            logger.error(f"Error in _astream_responses_api: {e}")
+            raise
+
+    async def _astream_chat_completions_api(self, input_data: Union[str, List[Dict[str, str]], Any]) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Chat Completions API for standard models"""
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)
@@ -85,86 +239,264 @@ class OpenAILLMService(BaseLLMService):
             kwargs = {
                 "model": self.model_name,
                 "messages": messages,
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024),
                 "stream": True
             }
 
+            # O4 models only support temperature=1 (default)
+            if not self.is_reasoning_model:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.is_reasoning_model:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
                 kwargs["tool_choice"] = "auto"
 
-            # Stream tokens one by one
+            # Stream tokens and detect tool calls
             content_chunks = []
+            tool_calls_accumulator = {}  # Track complete tool calls by ID
+            has_tool_calls = False
+
             try:
                 stream = await self.client.chat.completions.create(**kwargs)
                 async for chunk in stream:
-                    content = chunk.choices[0].delta.content
-                    if content:
-                        content_chunks.append(content)
-                        yield content
+                    delta = chunk.choices[0].delta
+
+                    # Check for tool calls first
+                    if hasattr(delta, 'tool_calls') and delta.tool_calls:
+                        has_tool_calls = True
+                        for tool_call in delta.tool_calls:
+                            tool_index = getattr(tool_call, 'index', 0)  # OpenAI uses index for streaming
+
+                            # Use index as key since streaming tool calls use index
+                            tool_key = f"tool_{tool_index}"
+
+                            # Initialize tool call if not seen before
+                            if tool_key not in tool_calls_accumulator:
+                                tool_calls_accumulator[tool_key] = {
+                                    'id': getattr(tool_call, 'id', f"call_{tool_index}"),
+                                    'type': 'function',
+                                    'function': {
+                                        'name': '',
+                                        'arguments': ''
+                                    }
+                                }
+
+                            # Accumulate function name
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'name') and tool_call.function.name:
+                                tool_calls_accumulator[tool_key]['function']['name'] += tool_call.function.name
+
+                            # Accumulate function arguments
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'arguments'):
+                                if tool_call.function.arguments:
+                                    tool_calls_accumulator[tool_key]['function']['arguments'] += tool_call.function.arguments
+
+                    # Handle regular content - only stream if no tool calls detected
+                    elif delta.content:
+                        content_chunks.append(delta.content)
+                        if not has_tool_calls:  # Only yield content if no tool calls
+                            yield delta.content
+
+                # Always yield final result at the end
+                # - If has tool_calls: complete structured response (no prior streaming)
+                # - If no tool_calls: AIMessage after streaming content
+
+                # Create a mock message object for adapter processing
+                class MockMessage:
+                    def __init__(self):
+                        self.content = "".join(content_chunks) or ""
+                        self.tool_calls = []
+                        # Add tool_calls if any
+                        if tool_calls_accumulator:
+                            for tool_data in tool_calls_accumulator.values():
+                                mock_tool_call = type('MockToolCall', (), {
+                                    'id': tool_data['id'],
+                                    'function': type('MockFunction', (), {
+                                        'name': tool_data['function']['name'],
+                                        'arguments': tool_data['function']['arguments']
+                                    })()
+                                })()
+                                self.tool_calls.append(mock_tool_call)
+
+                mock_message = MockMessage()
+
+                logger.debug(f"Streaming complete - tool calls collected: {len(mock_message.tool_calls)}")
+                for i, tc in enumerate(mock_message.tool_calls):
+                    logger.debug(f"  Tool call {i+1}: {tc.function.name} with args: {tc.function.arguments}")
+
+                # Format response using adapter (this handles LangChain conversion)
+                final_result = self._format_response(mock_message, input_data)
+
+                logger.debug(f"Final result type after adapter: {type(final_result)}")
+                logger.debug(f"Final result has tool_calls: {hasattr(final_result, 'tool_calls')}")
 
                 # Track usage after streaming is complete
                 full_content = "".join(content_chunks)
                 self._track_streaming_usage(messages, full_content)
 
+                # Get billing info after tracking (wait a moment for billing to be recorded)
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Yield the final result with billing info
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "api_used": "chat_completions"
+                }
+
             except Exception as e:
-                logger.error(f"Error in streaming: {e}")
+                logger.error(f"Error in Chat Completions streaming: {e}")
                 raise
 
         except Exception as e:
-            logger.error(f"Error in astream: {e}")
+            logger.error(f"Error in _astream_chat_completions_api: {e}")
             raise
 
-    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any]) -> Union[str, Any]:
-        """Unified invoke method for all input types"""
+    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False) -> Union[str, Any]:
+        """
+        Unified invoke method for all input types
+
+        Args:
+            input_data: Input messages or text
+            show_reasoning: If True and model supports it, show reasoning process using Responses API
+        """
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)
 
+            # Determine which API to use
+            # Responses API is required for:
+            # 1. Reasoning models with show_reasoning=True
+            # 2. Deep research models (they only work with Responses API)
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
             # Prepare request kwargs
             provider_config = self.get_provider_config()
             kwargs = {
                 "model": self.model_name,
-                "messages": messages,
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
+                "messages": messages
             }
 
+            # O4 models only support temperature=1 (default)
+            if not self.is_reasoning_model:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.is_reasoning_model:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
-                kwargs["tool_choice"] = "auto"
+                if not use_responses_api:  # Responses API handles tool choice differently
+                    kwargs["tool_choice"] = "auto"
 
             # Handle streaming vs non-streaming
             if self.streaming:
                 # TRUE STREAMING MODE - collect all chunks from the stream
                 content_chunks = []
-                async for token in self.astream(input_data):
-                    content_chunks.append(token)
-                content = "".join(content_chunks)
+                async for token in self.astream(input_data, show_reasoning=show_reasoning):
+                    if isinstance(token, str):
+                        content_chunks.append(token)
+                    elif isinstance(token, dict) and "result" in token:
+                        # Return the final result from streaming
+                        return token["result"]
 
+                # Fallback: join collected content
+                content = "".join(content_chunks)
                 return self._format_response(content, input_data)
             else:
-                # Non-streaming mode
-                response = await self.client.chat.completions.create(**kwargs)
-                message = response.choices[0].message
-
-                # Update usage tracking
-                if response.usage:
-                    self._update_token_usage(response.usage)
-                    await self._track_billing(response.usage)
-
-                # Handle tool calls if present - let adapter process the complete message
-                if message.tool_calls:
-                    # Pass the complete message object to adapter for proper tool_calls handling
-                    return self._format_response(message, input_data)
-
-                # Return appropriate format based on input type
-                return self._format_response(message.content or "", input_data)
+                # Non-streaming mode - choose API based on reasoning visibility
+                if use_responses_api:
+                    logger.info(f"Using Responses API for model {self.model_name}")
+
+                    # Convert kwargs for Responses API
+                    responses_kwargs = {
+                        "model": kwargs["model"],
+                        "input": kwargs["messages"]  # Responses API uses 'input' instead of 'messages'
+                    }
+
+                    # Handle max tokens parameter
+                    if "max_completion_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_completion_tokens"]
+                    elif "max_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_tokens"]
+
+                    # Add tools if present
+                    if "tools" in kwargs:
+                        responses_kwargs["tools"] = kwargs["tools"]
+
+                    # Add reasoning configuration for reasoning models (requires verified organization)
+                    if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                        responses_kwargs["reasoning"] = {"summary": "auto"}
+                        logger.info("Reasoning summary enabled - using verified organization features")
+                    elif show_reasoning and self.is_reasoning_model:
+                        logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+                    # Deep research models require web_search_preview tool
+                    if self.supports_deep_research:
+                        if "tools" not in responses_kwargs:
+                            responses_kwargs["tools"] = []
+                        responses_kwargs["tools"].insert(0, {"type": "web_search_preview"})
+
+                    response = await self.client.responses.create(**responses_kwargs)
+
+                    # Handle Responses API format
+                    if hasattr(response, 'output_text'):
+                        # Modern Responses API format
+                        content = response.output_text
+                        usage_info = getattr(response, 'usage', None)
+                    elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                        # Legacy format
+                        content = response.body.response
+                        usage_info = getattr(response.body, 'usage', None)
+                    else:
+                        # Fallback handling
+                        content = str(response)
+                        usage_info = None
+
+                    # Update usage tracking if available
+                    if usage_info:
+                        self._update_token_usage(usage_info)
+                        await self._track_billing(usage_info)
+
+                    return self._format_response(content, input_data)
+                else:
+                    # Standard Chat Completions API
+                    response = await self.client.chat.completions.create(**kwargs)
+                    message = response.choices[0].message
+
+                    # Debug: Log the raw OpenAI response
+                    logger.debug(f"OpenAI response message: {message}")
+                    if message.tool_calls:
+                        logger.debug(f"Tool calls found: {len(message.tool_calls)}")
+                        for i, tc in enumerate(message.tool_calls):
+                            logger.debug(f"  Tool call {i+1}: id={tc.id}, function={tc.function.name}, args={tc.function.arguments}")
+
+                    # Update usage tracking
+                    if response.usage:
+                        self._update_token_usage(response.usage)
+                        await self._track_billing(response.usage)
+
+                    # Handle tool calls if present - let adapter process the complete message
+                    if message.tool_calls:
+                        # Pass the complete message object to adapter for proper tool_calls handling
+                        return self._format_response(message, input_data)
+
+                    # Return appropriate format based on input type
+                    return self._format_response(message.content or "", input_data)
 
         except Exception as e:
             logger.error(f"Error in ainvoke: {e}")
@@ -210,11 +542,42 @@ class OpenAILLMService(BaseLLMService):
 
     def _update_token_usage(self, usage):
         """Update token usage statistics"""
-        self.last_token_usage = {
-            "prompt_tokens": usage.prompt_tokens,
-            "completion_tokens": usage.completion_tokens,
-            "total_tokens": usage.total_tokens
-        }
+        # Handle different usage object structures (Chat Completions vs Responses API)
+        if hasattr(usage, 'prompt_tokens'):
+            # Chat Completions API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.prompt_tokens,
+                "completion_tokens": usage.completion_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        elif hasattr(usage, 'input_tokens'):
+            # Responses API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.input_tokens,
+                "completion_tokens": usage.output_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        else:
+            # Fallback for unknown usage format
+            logger.warning(f"Unknown usage format: {type(usage)}, attributes: {dir(usage)}")
+            self.last_token_usage = {
+                "prompt_tokens": 0,
+                "completion_tokens": 0,
+                "total_tokens": 0
+            }
+
+        # For O-series models, track reasoning tokens if available
+        if self.is_reasoning_model:
+            reasoning_tokens = 0
+            if hasattr(usage, 'reasoning_tokens'):
+                reasoning_tokens = usage.reasoning_tokens
+            elif hasattr(usage, 'output_tokens_details') and hasattr(usage.output_tokens_details, 'reasoning_tokens'):
+                reasoning_tokens = usage.output_tokens_details.reasoning_tokens
+
+            self.last_token_usage["reasoning_tokens"] = reasoning_tokens
+            if "reasoning_tokens" not in self.total_token_usage:
+                self.total_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] += reasoning_tokens
 
         # Update total usage
         self.total_token_usage["prompt_tokens"] += self.last_token_usage["prompt_tokens"]
@@ -225,15 +588,35 @@ class OpenAILLMService(BaseLLMService):
     async def _track_billing(self, usage):
         """Track billing information"""
        provider_config = self.get_provider_config()
+
+        # Prepare metadata for tracking
+        metadata = {
+            "temperature": provider_config.get("temperature", 0.7),
+            "max_tokens": provider_config.get("max_tokens", 1024),
+            "is_reasoning_model": self.is_reasoning_model
+        }
+
+        # Add reasoning tokens if available for O-series models
+        if self.is_reasoning_model and hasattr(usage, 'reasoning_tokens'):
+            metadata["reasoning_tokens"] = usage.reasoning_tokens
+
+        # Get tokens using the same logic as _update_token_usage
+        if hasattr(usage, 'prompt_tokens'):
+            input_tokens = usage.prompt_tokens
+            output_tokens = usage.completion_tokens
+        elif hasattr(usage, 'input_tokens'):
+            input_tokens = usage.input_tokens
+            output_tokens = usage.output_tokens
+        else:
+            input_tokens = 0
+            output_tokens = 0
+
         await self._track_usage(
             service_type=ServiceType.LLM,
             operation="chat",
-            input_tokens=usage.prompt_tokens,
-            output_tokens=usage.completion_tokens,
-            metadata={
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
-            }
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            metadata=metadata
         )
 
     def get_token_usage(self) -> Dict[str, Any]:
@@ -252,14 +635,18 @@ class OpenAILLMService(BaseLLMService):
             "max_tokens": provider_config.get("max_tokens", 1024),
             "supports_streaming": True,
             "supports_functions": True,
-            "provider": "openai"
+            "supports_reasoning": self.is_reasoning_model,
+            "supports_deep_research": self.supports_deep_research,
+            "provider": "openai",
+            "model_type": "reasoning" if self.is_reasoning_model else "standard"
         }
 
 
     async def chat(
         self,
         input_data: Union[str, List[Dict[str, str]], Any],
-        max_tokens: Optional[int] = None
+        max_tokens: Optional[int] = None,
+        show_reasoning: bool = False
     ) -> Dict[str, Any]:
         """
         Chat method that wraps ainvoke for compatibility with base class
@@ -267,13 +654,14 @@ class OpenAILLMService(BaseLLMService):
         Args:
             input_data: Input messages
             max_tokens: Maximum tokens to generate
+            show_reasoning: Whether to show reasoning process (for O4 models)
 
         Returns:
             Dict containing chat response with properly formatted message object
         """
         try:
-            # Call ainvoke and get the response (already processed by adapter)
-            response = await self.ainvoke(input_data)
+            # Call ainvoke with show_reasoning parameter
+            response = await self.ainvoke(input_data, show_reasoning=show_reasoning)
 
             # Return the response as-is (adapter already formatted it correctly)
             # For LangChain inputs, this will be an AIMessage object
@@ -284,7 +672,9 @@ class OpenAILLMService(BaseLLMService):
                 "metadata": {
                     "model": self.model_name,
                     "provider": self.provider_name,
-                    "max_tokens": max_tokens or self.max_tokens
+                    "max_tokens": max_tokens or self.max_tokens,
+                    "show_reasoning": show_reasoning,
+                    "is_reasoning_model": self.is_reasoning_model
                 }
             }
         except Exception as e:
@@ -299,6 +689,213 @@ class OpenAILLMService(BaseLLMService):
                 }
             }
 
+    async def deep_research(
+        self,
+        input_data: Union[str, List[Dict[str, str]], Any],
+        research_type: Optional[str] = None,
+        search_enabled: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Deep research task - designed for deep research models, using the OpenAI Responses API
+
+        Args:
+            input_data: Research query or question
+            research_type: Research type (academic, market, competitive, etc.)
+            search_enabled: Whether to enable web search
+
+        Returns:
+            Dict containing research results
+        """
+        if not self.supports_deep_research:
+            # Fallback to regular chat for non-deep-research models
+            logger.info(f"Model {self.model_name} doesn't support deep research, falling back to regular chat")
+            return await self.chat(input_data)
+
+        try:
+            # Prepare messages with research context
+            messages = self._prepare_messages(input_data)
+
+            # Add research-specific system prompt if research_type is specified
+            if research_type and messages:
+                research_prompts = {
+                    "academic": "You are conducting academic research. Please provide thorough, well-sourced analysis with proper citations and methodical reasoning.",
+                    "market": "You are conducting market research. Focus on market trends, competitive analysis, and business insights.",
+                    "competitive": "You are conducting competitive analysis. Compare and contrast different approaches, solutions, or entities.",
+                    "technical": "You are conducting technical research. Provide detailed technical analysis with implementation considerations."
+                }
+
+                if research_type in research_prompts:
+                    # Insert system message at the beginning
+                    system_msg = {"role": "system", "content": research_prompts[research_type]}
+                    if messages[0].get("role") == "system":
+                        messages[0]["content"] = research_prompts[research_type] + "\n\n" + messages[0]["content"]
+                    else:
+                        messages.insert(0, system_msg)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages  # Responses API uses 'input' instead of 'messages'
+            }
+
+            # Responses API uses max_output_tokens instead of max_completion_tokens
+            max_tokens_value = provider_config.get("max_tokens", 4096)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Deep research models require web_search_preview tool when search is enabled
+            if search_enabled:
+                kwargs["tools"] = [
+                    {
+                        "type": "web_search_preview"
+                    }
+                ]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Check if streaming is enabled
+            if self.streaming:
+                # Use streaming mode for deep research
+                logger.info(f"Using Responses API streaming for deep research model {self.model_name}")
+                kwargs["stream"] = True
+
+                content_chunks = []
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    if event.type == 'response.output_text.delta':
+                        if event.delta:
+                            content_chunks.append(event.delta)
+
+                message_content = "".join(content_chunks)
+
+                # Track estimated usage for streaming
+                messages = self._prepare_messages(input_data)
+                self._track_streaming_usage(messages, message_content)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+            else:
+                # Use non-streaming mode for deep research
+                logger.info(f"Using Responses API for deep research model {self.model_name}")
+                response = await self.client.responses.create(**kwargs)
+
+                # Extract the response content from Responses API format
+                if hasattr(response, 'output_text'):
+                    # Modern Responses API format
+                    message_content = response.output_text
+                    usage_info = getattr(response, 'usage', None)
+                elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                    # Legacy Responses API format
+                    message_content = response.body.response
+                    usage_info = getattr(response.body, 'usage', None)
+                elif hasattr(response, 'choices') and response.choices:
+                    # Fallback to standard format
+                    message_content = response.choices[0].message.content
+                    usage_info = getattr(response, 'usage', None)
+                else:
+                    # Handle unexpected format
+                    message_content = str(response)
+                    usage_info = None
+
+                # Update usage tracking if available
+                if usage_info:
+                    self._update_token_usage(usage_info)
+                    await self._track_billing(usage_info)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+
+            return {
+                "result": formatted_response,
+                "research_type": research_type,
+                "search_enabled": search_enabled,
+                "success": True,
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "supports_deep_research": self.supports_deep_research,
+                    "reasoning_model": self.is_reasoning_model,
+                    "api_used": "responses"
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"Deep research failed: {e}")
+            return {
+                "result": None,
+                "success": False,
+                "error": str(e),
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "api_used": "responses"
+                }
+            }
+
     async def close(self):
         """Close the backend client"""
-        await self.client.close()
+        await self.client.close()
+
+    def _get_streaming_billing_info(self) -> Dict[str, Any]:
+        """Get billing information for streaming requests"""
+        try:
+            # Check if service has model_manager with billing_tracker
+            if hasattr(self, 'model_manager') and hasattr(self.model_manager, 'billing_tracker'):
+                billing_tracker = self.model_manager.billing_tracker
+
+                # Get the latest usage record for this model
+                model_records = [
+                    record for record in billing_tracker.usage_records
+                    if record.model_id == self.model_name
+                ]
+
+                if model_records:
+                    # Get the most recent record
+                    latest_record = max(model_records, key=lambda r: r.timestamp)
+
+                    return {
+                        "cost_usd": latest_record.cost_usd,
+                        "input_tokens": latest_record.input_tokens,
+                        "output_tokens": latest_record.output_tokens,
+                        "total_tokens": latest_record.total_tokens,
+                        "operation": latest_record.operation,
+                        "timestamp": latest_record.timestamp,
+                        "currency": "USD"
+                    }
+
+            # Fallback: use last token usage with estimated cost
+            last_usage = self.get_last_token_usage()
+            estimated_cost = 0.0
+
+            if hasattr(self, 'model_manager'):
+                estimated_cost = self.model_manager.calculate_cost(
+                    provider=self.provider_name,
+                    model_name=self.model_name,
+                    input_tokens=last_usage.get("prompt_tokens", 0),
+                    output_tokens=last_usage.get("completion_tokens", 0)
+                )
+
+            return {
+                "cost_usd": estimated_cost,
+                "input_tokens": last_usage.get("prompt_tokens", 0),
+                "output_tokens": last_usage.get("completion_tokens", 0),
+                "total_tokens": last_usage.get("total_tokens", 0),
+                "operation": "chat",
+                "timestamp": None,
+                "currency": "USD",
+                "note": "Estimated from last token usage"
+            }
+
+        except Exception as e:
+            logger.warning(f"Failed to get streaming billing info: {e}")
+            return {
+                "cost_usd": 0.0,
+                "error": str(e),
+                "currency": "USD"
+            }