isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/inference/services/llm/openai_llm_service.py (+670 -56)

@@ -1,6 +1,7 @@
 import logging
 import os
 import json
+import asyncio
 from typing import Dict, Any, List, Union, AsyncGenerator, Optional, Callable
 
 # Use the official OpenAI library
@@ -17,9 +18,18 @@ class OpenAILLMService(BaseLLMService):
     def __init__(self, model_name: str = "gpt-4o-mini", provider_name: str = "openai", **kwargs):
         super().__init__(provider_name, model_name, **kwargs)
 
+        # Check if this is an O-series reasoning model
+        self.is_reasoning_model = model_name.startswith("o4-") or model_name.startswith("o3-")
+        self.uses_completion_tokens = self.is_reasoning_model or model_name.startswith("gpt-5")
+        self.requires_default_temperature = self.is_reasoning_model or model_name.startswith("gpt-5")
+        self.supports_deep_research = "deep-search" in model_name or "deep-research" in model_name
+
         # Get configuration from centralized config manager
         provider_config = self.get_provider_config()
 
+        # Check if reasoning summary is enabled (requires verified organization)
+        self.enable_reasoning_summary = provider_config.get("enable_reasoning_summary", False)
+
         # Initialize AsyncOpenAI client with provider configuration
         try:
             if not provider_config.get("api_key"):
@@ -28,7 +38,9 @@ class OpenAILLMService(BaseLLMService):
             self.client = AsyncOpenAI(
                 api_key=provider_config["api_key"],
                 base_url=provider_config.get("api_base_url", "https://api.openai.com/v1"),
-                organization=provider_config.get("organization")
+                organization=provider_config.get("organization"),
+                timeout=10.0,  # 10 second timeout for first token (much faster than 600s default)
+                max_retries=2  # Retry on timeout
             )
 
             logger.info(f"Initialized OpenAILLMService with model {self.model_name} and endpoint {self.client.base_url}")
@@ -40,11 +52,42 @@ class OpenAILLMService(BaseLLMService):
         self.last_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
         self.total_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "requests_count": 0}
 
+        # For O-series models, track reasoning tokens separately
+        if self.is_reasoning_model:
+            self.last_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] = 0
+
 
     def _create_bound_copy(self) -> 'OpenAILLMService':
         """Create a copy of this service for tool binding"""
-        bound_service = OpenAILLMService(self.model_name, self.provider_name)
-        bound_service._bound_tools = self._bound_tools.copy()
+        # Create new instance but bypass full initialization
+        bound_service = object.__new__(OpenAILLMService)
+
+        # Copy all essential attributes from original service
+        bound_service.model_name = self.model_name
+        bound_service.provider_name = self.provider_name
+        bound_service.client = self.client  # Reuse the same OpenAI client
+        bound_service.last_token_usage = self.last_token_usage.copy()
+        bound_service.total_token_usage = self.total_token_usage.copy()
+        bound_service._bound_tools = self._bound_tools.copy() if self._bound_tools else []
+        bound_service.adapter_manager = self.adapter_manager  # Reuse adapter manager
+
+        # Copy OpenAI-specific attributes
+        bound_service.is_reasoning_model = self.is_reasoning_model
+        bound_service.uses_completion_tokens = self.uses_completion_tokens
+        bound_service.requires_default_temperature = self.requires_default_temperature
+        bound_service.supports_deep_research = self.supports_deep_research
+
+        # Copy base class attributes
+        bound_service.streaming = self.streaming
+        bound_service.max_tokens = self.max_tokens
+        bound_service.temperature = self.temperature
+        bound_service._tool_mappings = {}
+
+        # Copy BaseService attributes that are needed
+        bound_service.config_manager = self.config_manager
+        bound_service.model_manager = self.model_manager
+
         return bound_service
 
     def bind_tools(self, tools: List[Any], **kwargs) -> 'OpenAILLMService':
@@ -66,16 +109,133 @@ class OpenAILLMService(BaseLLMService):
 
         return bound_service
 
-    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any]) -> AsyncGenerator[str, None]:
+    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False, **extra_kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
         """
         True streaming method - yields tokens one by one as they arrive
 
         Args:
             input_data: Same as ainvoke
+            show_reasoning: If True and model supports it, show reasoning process using Responses API
 
         Yields:
-            Individual tokens as they arrive from the API
+            Individual tokens as they arrive from the API, plus final result object with tool_calls
         """
+        try:
+            # Determine which API to use for streaming
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
+            if use_responses_api:
+                logger.info(f"Using Responses API streaming for {self.model_name}")
+                # Use Responses API streaming
+                async for chunk in self._astream_responses_api(input_data, show_reasoning, **extra_kwargs):
+                    yield chunk
+            else:
+                logger.debug(f"Using Chat Completions API streaming for {self.model_name}")
+                # Use Chat Completions API streaming
+                async for chunk in self._astream_chat_completions_api(input_data, **extra_kwargs):
+                    yield chunk
+
+        except Exception as e:
+            logger.error(f"Error in astream: {e}")
+            raise
+
+    async def _astream_responses_api(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False, **extra_kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Responses API for reasoning models and deep research models"""
+        try:
+            # Use adapter manager to prepare messages
+            messages = self._prepare_messages(input_data)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages,  # Responses API uses 'input' instead of 'messages'
+                "stream": True
+            }
+
+            # Responses API uses max_output_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Add reasoning configuration if needed (optional - requires verified organization)
+            if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                kwargs["reasoning"] = {"summary": "auto"}
+                logger.info("Reasoning summary enabled - using verified organization features")
+            elif show_reasoning and self.is_reasoning_model:
+                logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+            # Deep research models require web_search_preview tool
+            if self.supports_deep_research:
+                kwargs["tools"] = [{"type": "web_search_preview"}]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Stream using Responses API
+            content_chunks = []
+            reasoning_items = []
+
+            try:
+                logger.info(f"Streaming with Responses API for model {self.model_name}")
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    # Handle different event types from Responses API
+                    if event.type == 'response.output_text.delta':
+                        # Stream text content
+                        if event.delta:
+                            content_chunks.append(event.delta)
+                            yield event.delta
+
+                    elif event.type == 'response.reasoning.delta' and show_reasoning:
+                        # Stream reasoning content (if enabled)
+                        if hasattr(event, 'delta') and event.delta:
+                            yield f"[思考: {event.delta}]"
+
+                    elif event.type == 'response.output_item.done':
+                        # Handle completed items (reasoning, function calls, etc.)
+                        if hasattr(event, 'item'):
+                            if event.item.type == 'reasoning':
+                                reasoning_items.append(event.item)
+                            elif event.item.type == 'function_call':
+                                # Handle function call completion
+                                logger.debug(f"Function call completed: {event.item}")
+
+                # Create final response object
+                full_content = "".join(content_chunks)
+
+                # Track usage for streaming
+                self._track_streaming_usage(messages, full_content)
+
+                # Get billing info
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Format final result
+                final_result = self._format_response(full_content, input_data)
+
+                # Yield final result with metadata
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "reasoning_items": len(reasoning_items),
+                    "api_used": "responses"
+                }
+
+            except Exception as e:
+                logger.error(f"Error in Responses API streaming: {e}")
+                raise
+
+        except Exception as e:
+            logger.error(f"Error in _astream_responses_api: {e}")
+            raise
+
+    async def _astream_chat_completions_api(self, input_data: Union[str, List[Dict[str, str]], Any], **extra_kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Chat Completions API for standard models"""
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)
@@ -85,86 +245,275 @@ class OpenAILLMService(BaseLLMService):
             kwargs = {
                 "model": self.model_name,
                 "messages": messages,
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024),
                 "stream": True
             }
 
+            # O4 and GPT-5 models only support temperature=1 (default)
+            if not self.requires_default_temperature:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 and GPT-5 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.uses_completion_tokens:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
                 kwargs["tool_choice"] = "auto"
 
-            # Stream tokens one by one
+            # Add response_format if specified (for JSON mode)
+            if 'response_format' in extra_kwargs:
+                kwargs['response_format'] = extra_kwargs['response_format']
+                logger.debug(f"Using response_format in streaming: {extra_kwargs['response_format']}")
+
+            # Stream tokens and detect tool calls
             content_chunks = []
+            tool_calls_accumulator = {}  # Track complete tool calls by ID
+            has_tool_calls = False
+
             try:
                 stream = await self.client.chat.completions.create(**kwargs)
                 async for chunk in stream:
-                    content = chunk.choices[0].delta.content
-                    if content:
-                        content_chunks.append(content)
-                        yield content
+                    delta = chunk.choices[0].delta
+
+                    # Check for tool calls first
+                    if hasattr(delta, 'tool_calls') and delta.tool_calls:
+                        has_tool_calls = True
+                        for tool_call in delta.tool_calls:
+                            tool_index = getattr(tool_call, 'index', 0)  # OpenAI uses index for streaming
+
+                            # Use index as key since streaming tool calls use index
+                            tool_key = f"tool_{tool_index}"
+
+                            # Initialize tool call if not seen before
+                            if tool_key not in tool_calls_accumulator:
+                                tool_calls_accumulator[tool_key] = {
+                                    'id': getattr(tool_call, 'id', f"call_{tool_index}"),
+                                    'type': 'function',
+                                    'function': {
+                                        'name': '',
+                                        'arguments': ''
+                                    }
+                                }
+
+                            # Accumulate function name
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'name') and tool_call.function.name:
+                                tool_calls_accumulator[tool_key]['function']['name'] += tool_call.function.name
+
+                            # Accumulate function arguments
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'arguments'):
+                                if tool_call.function.arguments:
+                                    tool_calls_accumulator[tool_key]['function']['arguments'] += tool_call.function.arguments
+
+                    # Handle regular content - only stream if no tool calls detected
+                    elif delta.content:
+                        content_chunks.append(delta.content)
+                        if not has_tool_calls:  # Only yield content if no tool calls
+                            yield delta.content
+
+                # Always yield final result at the end
+                # - If has tool_calls: complete structured response (no prior streaming)
+                # - If no tool_calls: AIMessage after streaming content
+
+                # Create a mock message object for adapter processing
+                class MockMessage:
+                    def __init__(self):
+                        self.content = "".join(content_chunks) or ""
+                        self.tool_calls = []
+                        # Add tool_calls if any
+                        if tool_calls_accumulator:
+                            for tool_data in tool_calls_accumulator.values():
+                                mock_tool_call = type('MockToolCall', (), {
+                                    'id': tool_data['id'],
+                                    'function': type('MockFunction', (), {
+                                        'name': tool_data['function']['name'],
+                                        'arguments': tool_data['function']['arguments']
+                                    })()
+                                })()
+                                self.tool_calls.append(mock_tool_call)
+
+                mock_message = MockMessage()
+
+                logger.debug(f"Streaming complete - tool calls collected: {len(mock_message.tool_calls)}")
+                for i, tc in enumerate(mock_message.tool_calls):
+                    logger.debug(f"  Tool call {i+1}: {tc.function.name} with args: {tc.function.arguments}")
+
+                # Format response using adapter (this handles LangChain conversion)
+                final_result = self._format_response(mock_message, input_data)
+
+                logger.debug(f"Final result type after adapter: {type(final_result)}")
+                logger.debug(f"Final result has tool_calls: {hasattr(final_result, 'tool_calls')}")
 
                 # Track usage after streaming is complete
                 full_content = "".join(content_chunks)
                 self._track_streaming_usage(messages, full_content)
 
+                # Get billing info after tracking (wait a moment for billing to be recorded)
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Yield the final result with billing info
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "api_used": "chat_completions"
+                }
+
             except Exception as e:
-                logger.error(f"Error in streaming: {e}")
+                logger.error(f"Error in Chat Completions streaming: {e}")
                 raise
 
         except Exception as e:
-            logger.error(f"Error in astream: {e}")
+            logger.error(f"Error in _astream_chat_completions_api: {e}")
             raise
 
-    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any]) -> Union[str, Any]:
-        """Unified invoke method for all input types"""
+    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False, **extra_kwargs) -> Union[str, Any]:
+        """
+        Unified invoke method for all input types
+
+        Args:
+            input_data: Input messages or text
+            show_reasoning: If True and model supports it, show reasoning process using Responses API
+            **extra_kwargs: Additional parameters to pass to the API (e.g., response_format)
+        """
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)
 
+            # Determine which API to use
+            # Responses API is required for:
+            # 1. Reasoning models with show_reasoning=True
+            # 2. Deep research models (they only work with Responses API)
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
             # Prepare request kwargs
             provider_config = self.get_provider_config()
             kwargs = {
                 "model": self.model_name,
-                "messages": messages,
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
+                "messages": messages
             }
 
+            # O4 and GPT-5 models only support temperature=1 (default)
+            if not self.requires_default_temperature:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 and GPT-5 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.uses_completion_tokens:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
-                kwargs["tool_choice"] = "auto"
+                if not use_responses_api:  # Responses API handles tool choice differently
+                    kwargs["tool_choice"] = "auto"
+
+            # Add response_format if specified (for JSON mode)
+            if 'response_format' in extra_kwargs:
+                kwargs['response_format'] = extra_kwargs['response_format']
+                logger.debug(f"Using response_format: {extra_kwargs['response_format']}")
 
             # Handle streaming vs non-streaming
             if self.streaming:
                 # TRUE STREAMING MODE - collect all chunks from the stream
                 content_chunks = []
-                async for token in self.astream(input_data):
-                    content_chunks.append(token)
-                content = "".join(content_chunks)
+                async for token in self.astream(input_data, show_reasoning=show_reasoning, **extra_kwargs):
+                    if isinstance(token, str):
+                        content_chunks.append(token)
+                    elif isinstance(token, dict) and "result" in token:
+                        # Return the final result from streaming
+                        return token["result"]
 
+                # Fallback: join collected content
+                content = "".join(content_chunks)
                 return self._format_response(content, input_data)
             else:
-                # Non-streaming mode
-                response = await self.client.chat.completions.create(**kwargs)
-                message = response.choices[0].message
-
-                # Update usage tracking
-                if response.usage:
-                    self._update_token_usage(response.usage)
-                    await self._track_billing(response.usage)
-
-                # Handle tool calls if present - let adapter process the complete message
-                if message.tool_calls:
-                    # Pass the complete message object to adapter for proper tool_calls handling
-                    return self._format_response(message, input_data)
-
-                # Return appropriate format based on input type
-                return self._format_response(message.content or "", input_data)
+                # Non-streaming mode - choose API based on reasoning visibility
+                if use_responses_api:
+                    logger.info(f"Using Responses API for model {self.model_name}")
+
+                    # Convert kwargs for Responses API
+                    responses_kwargs = {
+                        "model": kwargs["model"],
+                        "input": kwargs["messages"]  # Responses API uses 'input' instead of 'messages'
+                    }
+
+                    # Handle max tokens parameter
+                    if "max_completion_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_completion_tokens"]
+                    elif "max_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_tokens"]
+
+                    # Add tools if present
+                    if "tools" in kwargs:
+                        responses_kwargs["tools"] = kwargs["tools"]
+
+                    # Add reasoning configuration for reasoning models (requires verified organization)
+                    if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                        responses_kwargs["reasoning"] = {"summary": "auto"}
+                        logger.info("Reasoning summary enabled - using verified organization features")
+                    elif show_reasoning and self.is_reasoning_model:
+                        logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+                    # Deep research models require web_search_preview tool
+                    if self.supports_deep_research:
+                        if "tools" not in responses_kwargs:
+                            responses_kwargs["tools"] = []
+                        responses_kwargs["tools"].insert(0, {"type": "web_search_preview"})
+
+                    response = await self.client.responses.create(**responses_kwargs)
+
+                    # Handle Responses API format
+                    if hasattr(response, 'output_text'):
+                        # Modern Responses API format
+                        content = response.output_text
+                        usage_info = getattr(response, 'usage', None)
+                    elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                        # Legacy format
+                        content = response.body.response
+                        usage_info = getattr(response.body, 'usage', None)
+                    else:
+                        # Fallback handling
+                        content = str(response)
+                        usage_info = None
+
+                    # Update usage tracking if available
+                    if usage_info:
+                        self._update_token_usage(usage_info)
+                        await self._track_billing(usage_info)
+
+                    return self._format_response(content, input_data)
+                else:
+                    # Standard Chat Completions API
+                    response = await self.client.chat.completions.create(**kwargs)
+                    message = response.choices[0].message
+
+                    # Debug: Log the raw OpenAI response
+                    logger.debug(f"OpenAI response message: {message}")
+                    if message.tool_calls:
+                        logger.debug(f"Tool calls found: {len(message.tool_calls)}")
+                        for i, tc in enumerate(message.tool_calls):
+                            logger.debug(f"  Tool call {i+1}: id={tc.id}, function={tc.function.name}, args={tc.function.arguments}")
+
+                    # Update usage tracking
+                    if response.usage:
+                        self._update_token_usage(response.usage)
+                        await self._track_billing(response.usage)
+
+                    # Handle tool calls if present - let adapter process the complete message
+                    if message.tool_calls:
+                        # Pass the complete message object to adapter for proper tool_calls handling
+                        return self._format_response(message, input_data)
+
+                    # Return appropriate format based on input type
+                    return self._format_response(message.content or "", input_data)
 
         except Exception as e:
             logger.error(f"Error in ainvoke: {e}")
@@ -210,11 +559,42 @@ class OpenAILLMService(BaseLLMService):
 
     def _update_token_usage(self, usage):
         """Update token usage statistics"""
-        self.last_token_usage = {
-            "prompt_tokens": usage.prompt_tokens,
-            "completion_tokens": usage.completion_tokens,
-            "total_tokens": usage.total_tokens
-        }
+        # Handle different usage object structures (Chat Completions vs Responses API)
+        if hasattr(usage, 'prompt_tokens'):
+            # Chat Completions API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.prompt_tokens,
+                "completion_tokens": usage.completion_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        elif hasattr(usage, 'input_tokens'):
+            # Responses API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.input_tokens,
+                "completion_tokens": usage.output_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        else:
+            # Fallback for unknown usage format
+            logger.warning(f"Unknown usage format: {type(usage)}, attributes: {dir(usage)}")
+            self.last_token_usage = {
+                "prompt_tokens": 0,
+                "completion_tokens": 0,
+                "total_tokens": 0
+            }
+
+        # For O-series models, track reasoning tokens if available
+        if self.is_reasoning_model:
+            reasoning_tokens = 0
+            if hasattr(usage, 'reasoning_tokens'):
+                reasoning_tokens = usage.reasoning_tokens
+            elif hasattr(usage, 'output_tokens_details') and hasattr(usage.output_tokens_details, 'reasoning_tokens'):
+                reasoning_tokens = usage.output_tokens_details.reasoning_tokens
+
+            self.last_token_usage["reasoning_tokens"] = reasoning_tokens
+            if "reasoning_tokens" not in self.total_token_usage:
+                self.total_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] += reasoning_tokens
 
         # Update total usage
         self.total_token_usage["prompt_tokens"] += self.last_token_usage["prompt_tokens"]
@@ -225,15 +605,35 @@ class OpenAILLMService(BaseLLMService):
     async def _track_billing(self, usage):
         """Track billing information"""
         provider_config = self.get_provider_config()
+
+        # Prepare metadata for tracking
+        metadata = {
+            "temperature": provider_config.get("temperature", 0.7),
+            "max_tokens": provider_config.get("max_tokens", 1024),
+            "is_reasoning_model": self.is_reasoning_model
+        }
+
+        # Add reasoning tokens if available for O-series models
+        if self.is_reasoning_model and hasattr(usage, 'reasoning_tokens'):
+            metadata["reasoning_tokens"] = usage.reasoning_tokens
+
+        # Get tokens using the same logic as _update_token_usage
+        if hasattr(usage, 'prompt_tokens'):
+            input_tokens = usage.prompt_tokens
+            output_tokens = usage.completion_tokens
+        elif hasattr(usage, 'input_tokens'):
+            input_tokens = usage.input_tokens
+            output_tokens = usage.output_tokens
+        else:
+            input_tokens = 0
+            output_tokens = 0
+
         await self._track_usage(
             service_type=ServiceType.LLM,
             operation="chat",
-            input_tokens=usage.prompt_tokens,
-            output_tokens=usage.completion_tokens,
-            metadata={
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
-            }
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            metadata=metadata
         )
 
     def get_token_usage(self) -> Dict[str, Any]:
@@ -252,14 +652,18 @@ class OpenAILLMService(BaseLLMService):
             "max_tokens": provider_config.get("max_tokens", 1024),
             "supports_streaming": True,
             "supports_functions": True,
-            "provider": "openai"
+            "supports_reasoning": self.is_reasoning_model,
+            "supports_deep_research": self.supports_deep_research,
+            "provider": "openai",
+            "model_type": "reasoning" if self.is_reasoning_model else "standard"
         }
 
 
     async def chat(
         self,
         input_data: Union[str, List[Dict[str, str]], Any],
-        max_tokens: Optional[int] = None
+        max_tokens: Optional[int] = None,
+        show_reasoning: bool = False
     ) -> Dict[str, Any]:
         """
         Chat method that wraps ainvoke for compatibility with base class
@@ -267,13 +671,14 @@ class OpenAILLMService(BaseLLMService):
         Args:
             input_data: Input messages
            max_tokens: Maximum tokens to generate
+            show_reasoning: Whether to show reasoning process (for O4 models)
 
         Returns:
             Dict containing chat response with properly formatted message object
         """
         try:
-            # Call ainvoke and get the response (already processed by adapter)
-            response = await self.ainvoke(input_data)
+            # Call ainvoke with show_reasoning parameter
+            response = await self.ainvoke(input_data, show_reasoning=show_reasoning)
 
             # Return the response as-is (adapter already formatted it correctly)
             # For LangChain inputs, this will be an AIMessage object
@@ -284,7 +689,9 @@ class OpenAILLMService(BaseLLMService):
                 "metadata": {
                     "model": self.model_name,
                     "provider": self.provider_name,
-                    "max_tokens": max_tokens or self.max_tokens
+                    "max_tokens": max_tokens or self.max_tokens,
+                    "show_reasoning": show_reasoning,
+                    "is_reasoning_model": self.is_reasoning_model
                 }
             }
 
@@ -299,6 +706,213 @@ class OpenAILLMService(BaseLLMService):
                 }
             }
 
+    async def deep_research(
+        self,
+        input_data: Union[str, List[Dict[str, str]], Any],
+        research_type: Optional[str] = None,
+        search_enabled: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Deep research task - designed for deep research models, using the OpenAI Responses API
+
+        Args:
+            input_data: Research query or question
+            research_type: Research type (academic, market, competitive, etc.)
+            search_enabled: Whether to enable web search
+
+        Returns:
+            Dict containing research results
+        """
+        if not self.supports_deep_research:
+            # Fallback to regular chat for non-deep-research models
+            logger.info(f"Model {self.model_name} doesn't support deep research, falling back to regular chat")
+            return await self.chat(input_data)
+
+        try:
+            # Prepare messages with research context
+            messages = self._prepare_messages(input_data)
+
+            # Add research-specific system prompt if research_type is specified
+            if research_type and messages:
+                research_prompts = {
+                    "academic": "You are conducting academic research. Please provide thorough, well-sourced analysis with proper citations and methodical reasoning.",
+                    "market": "You are conducting market research. Focus on market trends, competitive analysis, and business insights.",
+                    "competitive": "You are conducting competitive analysis. Compare and contrast different approaches, solutions, or entities.",
+                    "technical": "You are conducting technical research. Provide detailed technical analysis with implementation considerations."
+                }
+
+                if research_type in research_prompts:
+                    # Insert system message at the beginning
+                    system_msg = {"role": "system", "content": research_prompts[research_type]}
+                    if messages[0].get("role") == "system":
+                        messages[0]["content"] = research_prompts[research_type] + "\n\n" + messages[0]["content"]
+                    else:
+                        messages.insert(0, system_msg)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages  # Responses API uses 'input' instead of 'messages'
+            }
+
+            # Responses API uses max_output_tokens instead of max_completion_tokens
+            max_tokens_value = provider_config.get("max_tokens", 4096)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Deep research models require web_search_preview tool when search is enabled
+            if search_enabled:
+                kwargs["tools"] = [
+                    {
+                        "type": "web_search_preview"
+                    }
+                ]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Check if streaming is enabled
+            if self.streaming:
+                # Use streaming mode for deep research
+                logger.info(f"Using Responses API streaming for deep research model {self.model_name}")
+                kwargs["stream"] = True
+
+                content_chunks = []
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    if event.type == 'response.output_text.delta':
+                        if event.delta:
+                            content_chunks.append(event.delta)
+
+                message_content = "".join(content_chunks)
+
+                # Track estimated usage for streaming
+                messages = self._prepare_messages(input_data)
+                self._track_streaming_usage(messages, message_content)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+            else:
+                # Use non-streaming mode for deep research
+                logger.info(f"Using Responses API for deep research model {self.model_name}")
+                response = await self.client.responses.create(**kwargs)
+
+                # Extract the response content from Responses API format
+                if hasattr(response, 'output_text'):
+                    # Modern Responses API format
+                    message_content = response.output_text
+                    usage_info = getattr(response, 'usage', None)
+                elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                    # Legacy Responses API format
+                    message_content = response.body.response
+                    usage_info = getattr(response.body, 'usage', None)
+                elif hasattr(response, 'choices') and response.choices:
+                    # Fallback to standard format
+                    message_content = response.choices[0].message.content
+                    usage_info = getattr(response, 'usage', None)
+                else:
+                    # Handle unexpected format
+                    message_content = str(response)
+                    usage_info = None
+
+                # Update usage tracking if available
+                if usage_info:
+                    self._update_token_usage(usage_info)
+                    await self._track_billing(usage_info)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+
+            return {
+                "result": formatted_response,
+                "research_type": research_type,
+                "search_enabled": search_enabled,
+                "success": True,
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "supports_deep_research": self.supports_deep_research,
+                    "reasoning_model": self.is_reasoning_model,
+                    "api_used": "responses"
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"Deep research failed: {e}")
+            return {
+                "result": None,
+                "success": False,
+                "error": str(e),
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "api_used": "responses"
+                }
+            }
+
     async def close(self):
         """Close the backend client"""
-        await self.client.close()
+        await self.client.close()
+
+    def _get_streaming_billing_info(self) -> Dict[str, Any]:
+        """Get billing information for streaming requests"""
+        try:
+            # Check if service has model_manager with billing_tracker
+            if hasattr(self, 'model_manager') and hasattr(self.model_manager, 'billing_tracker'):
+                billing_tracker = self.model_manager.billing_tracker
+
+                # Get the latest usage record for this model
+                model_records = [
+                    record for record in billing_tracker.usage_records
+                    if record.model_id == self.model_name
+                ]
+
+                if model_records:
+                    # Get the most recent record
+                    latest_record = max(model_records, key=lambda r: r.timestamp)
+
+                    return {
+                        "cost_usd": latest_record.cost_usd,
+                        "input_tokens": latest_record.input_tokens,
+                        "output_tokens": latest_record.output_tokens,
+                        "total_tokens": latest_record.total_tokens,
+                        "operation": latest_record.operation,
+                        "timestamp": latest_record.timestamp,
+                        "currency": "USD"
+                    }
+
+            # Fallback: use last token usage with estimated cost
+            last_usage = self.get_last_token_usage()
+            estimated_cost = 0.0
+
+            if hasattr(self, 'model_manager'):
+                estimated_cost = self.model_manager.calculate_cost(
+                    provider=self.provider_name,
+                    model_name=self.model_name,
+                    input_tokens=last_usage.get("prompt_tokens", 0),
+                    output_tokens=last_usage.get("completion_tokens", 0)
+                )
+
+            return {
+                "cost_usd": estimated_cost,
+                "input_tokens": last_usage.get("prompt_tokens", 0),
+                "output_tokens": last_usage.get("completion_tokens", 0),
+                "total_tokens": last_usage.get("total_tokens", 0),
+                "operation": "chat",
+                "timestamp": None,
+                "currency": "USD",
+                "note": "Estimated from last token usage"
+            }
+
+        except Exception as e:
+            logger.warning(f"Failed to get streaming billing info: {e}")
+            return {
+                "cost_usd": 0.0,
+                "error": str(e),
+                "currency": "USD"
+            }