isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +1166 -584
- isa_model/core/cache/redis_cache.py +410 -0
- isa_model/core/config/config_manager.py +282 -12
- isa_model/core/config.py +91 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +297 -0
- isa_model/core/database/supabase_client.py +258 -0
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +46 -0
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +66 -25
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +217 -55
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +479 -370
- isa_model/core/storage/hf_storage.py +2 -2
- isa_model/core/types.py +8 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -368
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/modal/deployer.py +894 -0
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +179 -16
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +53 -11
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +361 -26
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +11 -3
- isa_model/inference/services/llm/openai_llm_service.py +670 -56
- isa_model/inference/services/llm/yyds_llm_service.py +10 -3
- isa_model/inference/services/vision/__init__.py +27 -6
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +19 -10
- isa_model/inference/services/vision/isa_vision_service.py +634 -0
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +240 -18
- isa_model/serving/api/middleware/auth.py +317 -0
- isa_model/serving/api/middleware/security.py +268 -0
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +489 -0
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +475 -0
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +992 -171
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +318 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks.py +0 -469
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -18
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/factory.py +0 -531
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/metrics.py +0 -798
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model/training/__init__.py +0 -74
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -23
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/factory.py +0 -424
- isa_model-0.3.91.dist-info/RECORD +0 -138
- /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/inference/services/llm/openai_llm_service.py

@@ -1,6 +1,7 @@
 import logging
 import os
 import json
+import asyncio
 from typing import Dict, Any, List, Union, AsyncGenerator, Optional, Callable

 # 使用官方 OpenAI 库
@@ -17,9 +18,18 @@ class OpenAILLMService(BaseLLMService):
     def __init__(self, model_name: str = "gpt-4o-mini", provider_name: str = "openai", **kwargs):
         super().__init__(provider_name, model_name, **kwargs)

+        # Check if this is an O-series reasoning model
+        self.is_reasoning_model = model_name.startswith("o4-") or model_name.startswith("o3-")
+        self.uses_completion_tokens = self.is_reasoning_model or model_name.startswith("gpt-5")
+        self.requires_default_temperature = self.is_reasoning_model or model_name.startswith("gpt-5")
+        self.supports_deep_research = "deep-search" in model_name or "deep-research" in model_name
+
         # Get configuration from centralized config manager
         provider_config = self.get_provider_config()

+        # Check if reasoning summary is enabled (requires verified organization)
+        self.enable_reasoning_summary = provider_config.get("enable_reasoning_summary", False)
+
         # Initialize AsyncOpenAI client with provider configuration
         try:
             if not provider_config.get("api_key"):
@@ -28,7 +38,9 @@ class OpenAILLMService(BaseLLMService):
             self.client = AsyncOpenAI(
                 api_key=provider_config["api_key"],
                 base_url=provider_config.get("api_base_url", "https://api.openai.com/v1"),
-                organization=provider_config.get("organization")
+                organization=provider_config.get("organization"),
+                timeout=10.0,  # 10 second timeout for first token (much faster than 600s default)
+                max_retries=2  # Retry on timeout
             )

             logger.info(f"Initialized OpenAILLMService with model {self.model_name} and endpoint {self.client.base_url}")
@@ -40,11 +52,42 @@ class OpenAILLMService(BaseLLMService):
         self.last_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
         self.total_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "requests_count": 0}

+        # For O-series models, track reasoning tokens separately
+        if self.is_reasoning_model:
+            self.last_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] = 0
+

     def _create_bound_copy(self) -> 'OpenAILLMService':
         """Create a copy of this service for tool binding"""
-
-        bound_service
+        # Create new instance but bypass full initialization
+        bound_service = object.__new__(OpenAILLMService)
+
+        # Copy all essential attributes from original service
+        bound_service.model_name = self.model_name
+        bound_service.provider_name = self.provider_name
+        bound_service.client = self.client  # Reuse the same OpenAI client
+        bound_service.last_token_usage = self.last_token_usage.copy()
+        bound_service.total_token_usage = self.total_token_usage.copy()
+        bound_service._bound_tools = self._bound_tools.copy() if self._bound_tools else []
+        bound_service.adapter_manager = self.adapter_manager  # Reuse adapter manager
+
+        # Copy OpenAI-specific attributes
+        bound_service.is_reasoning_model = self.is_reasoning_model
+        bound_service.uses_completion_tokens = self.uses_completion_tokens
+        bound_service.requires_default_temperature = self.requires_default_temperature
+        bound_service.supports_deep_research = self.supports_deep_research
+
+        # Copy base class attributes
+        bound_service.streaming = self.streaming
+        bound_service.max_tokens = self.max_tokens
+        bound_service.temperature = self.temperature
+        bound_service._tool_mappings = {}
+
+        # Copy BaseService attributes that are needed
+        bound_service.config_manager = self.config_manager
+        bound_service.model_manager = self.model_manager
+
         return bound_service

     def bind_tools(self, tools: List[Any], **kwargs) -> 'OpenAILLMService':
@@ -66,16 +109,133 @@ class OpenAILLMService(BaseLLMService):

         return bound_service

-    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any]) -> AsyncGenerator[str, None]:
+    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False, **extra_kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
         """
         True streaming method - yields tokens one by one as they arrive

         Args:
             input_data: Same as ainvoke
+            show_reasoning: If True and model supports it, show reasoning process using Responses API

         Yields:
-            Individual tokens as they arrive from the API
+            Individual tokens as they arrive from the API, plus final result object with tool_calls
         """
+        try:
+            # Determine which API to use for streaming
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
+            if use_responses_api:
+                logger.info(f"Using Responses API streaming for {self.model_name}")
+                # Use Responses API streaming
+                async for chunk in self._astream_responses_api(input_data, show_reasoning, **extra_kwargs):
+                    yield chunk
+            else:
+                logger.debug(f"Using Chat Completions API streaming for {self.model_name}")
+                # Use Chat Completions API streaming
+                async for chunk in self._astream_chat_completions_api(input_data, **extra_kwargs):
+                    yield chunk
+
+        except Exception as e:
+            logger.error(f"Error in astream: {e}")
+            raise
+
+    async def _astream_responses_api(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False, **extra_kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Responses API for reasoning models and deep research models"""
+        try:
+            # Use adapter manager to prepare messages
+            messages = self._prepare_messages(input_data)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages,  # Responses API uses 'input' instead of 'messages'
+                "stream": True
+            }
+
+            # Responses API uses max_output_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Add reasoning configuration if needed (optional - requires verified organization)
+            if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                kwargs["reasoning"] = {"summary": "auto"}
+                logger.info("Reasoning summary enabled - using verified organization features")
+            elif show_reasoning and self.is_reasoning_model:
+                logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+            # Deep research models require web_search_preview tool
+            if self.supports_deep_research:
+                kwargs["tools"] = [{"type": "web_search_preview"}]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Stream using Responses API
+            content_chunks = []
+            reasoning_items = []
+
+            try:
+                logger.info(f"Streaming with Responses API for model {self.model_name}")
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    # Handle different event types from Responses API
+                    if event.type == 'response.output_text.delta':
+                        # Stream text content
+                        if event.delta:
+                            content_chunks.append(event.delta)
+                            yield event.delta
+
+                    elif event.type == 'response.reasoning.delta' and show_reasoning:
+                        # Stream reasoning content (if enabled)
+                        if hasattr(event, 'delta') and event.delta:
+                            yield f"[思考: {event.delta}]"
+
+                    elif event.type == 'response.output_item.done':
+                        # Handle completed items (reasoning, function calls, etc.)
+                        if hasattr(event, 'item'):
+                            if event.item.type == 'reasoning':
+                                reasoning_items.append(event.item)
+                            elif event.item.type == 'function_call':
+                                # Handle function call completion
+                                logger.debug(f"Function call completed: {event.item}")
+
+                # Create final response object
+                full_content = "".join(content_chunks)
+
+                # Track usage for streaming
+                self._track_streaming_usage(messages, full_content)
+
+                # Get billing info
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Format final result
+                final_result = self._format_response(full_content, input_data)
+
+                # Yield final result with metadata
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "reasoning_items": len(reasoning_items),
+                    "api_used": "responses"
+                }
+
+            except Exception as e:
+                logger.error(f"Error in Responses API streaming: {e}")
+                raise
+
+        except Exception as e:
+            logger.error(f"Error in _astream_responses_api: {e}")
+            raise
+
+    async def _astream_chat_completions_api(self, input_data: Union[str, List[Dict[str, str]], Any], **extra_kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Chat Completions API for standard models"""
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)
@@ -85,86 +245,275 @@ class OpenAILLMService(BaseLLMService):
             kwargs = {
                 "model": self.model_name,
                 "messages": messages,
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024),
                 "stream": True
             }

+            # O4 and GPT-5 models only support temperature=1 (default)
+            if not self.requires_default_temperature:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 and GPT-5 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.uses_completion_tokens:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
                 kwargs["tool_choice"] = "auto"

-            #
+            # Add response_format if specified (for JSON mode)
+            if 'response_format' in extra_kwargs:
+                kwargs['response_format'] = extra_kwargs['response_format']
+                logger.debug(f"Using response_format in streaming: {extra_kwargs['response_format']}")
+
+            # Stream tokens and detect tool calls
             content_chunks = []
+            tool_calls_accumulator = {}  # Track complete tool calls by ID
+            has_tool_calls = False
+
             try:
                 stream = await self.client.chat.completions.create(**kwargs)
                 async for chunk in stream:
-
-
-
-
+                    delta = chunk.choices[0].delta
+
+                    # Check for tool calls first
+                    if hasattr(delta, 'tool_calls') and delta.tool_calls:
+                        has_tool_calls = True
+                        for tool_call in delta.tool_calls:
+                            tool_index = getattr(tool_call, 'index', 0)  # OpenAI uses index for streaming
+
+                            # Use index as key since streaming tool calls use index
+                            tool_key = f"tool_{tool_index}"
+
+                            # Initialize tool call if not seen before
+                            if tool_key not in tool_calls_accumulator:
+                                tool_calls_accumulator[tool_key] = {
+                                    'id': getattr(tool_call, 'id', f"call_{tool_index}"),
+                                    'type': 'function',
+                                    'function': {
+                                        'name': '',
+                                        'arguments': ''
+                                    }
+                                }
+
+                            # Accumulate function name
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'name') and tool_call.function.name:
+                                tool_calls_accumulator[tool_key]['function']['name'] += tool_call.function.name
+
+                            # Accumulate function arguments
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'arguments'):
+                                if tool_call.function.arguments:
+                                    tool_calls_accumulator[tool_key]['function']['arguments'] += tool_call.function.arguments
+
+                    # Handle regular content - only stream if no tool calls detected
+                    elif delta.content:
+                        content_chunks.append(delta.content)
+                        if not has_tool_calls:  # Only yield content if no tool calls
+                            yield delta.content
+
+                # Always yield final result at the end
+                # - If has tool_calls: complete structured response (no prior streaming)
+                # - If no tool_calls: AIMessage after streaming content
+
+                # Create a mock message object for adapter processing
+                class MockMessage:
+                    def __init__(self):
+                        self.content = "".join(content_chunks) or ""
+                        self.tool_calls = []
+                        # Add tool_calls if any
+                        if tool_calls_accumulator:
+                            for tool_data in tool_calls_accumulator.values():
+                                mock_tool_call = type('MockToolCall', (), {
+                                    'id': tool_data['id'],
+                                    'function': type('MockFunction', (), {
+                                        'name': tool_data['function']['name'],
+                                        'arguments': tool_data['function']['arguments']
+                                    })()
+                                })()
+                                self.tool_calls.append(mock_tool_call)
+
+                mock_message = MockMessage()
+
+                logger.debug(f"Streaming complete - tool calls collected: {len(mock_message.tool_calls)}")
+                for i, tc in enumerate(mock_message.tool_calls):
+                    logger.debug(f"  Tool call {i+1}: {tc.function.name} with args: {tc.function.arguments}")
+
+                # Format response using adapter (this handles LangChain conversion)
+                final_result = self._format_response(mock_message, input_data)
+
+                logger.debug(f"Final result type after adapter: {type(final_result)}")
+                logger.debug(f"Final result has tool_calls: {hasattr(final_result, 'tool_calls')}")

                 # Track usage after streaming is complete
                 full_content = "".join(content_chunks)
                 self._track_streaming_usage(messages, full_content)

+                # Get billing info after tracking (wait a moment for billing to be recorded)
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Yield the final result with billing info
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "api_used": "chat_completions"
+                }
+
             except Exception as e:
-                logger.error(f"Error in streaming: {e}")
+                logger.error(f"Error in Chat Completions streaming: {e}")
                 raise

         except Exception as e:
-            logger.error(f"Error in 
+            logger.error(f"Error in _astream_chat_completions_api: {e}")
             raise

-    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any]) -> Union[str, Any]:
-        """
+    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False, **extra_kwargs) -> Union[str, Any]:
+        """
+        Unified invoke method for all input types
+
+        Args:
+            input_data: Input messages or text
+            show_reasoning: If True and model supports it, show reasoning process using Responses API
+            **extra_kwargs: Additional parameters to pass to the API (e.g., response_format)
+        """
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)

+            # Determine which API to use
+            # Responses API is required for:
+            # 1. Reasoning models with show_reasoning=True
+            # 2. Deep research models (they only work with Responses API)
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
             # Prepare request kwargs
             provider_config = self.get_provider_config()
             kwargs = {
                 "model": self.model_name,
-                "messages": messages
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
+                "messages": messages
             }

+            # O4 and GPT-5 models only support temperature=1 (default)
+            if not self.requires_default_temperature:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 and GPT-5 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.uses_completion_tokens:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
-
+                if not use_responses_api:  # Responses API handles tool choice differently
+                    kwargs["tool_choice"] = "auto"
+
+            # Add response_format if specified (for JSON mode)
+            if 'response_format' in extra_kwargs:
+                kwargs['response_format'] = extra_kwargs['response_format']
+                logger.debug(f"Using response_format: {extra_kwargs['response_format']}")

             # Handle streaming vs non-streaming
             if self.streaming:
                 # TRUE STREAMING MODE - collect all chunks from the stream
                 content_chunks = []
-                async for token in self.astream(input_data):
-
-
+                async for token in self.astream(input_data, show_reasoning=show_reasoning, **extra_kwargs):
+                    if isinstance(token, str):
+                        content_chunks.append(token)
+                    elif isinstance(token, dict) and "result" in token:
+                        # Return the final result from streaming
+                        return token["result"]

+                # Fallback: join collected content
+                content = "".join(content_chunks)
                 return self._format_response(content, input_data)
             else:
-                # Non-streaming mode
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Non-streaming mode - choose API based on reasoning visibility
+                if use_responses_api:
+                    logger.info(f"Using Responses API for model {self.model_name}")
+
+                    # Convert kwargs for Responses API
+                    responses_kwargs = {
+                        "model": kwargs["model"],
+                        "input": kwargs["messages"]  # Responses API uses 'input' instead of 'messages'
+                    }
+
+                    # Handle max tokens parameter
+                    if "max_completion_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_completion_tokens"]
+                    elif "max_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_tokens"]
+
+                    # Add tools if present
+                    if "tools" in kwargs:
+                        responses_kwargs["tools"] = kwargs["tools"]
+
+                    # Add reasoning configuration for reasoning models (requires verified organization)
+                    if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                        responses_kwargs["reasoning"] = {"summary": "auto"}
+                        logger.info("Reasoning summary enabled - using verified organization features")
+                    elif show_reasoning and self.is_reasoning_model:
+                        logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+                    # Deep research models require web_search_preview tool
+                    if self.supports_deep_research:
+                        if "tools" not in responses_kwargs:
+                            responses_kwargs["tools"] = []
+                        responses_kwargs["tools"].insert(0, {"type": "web_search_preview"})
+
+                    response = await self.client.responses.create(**responses_kwargs)
+
+                    # Handle Responses API format
+                    if hasattr(response, 'output_text'):
+                        # Modern Responses API format
+                        content = response.output_text
+                        usage_info = getattr(response, 'usage', None)
+                    elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                        # Legacy format
+                        content = response.body.response
+                        usage_info = getattr(response.body, 'usage', None)
+                    else:
+                        # Fallback handling
+                        content = str(response)
+                        usage_info = None
+
+                    # Update usage tracking if available
+                    if usage_info:
+                        self._update_token_usage(usage_info)
+                        await self._track_billing(usage_info)
+
+                    return self._format_response(content, input_data)
+                else:
+                    # Standard Chat Completions API
+                    response = await self.client.chat.completions.create(**kwargs)
+                    message = response.choices[0].message
+
+                    # Debug: Log the raw OpenAI response
+                    logger.debug(f"OpenAI response message: {message}")
+                    if message.tool_calls:
+                        logger.debug(f"Tool calls found: {len(message.tool_calls)}")
+                        for i, tc in enumerate(message.tool_calls):
+                            logger.debug(f"  Tool call {i+1}: id={tc.id}, function={tc.function.name}, args={tc.function.arguments}")
+
+                    # Update usage tracking
+                    if response.usage:
+                        self._update_token_usage(response.usage)
+                        await self._track_billing(response.usage)
+
+                    # Handle tool calls if present - let adapter process the complete message
+                    if message.tool_calls:
+                        # Pass the complete message object to adapter for proper tool_calls handling
+                        return self._format_response(message, input_data)
+
+                    # Return appropriate format based on input type
+                    return self._format_response(message.content or "", input_data)

         except Exception as e:
             logger.error(f"Error in ainvoke: {e}")
@@ -210,11 +559,42 @@ class OpenAILLMService(BaseLLMService):

     def _update_token_usage(self, usage):
         """Update token usage statistics"""
-
-
-
-
-
+        # Handle different usage object structures (Chat Completions vs Responses API)
+        if hasattr(usage, 'prompt_tokens'):
+            # Chat Completions API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.prompt_tokens,
+                "completion_tokens": usage.completion_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        elif hasattr(usage, 'input_tokens'):
+            # Responses API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.input_tokens,
+                "completion_tokens": usage.output_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        else:
+            # Fallback for unknown usage format
+            logger.warning(f"Unknown usage format: {type(usage)}, attributes: {dir(usage)}")
+            self.last_token_usage = {
+                "prompt_tokens": 0,
+                "completion_tokens": 0,
+                "total_tokens": 0
+            }
+
+        # For O-series models, track reasoning tokens if available
+        if self.is_reasoning_model:
+            reasoning_tokens = 0
+            if hasattr(usage, 'reasoning_tokens'):
+                reasoning_tokens = usage.reasoning_tokens
+            elif hasattr(usage, 'output_tokens_details') and hasattr(usage.output_tokens_details, 'reasoning_tokens'):
+                reasoning_tokens = usage.output_tokens_details.reasoning_tokens
+
+            self.last_token_usage["reasoning_tokens"] = reasoning_tokens
+            if "reasoning_tokens" not in self.total_token_usage:
+                self.total_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] += reasoning_tokens

         # Update total usage
         self.total_token_usage["prompt_tokens"] += self.last_token_usage["prompt_tokens"]
@@ -225,15 +605,35 @@ class OpenAILLMService(BaseLLMService):
     async def _track_billing(self, usage):
         """Track billing information"""
         provider_config = self.get_provider_config()
+
+        # Prepare metadata for tracking
+        metadata = {
+            "temperature": provider_config.get("temperature", 0.7),
+            "max_tokens": provider_config.get("max_tokens", 1024),
+            "is_reasoning_model": self.is_reasoning_model
+        }
+
+        # Add reasoning tokens if available for O-series models
+        if self.is_reasoning_model and hasattr(usage, 'reasoning_tokens'):
+            metadata["reasoning_tokens"] = usage.reasoning_tokens
+
+        # Get tokens using the same logic as _update_token_usage
+        if hasattr(usage, 'prompt_tokens'):
+            input_tokens = usage.prompt_tokens
+            output_tokens = usage.completion_tokens
+        elif hasattr(usage, 'input_tokens'):
+            input_tokens = usage.input_tokens
+            output_tokens = usage.output_tokens
+        else:
+            input_tokens = 0
+            output_tokens = 0
+
         await self._track_usage(
             service_type=ServiceType.LLM,
             operation="chat",
-            input_tokens=
-            output_tokens=
-            metadata=
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
-            }
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            metadata=metadata
         )

     def get_token_usage(self) -> Dict[str, Any]:
@@ -252,14 +652,18 @@ class OpenAILLMService(BaseLLMService):
             "max_tokens": provider_config.get("max_tokens", 1024),
             "supports_streaming": True,
             "supports_functions": True,
-            "
+            "supports_reasoning": self.is_reasoning_model,
+            "supports_deep_research": self.supports_deep_research,
+            "provider": "openai",
+            "model_type": "reasoning" if self.is_reasoning_model else "standard"
         }


     async def chat(
         self,
         input_data: Union[str, List[Dict[str, str]], Any],
-        max_tokens: Optional[int] = None
+        max_tokens: Optional[int] = None,
+        show_reasoning: bool = False
     ) -> Dict[str, Any]:
         """
         Chat method that wraps ainvoke for compatibility with base class
@@ -267,13 +671,14 @@ class OpenAILLMService(BaseLLMService):
         Args:
             input_data: Input messages
             max_tokens: Maximum tokens to generate
+            show_reasoning: Whether to show reasoning process (for O4 models)

         Returns:
             Dict containing chat response with properly formatted message object
         """
         try:
-            # Call ainvoke
-            response = await self.ainvoke(input_data)
+            # Call ainvoke with show_reasoning parameter
+            response = await self.ainvoke(input_data, show_reasoning=show_reasoning)

             # Return the response as-is (adapter already formatted it correctly)
             # For LangChain inputs, this will be an AIMessage object
@@ -284,7 +689,9 @@ class OpenAILLMService(BaseLLMService):
                 "metadata": {
                     "model": self.model_name,
                     "provider": self.provider_name,
-                    "max_tokens": max_tokens or self.max_tokens
+                    "max_tokens": max_tokens or self.max_tokens,
+                    "show_reasoning": show_reasoning,
+                    "is_reasoning_model": self.is_reasoning_model
                 }
             }
         except Exception as e:
@@ -299,6 +706,213 @@ class OpenAILLMService(BaseLLMService):
                 }
             }

+    async def deep_research(
+        self,
+        input_data: Union[str, List[Dict[str, str]], Any],
+        research_type: Optional[str] = None,
+        search_enabled: bool = True
+    ) -> Dict[str, Any]:
+        """
+        深度研究任务 - 专为深度研究模型设计,使用OpenAI Responses API
+
+        Args:
+            input_data: 研究查询或问题
+            research_type: 研究类型 (academic, market, competitive, etc.)
+            search_enabled: 是否启用网络搜索
+
+        Returns:
+            Dict containing research results
+        """
+        if not self.supports_deep_research:
+            # Fallback to regular chat for non-deep-research models
+            logger.info(f"Model {self.model_name} doesn't support deep research, falling back to regular chat")
+            return await self.chat(input_data)
+
+        try:
+            # Prepare messages with research context
+            messages = self._prepare_messages(input_data)
+
+            # Add research-specific system prompt if research_type is specified
+            if research_type and messages:
+                research_prompts = {
+                    "academic": "You are conducting academic research. Please provide thorough, well-sourced analysis with proper citations and methodical reasoning.",
+                    "market": "You are conducting market research. Focus on market trends, competitive analysis, and business insights.",
+                    "competitive": "You are conducting competitive analysis. Compare and contrast different approaches, solutions, or entities.",
+                    "technical": "You are conducting technical research. Provide detailed technical analysis with implementation considerations."
+                }
+
+                if research_type in research_prompts:
+                    # Insert system message at the beginning
+                    system_msg = {"role": "system", "content": research_prompts[research_type]}
+                    if messages[0].get("role") == "system":
+                        messages[0]["content"] = research_prompts[research_type] + "\n\n" + messages[0]["content"]
+                    else:
+                        messages.insert(0, system_msg)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages  # Responses API uses 'input' instead of 'messages'
+            }
+
+            # Responses API uses max_output_tokens instead of max_completion_tokens
+            max_tokens_value = provider_config.get("max_tokens", 4096)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Deep research models require web_search_preview tool when search is enabled
+            if search_enabled:
+                kwargs["tools"] = [
+                    {
+                        "type": "web_search_preview"
+                    }
+                ]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Check if streaming is enabled
+            if self.streaming:
+                # Use streaming mode for deep research
+                logger.info(f"Using Responses API streaming for deep research model {self.model_name}")
+                kwargs["stream"] = True
+
+                content_chunks = []
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    if event.type == 'response.output_text.delta':
+                        if event.delta:
+                            content_chunks.append(event.delta)
+
+                message_content = "".join(content_chunks)
+
+                # Track estimated usage for streaming
+                messages = self._prepare_messages(input_data)
+                self._track_streaming_usage(messages, message_content)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+            else:
+                # Use non-streaming mode for deep research
+                logger.info(f"Using Responses API for deep research model {self.model_name}")
+                response = await self.client.responses.create(**kwargs)
+
+                # Extract the response content from Responses API format
+                if hasattr(response, 'output_text'):
+                    # Modern Responses API format
+                    message_content = response.output_text
+                    usage_info = getattr(response, 'usage', None)
+                elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                    # Legacy Responses API format
+                    message_content = response.body.response
+                    usage_info = getattr(response.body, 'usage', None)
+                elif hasattr(response, 'choices') and response.choices:
+                    # Fallback to standard format
+                    message_content = response.choices[0].message.content
+                    usage_info = getattr(response, 'usage', None)
+                else:
+                    # Handle unexpected format
+                    message_content = str(response)
+                    usage_info = None
+
+                # Update usage tracking if available
+                if usage_info:
+                    self._update_token_usage(usage_info)
+                    await self._track_billing(usage_info)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+
+            return {
+                "result": formatted_response,
+                "research_type": research_type,
+                "search_enabled": search_enabled,
+                "success": True,
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "supports_deep_research": self.supports_deep_research,
+                    "reasoning_model": self.is_reasoning_model,
+                    "api_used": "responses"
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"Deep research failed: {e}")
+            return {
+                "result": None,
+                "success": False,
+                "error": str(e),
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "api_used": "responses"
+                }
+            }
+
     async def close(self):
         """Close the backend client"""
-        await self.client.close()
+        await self.client.close()
+
+    def _get_streaming_billing_info(self) -> Dict[str, Any]:
+        """Get billing information for streaming requests"""
+        try:
+            # Check if service has model_manager with billing_tracker
+            if hasattr(self, 'model_manager') and hasattr(self.model_manager, 'billing_tracker'):
+                billing_tracker = self.model_manager.billing_tracker
+
+                # Get the latest usage record for this model
+                model_records = [
+                    record for record in billing_tracker.usage_records
+                    if record.model_id == self.model_name
+                ]
+
+                if model_records:
+                    # Get the most recent record
+                    latest_record = max(model_records, key=lambda r: r.timestamp)
+
+                    return {
+                        "cost_usd": latest_record.cost_usd,
+                        "input_tokens": latest_record.input_tokens,
+                        "output_tokens": latest_record.output_tokens,
+                        "total_tokens": latest_record.total_tokens,
+                        "operation": latest_record.operation,
+                        "timestamp": latest_record.timestamp,
+                        "currency": "USD"
+                    }
+
+            # Fallback: use last token usage with estimated cost
+            last_usage = self.get_last_token_usage()
+            estimated_cost = 0.0
+
+            if hasattr(self, 'model_manager'):
+                estimated_cost = self.model_manager.calculate_cost(
+                    provider=self.provider_name,
+                    model_name=self.model_name,
+                    input_tokens=last_usage.get("prompt_tokens", 0),
+                    output_tokens=last_usage.get("completion_tokens", 0)
+                )
+
+            return {
+                "cost_usd": estimated_cost,
+                "input_tokens": last_usage.get("prompt_tokens", 0),
+                "output_tokens": last_usage.get("completion_tokens", 0),
+                "total_tokens": last_usage.get("total_tokens", 0),
+                "operation": "chat",
+                "timestamp": None,
+                "currency": "USD",
+                "note": "Estimated from last token usage"
+            }
+
+        except Exception as e:
+            logger.warning(f"Failed to get streaming billing info: {e}")
+            return {
+                "cost_usd": 0.0,
+                "error": str(e),
+                "currency": "USD"
+            }