isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/inference/services/llm/openai_llm_service.py (+670 -56)

@@ -1,6 +1,7 @@
 import logging
 import os
 import json
+import asyncio
 from typing import Dict, Any, List, Union, AsyncGenerator, Optional, Callable
 
 # Use the official OpenAI library
@@ -17,9 +18,18 @@ class OpenAILLMService(BaseLLMService):
     def __init__(self, model_name: str = "gpt-4o-mini", provider_name: str = "openai", **kwargs):
         super().__init__(provider_name, model_name, **kwargs)
 
+        # Check if this is an O-series reasoning model
+        self.is_reasoning_model = model_name.startswith("o4-") or model_name.startswith("o3-")
+        self.uses_completion_tokens = self.is_reasoning_model or model_name.startswith("gpt-5")
+        self.requires_default_temperature = self.is_reasoning_model or model_name.startswith("gpt-5")
+        self.supports_deep_research = "deep-search" in model_name or "deep-research" in model_name
+
         # Get configuration from centralized config manager
         provider_config = self.get_provider_config()
 
+        # Check if reasoning summary is enabled (requires verified organization)
+        self.enable_reasoning_summary = provider_config.get("enable_reasoning_summary", False)
+
         # Initialize AsyncOpenAI client with provider configuration
         try:
             if not provider_config.get("api_key"):
@@ -28,7 +38,9 @@ class OpenAILLMService(BaseLLMService):
             self.client = AsyncOpenAI(
                 api_key=provider_config["api_key"],
                 base_url=provider_config.get("api_base_url", "https://api.openai.com/v1"),
-                organization=provider_config.get("organization")
+                organization=provider_config.get("organization"),
+                timeout=10.0,  # 10 second timeout for first token (much faster than 600s default)
+                max_retries=2  # Retry on timeout
             )
 
             logger.info(f"Initialized OpenAILLMService with model {self.model_name} and endpoint {self.client.base_url}")
@@ -40,11 +52,42 @@ class OpenAILLMService(BaseLLMService):
         self.last_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
         self.total_token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "requests_count": 0}
 
+        # For O-series models, track reasoning tokens separately
+        if self.is_reasoning_model:
+            self.last_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] = 0
+
 
     def _create_bound_copy(self) -> 'OpenAILLMService':
         """Create a copy of this service for tool binding"""
-        bound_service = OpenAILLMService(self.model_name, self.provider_name)
-        bound_service._bound_tools = self._bound_tools.copy()
+        # Create new instance but bypass full initialization
+        bound_service = object.__new__(OpenAILLMService)
+
+        # Copy all essential attributes from original service
+        bound_service.model_name = self.model_name
+        bound_service.provider_name = self.provider_name
+        bound_service.client = self.client  # Reuse the same OpenAI client
+        bound_service.last_token_usage = self.last_token_usage.copy()
+        bound_service.total_token_usage = self.total_token_usage.copy()
+        bound_service._bound_tools = self._bound_tools.copy() if self._bound_tools else []
+        bound_service.adapter_manager = self.adapter_manager  # Reuse adapter manager
+
+        # Copy OpenAI-specific attributes
+        bound_service.is_reasoning_model = self.is_reasoning_model
+        bound_service.uses_completion_tokens = self.uses_completion_tokens
+        bound_service.requires_default_temperature = self.requires_default_temperature
+        bound_service.supports_deep_research = self.supports_deep_research
+
+        # Copy base class attributes
+        bound_service.streaming = self.streaming
+        bound_service.max_tokens = self.max_tokens
+        bound_service.temperature = self.temperature
+        bound_service._tool_mappings = {}
+
+        # Copy BaseService attributes that are needed
+        bound_service.config_manager = self.config_manager
+        bound_service.model_manager = self.model_manager
+
         return bound_service
 
     def bind_tools(self, tools: List[Any], **kwargs) -> 'OpenAILLMService':
@@ -66,16 +109,133 @@ class OpenAILLMService(BaseLLMService):
 
         return bound_service
 
-    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any]) -> AsyncGenerator[str, None]:
+    async def astream(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False, **extra_kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
         """
         True streaming method - yields tokens one by one as they arrive
 
         Args:
             input_data: Same as ainvoke
+            show_reasoning: If True and model supports it, show reasoning process using Responses API
 
         Yields:
-            Individual tokens as they arrive from the API
+            Individual tokens as they arrive from the API, plus final result object with tool_calls
         """
+        try:
+            # Determine which API to use for streaming
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
+            if use_responses_api:
+                logger.info(f"Using Responses API streaming for {self.model_name}")
+                # Use Responses API streaming
+                async for chunk in self._astream_responses_api(input_data, show_reasoning, **extra_kwargs):
+                    yield chunk
+            else:
+                logger.debug(f"Using Chat Completions API streaming for {self.model_name}")
+                # Use Chat Completions API streaming
+                async for chunk in self._astream_chat_completions_api(input_data, **extra_kwargs):
+                    yield chunk
+
+        except Exception as e:
+            logger.error(f"Error in astream: {e}")
+            raise
+
+    async def _astream_responses_api(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False, **extra_kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Responses API for reasoning models and deep research models"""
+        try:
+            # Use adapter manager to prepare messages
+            messages = self._prepare_messages(input_data)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages,  # Responses API uses 'input' instead of 'messages'
+                "stream": True
+            }
+
+            # Responses API uses max_output_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Add reasoning configuration if needed (optional - requires verified organization)
+            if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                kwargs["reasoning"] = {"summary": "auto"}
+                logger.info("Reasoning summary enabled - using verified organization features")
+            elif show_reasoning and self.is_reasoning_model:
+                logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+            # Deep research models require web_search_preview tool
+            if self.supports_deep_research:
+                kwargs["tools"] = [{"type": "web_search_preview"}]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Stream using Responses API
+            content_chunks = []
+            reasoning_items = []
+
+            try:
+                logger.info(f"Streaming with Responses API for model {self.model_name}")
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    # Handle different event types from Responses API
+                    if event.type == 'response.output_text.delta':
+                        # Stream text content
+                        if event.delta:
+                            content_chunks.append(event.delta)
+                            yield event.delta
+
+                    elif event.type == 'response.reasoning.delta' and show_reasoning:
+                        # Stream reasoning content (if enabled)
+                        if hasattr(event, 'delta') and event.delta:
+                            yield f"[思考: {event.delta}]"
+
+                    elif event.type == 'response.output_item.done':
+                        # Handle completed items (reasoning, function calls, etc.)
+                        if hasattr(event, 'item'):
+                            if event.item.type == 'reasoning':
+                                reasoning_items.append(event.item)
+                            elif event.item.type == 'function_call':
+                                # Handle function call completion
+                                logger.debug(f"Function call completed: {event.item}")
+
+                # Create final response object
+                full_content = "".join(content_chunks)
+
+                # Track usage for streaming
+                self._track_streaming_usage(messages, full_content)
+
+                # Get billing info
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Format final result
+                final_result = self._format_response(full_content, input_data)
+
+                # Yield final result with metadata
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "reasoning_items": len(reasoning_items),
+                    "api_used": "responses"
+                }
+
+            except Exception as e:
+                logger.error(f"Error in Responses API streaming: {e}")
+                raise
+
+        except Exception as e:
+            logger.error(f"Error in _astream_responses_api: {e}")
+            raise
+
+    async def _astream_chat_completions_api(self, input_data: Union[str, List[Dict[str, str]], Any], **extra_kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
+        """Stream using Chat Completions API for standard models"""
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)
@@ -85,86 +245,275 @@ class OpenAILLMService(BaseLLMService):
             kwargs = {
                 "model": self.model_name,
                 "messages": messages,
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024),
                 "stream": True
             }
 
+            # O4 and GPT-5 models only support temperature=1 (default)
+            if not self.requires_default_temperature:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 and GPT-5 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.uses_completion_tokens:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
                 kwargs["tool_choice"] = "auto"
 
-            # Stream tokens one by one
+            # Add response_format if specified (for JSON mode)
+            if 'response_format' in extra_kwargs:
+                kwargs['response_format'] = extra_kwargs['response_format']
+                logger.debug(f"Using response_format in streaming: {extra_kwargs['response_format']}")
+
+            # Stream tokens and detect tool calls
             content_chunks = []
+            tool_calls_accumulator = {}  # Track complete tool calls by ID
+            has_tool_calls = False
+
             try:
                 stream = await self.client.chat.completions.create(**kwargs)
                 async for chunk in stream:
-                    content = chunk.choices[0].delta.content
-                    if content:
-                        content_chunks.append(content)
-                        yield content
+                    delta = chunk.choices[0].delta
+
+                    # Check for tool calls first
+                    if hasattr(delta, 'tool_calls') and delta.tool_calls:
+                        has_tool_calls = True
+                        for tool_call in delta.tool_calls:
+                            tool_index = getattr(tool_call, 'index', 0)  # OpenAI uses index for streaming
+
+                            # Use index as key since streaming tool calls use index
+                            tool_key = f"tool_{tool_index}"
+
+                            # Initialize tool call if not seen before
+                            if tool_key not in tool_calls_accumulator:
+                                tool_calls_accumulator[tool_key] = {
+                                    'id': getattr(tool_call, 'id', f"call_{tool_index}"),
+                                    'type': 'function',
+                                    'function': {
+                                        'name': '',
+                                        'arguments': ''
+                                    }
+                                }
+
+                            # Accumulate function name
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'name') and tool_call.function.name:
+                                tool_calls_accumulator[tool_key]['function']['name'] += tool_call.function.name
+
+                            # Accumulate function arguments
+                            if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'arguments'):
+                                if tool_call.function.arguments:
+                                    tool_calls_accumulator[tool_key]['function']['arguments'] += tool_call.function.arguments
+
+                    # Handle regular content - only stream if no tool calls detected
+                    elif delta.content:
+                        content_chunks.append(delta.content)
+                        if not has_tool_calls:  # Only yield content if no tool calls
+                            yield delta.content
+
+                # Always yield final result at the end
+                # - If has tool_calls: complete structured response (no prior streaming)
+                # - If no tool_calls: AIMessage after streaming content
+
+                # Create a mock message object for adapter processing
+                class MockMessage:
+                    def __init__(self):
+                        self.content = "".join(content_chunks) or ""
+                        self.tool_calls = []
+                        # Add tool_calls if any
+                        if tool_calls_accumulator:
+                            for tool_data in tool_calls_accumulator.values():
+                                mock_tool_call = type('MockToolCall', (), {
+                                    'id': tool_data['id'],
+                                    'function': type('MockFunction', (), {
+                                        'name': tool_data['function']['name'],
+                                        'arguments': tool_data['function']['arguments']
+                                    })()
+                                })()
+                                self.tool_calls.append(mock_tool_call)
+
+                mock_message = MockMessage()
+
+                logger.debug(f"Streaming complete - tool calls collected: {len(mock_message.tool_calls)}")
+                for i, tc in enumerate(mock_message.tool_calls):
+                    logger.debug(f"  Tool call {i+1}: {tc.function.name} with args: {tc.function.arguments}")
+
+                # Format response using adapter (this handles LangChain conversion)
+                final_result = self._format_response(mock_message, input_data)
+
+                logger.debug(f"Final result type after adapter: {type(final_result)}")
+                logger.debug(f"Final result has tool_calls: {hasattr(final_result, 'tool_calls')}")
 
                 # Track usage after streaming is complete
                 full_content = "".join(content_chunks)
                 self._track_streaming_usage(messages, full_content)
 
+                # Get billing info after tracking (wait a moment for billing to be recorded)
+                await asyncio.sleep(0.01)
+                billing_info = self._get_streaming_billing_info()
+
+                # Yield the final result with billing info
+                yield {
+                    "result": final_result,
+                    "billing": billing_info,
+                    "api_used": "chat_completions"
+                }
+
             except Exception as e:
-                logger.error(f"Error in streaming: {e}")
+                logger.error(f"Error in Chat Completions streaming: {e}")
                 raise
 
         except Exception as e:
-            logger.error(f"Error in astream: {e}")
+            logger.error(f"Error in _astream_chat_completions_api: {e}")
             raise
 
-    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any]) -> Union[str, Any]:
-        """Unified invoke method for all input types"""
+    async def ainvoke(self, input_data: Union[str, List[Dict[str, str]], Any], show_reasoning: bool = False, **extra_kwargs) -> Union[str, Any]:
+        """
+        Unified invoke method for all input types
+
+        Args:
+            input_data: Input messages or text
+            show_reasoning: If True and model supports it, show reasoning process using Responses API
+            **extra_kwargs: Additional parameters to pass to the API (e.g., response_format)
+        """
         try:
             # Use adapter manager to prepare messages
             messages = self._prepare_messages(input_data)
 
+            # Determine which API to use
+            # Responses API is required for:
+            # 1. Reasoning models with show_reasoning=True
+            # 2. Deep research models (they only work with Responses API)
+            use_responses_api = (show_reasoning and self.is_reasoning_model) or self.supports_deep_research
+
             # Prepare request kwargs
             provider_config = self.get_provider_config()
             kwargs = {
                 "model": self.model_name,
-                "messages": messages,
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
+                "messages": messages
             }
 
+            # O4 and GPT-5 models only support temperature=1 (default)
+            if not self.requires_default_temperature:
+                kwargs["temperature"] = provider_config.get("temperature", 0.7)
+
+            # O4 and GPT-5 models use max_completion_tokens instead of max_tokens
+            max_tokens_value = provider_config.get("max_tokens", 1024)
+            if self.uses_completion_tokens:
+                kwargs["max_completion_tokens"] = max_tokens_value
+            else:
+                kwargs["max_tokens"] = max_tokens_value
+
             # Add tools if bound using adapter manager
             tool_schemas = await self._prepare_tools_for_request()
             if tool_schemas:
                 kwargs["tools"] = tool_schemas
-                kwargs["tool_choice"] = "auto"
+                if not use_responses_api:  # Responses API handles tool choice differently
+                    kwargs["tool_choice"] = "auto"
+
+            # Add response_format if specified (for JSON mode)
+            if 'response_format' in extra_kwargs:
+                kwargs['response_format'] = extra_kwargs['response_format']
+                logger.debug(f"Using response_format: {extra_kwargs['response_format']}")
 
             # Handle streaming vs non-streaming
             if self.streaming:
                 # TRUE STREAMING MODE - collect all chunks from the stream
                 content_chunks = []
-                async for token in self.astream(input_data):
-                    content_chunks.append(token)
-                content = "".join(content_chunks)
+                async for token in self.astream(input_data, show_reasoning=show_reasoning, **extra_kwargs):
+                    if isinstance(token, str):
+                        content_chunks.append(token)
+                    elif isinstance(token, dict) and "result" in token:
+                        # Return the final result from streaming
+                        return token["result"]
 
+                # Fallback: join collected content
+                content = "".join(content_chunks)
                 return self._format_response(content, input_data)
             else:
-                # Non-streaming mode
-                response = await self.client.chat.completions.create(**kwargs)
-                message = response.choices[0].message
-
-                # Update usage tracking
-                if response.usage:
-                    self._update_token_usage(response.usage)
-                    await self._track_billing(response.usage)
-
-                # Handle tool calls if present - let adapter process the complete message
-                if message.tool_calls:
-                    # Pass the complete message object to adapter for proper tool_calls handling
-                    return self._format_response(message, input_data)
-
-                # Return appropriate format based on input type
-                return self._format_response(message.content or "", input_data)
+                # Non-streaming mode - choose API based on reasoning visibility
+                if use_responses_api:
+                    logger.info(f"Using Responses API for model {self.model_name}")
+
+                    # Convert kwargs for Responses API
+                    responses_kwargs = {
+                        "model": kwargs["model"],
+                        "input": kwargs["messages"]  # Responses API uses 'input' instead of 'messages'
+                    }
+
+                    # Handle max tokens parameter
+                    if "max_completion_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_completion_tokens"]
+                    elif "max_tokens" in kwargs:
+                        responses_kwargs["max_output_tokens"] = kwargs["max_tokens"]
+
+                    # Add tools if present
+                    if "tools" in kwargs:
+                        responses_kwargs["tools"] = kwargs["tools"]
+
+                    # Add reasoning configuration for reasoning models (requires verified organization)
+                    if show_reasoning and self.is_reasoning_model and self.enable_reasoning_summary:
+                        responses_kwargs["reasoning"] = {"summary": "auto"}
+                        logger.info("Reasoning summary enabled - using verified organization features")
+                    elif show_reasoning and self.is_reasoning_model:
+                        logger.info("Reasoning visibility requested - using Responses API without summary (requires verified org)")
+
+                    # Deep research models require web_search_preview tool
+                    if self.supports_deep_research:
+                        if "tools" not in responses_kwargs:
+                            responses_kwargs["tools"] = []
+                        responses_kwargs["tools"].insert(0, {"type": "web_search_preview"})
+
+                    response = await self.client.responses.create(**responses_kwargs)
+
+                    # Handle Responses API format
+                    if hasattr(response, 'output_text'):
+                        # Modern Responses API format
+                        content = response.output_text
+                        usage_info = getattr(response, 'usage', None)
+                    elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                        # Legacy format
+                        content = response.body.response
+                        usage_info = getattr(response.body, 'usage', None)
+                    else:
+                        # Fallback handling
+                        content = str(response)
+                        usage_info = None
+
+                    # Update usage tracking if available
+                    if usage_info:
+                        self._update_token_usage(usage_info)
+                        await self._track_billing(usage_info)
+
+                    return self._format_response(content, input_data)
+                else:
+                    # Standard Chat Completions API
+                    response = await self.client.chat.completions.create(**kwargs)
+                    message = response.choices[0].message
+
+                    # Debug: Log the raw OpenAI response
+                    logger.debug(f"OpenAI response message: {message}")
+                    if message.tool_calls:
+                        logger.debug(f"Tool calls found: {len(message.tool_calls)}")
+                        for i, tc in enumerate(message.tool_calls):
+                            logger.debug(f"  Tool call {i+1}: id={tc.id}, function={tc.function.name}, args={tc.function.arguments}")
+
+                    # Update usage tracking
+                    if response.usage:
+                        self._update_token_usage(response.usage)
+                        await self._track_billing(response.usage)
+
+                    # Handle tool calls if present - let adapter process the complete message
+                    if message.tool_calls:
+                        # Pass the complete message object to adapter for proper tool_calls handling
+                        return self._format_response(message, input_data)
+
+                    # Return appropriate format based on input type
+                    return self._format_response(message.content or "", input_data)
 
         except Exception as e:
             logger.error(f"Error in ainvoke: {e}")
@@ -210,11 +559,42 @@ class OpenAILLMService(BaseLLMService):
 
     def _update_token_usage(self, usage):
         """Update token usage statistics"""
-        self.last_token_usage = {
-            "prompt_tokens": usage.prompt_tokens,
-            "completion_tokens": usage.completion_tokens,
-            "total_tokens": usage.total_tokens
-        }
+        # Handle different usage object structures (Chat Completions vs Responses API)
+        if hasattr(usage, 'prompt_tokens'):
+            # Chat Completions API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.prompt_tokens,
+                "completion_tokens": usage.completion_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        elif hasattr(usage, 'input_tokens'):
+            # Responses API format
+            self.last_token_usage = {
+                "prompt_tokens": usage.input_tokens,
+                "completion_tokens": usage.output_tokens,
+                "total_tokens": usage.total_tokens
+            }
+        else:
+            # Fallback for unknown usage format
+            logger.warning(f"Unknown usage format: {type(usage)}, attributes: {dir(usage)}")
+            self.last_token_usage = {
+                "prompt_tokens": 0,
+                "completion_tokens": 0,
+                "total_tokens": 0
+            }
+
+        # For O-series models, track reasoning tokens if available
+        if self.is_reasoning_model:
+            reasoning_tokens = 0
+            if hasattr(usage, 'reasoning_tokens'):
+                reasoning_tokens = usage.reasoning_tokens
+            elif hasattr(usage, 'output_tokens_details') and hasattr(usage.output_tokens_details, 'reasoning_tokens'):
+                reasoning_tokens = usage.output_tokens_details.reasoning_tokens
+
+            self.last_token_usage["reasoning_tokens"] = reasoning_tokens
+            if "reasoning_tokens" not in self.total_token_usage:
+                self.total_token_usage["reasoning_tokens"] = 0
+            self.total_token_usage["reasoning_tokens"] += reasoning_tokens
 
         # Update total usage
         self.total_token_usage["prompt_tokens"] += self.last_token_usage["prompt_tokens"]
@@ -225,15 +605,35 @@ class OpenAILLMService(BaseLLMService):
     async def _track_billing(self, usage):
         """Track billing information"""
         provider_config = self.get_provider_config()
+
+        # Prepare metadata for tracking
+        metadata = {
+            "temperature": provider_config.get("temperature", 0.7),
+            "max_tokens": provider_config.get("max_tokens", 1024),
+            "is_reasoning_model": self.is_reasoning_model
+        }
+
+        # Add reasoning tokens if available for O-series models
+        if self.is_reasoning_model and hasattr(usage, 'reasoning_tokens'):
+            metadata["reasoning_tokens"] = usage.reasoning_tokens
+
+        # Get tokens using the same logic as _update_token_usage
+        if hasattr(usage, 'prompt_tokens'):
+            input_tokens = usage.prompt_tokens
+            output_tokens = usage.completion_tokens
+        elif hasattr(usage, 'input_tokens'):
+            input_tokens = usage.input_tokens
+            output_tokens = usage.output_tokens
+        else:
+            input_tokens = 0
+            output_tokens = 0
+
         await self._track_usage(
             service_type=ServiceType.LLM,
             operation="chat",
-            input_tokens=usage.prompt_tokens,
-            output_tokens=usage.completion_tokens,
-            metadata={
-                "temperature": provider_config.get("temperature", 0.7),
-                "max_tokens": provider_config.get("max_tokens", 1024)
-            }
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            metadata=metadata
         )
 
     def get_token_usage(self) -> Dict[str, Any]:
@@ -252,14 +652,18 @@ class OpenAILLMService(BaseLLMService):
             "max_tokens": provider_config.get("max_tokens", 1024),
             "supports_streaming": True,
             "supports_functions": True,
-            "provider": "openai"
+            "supports_reasoning": self.is_reasoning_model,
+            "supports_deep_research": self.supports_deep_research,
+            "provider": "openai",
+            "model_type": "reasoning" if self.is_reasoning_model else "standard"
         }
 
 
     async def chat(
         self,
         input_data: Union[str, List[Dict[str, str]], Any],
-        max_tokens: Optional[int] = None
+        max_tokens: Optional[int] = None,
+        show_reasoning: bool = False
     ) -> Dict[str, Any]:
         """
         Chat method that wraps ainvoke for compatibility with base class
@@ -267,13 +671,14 @@ class OpenAILLMService(BaseLLMService):
         Args:
             input_data: Input messages
            max_tokens: Maximum tokens to generate
+            show_reasoning: Whether to show reasoning process (for O4 models)
 
         Returns:
             Dict containing chat response with properly formatted message object
         """
         try:
-            # Call ainvoke and get the response (already processed by adapter)
-            response = await self.ainvoke(input_data)
+            # Call ainvoke with show_reasoning parameter
+            response = await self.ainvoke(input_data, show_reasoning=show_reasoning)
 
             # Return the response as-is (adapter already formatted it correctly)
             # For LangChain inputs, this will be an AIMessage object
@@ -284,7 +689,9 @@ class OpenAILLMService(BaseLLMService):
                 "metadata": {
                     "model": self.model_name,
                     "provider": self.provider_name,
-                    "max_tokens": max_tokens or self.max_tokens
+                    "max_tokens": max_tokens or self.max_tokens,
+                    "show_reasoning": show_reasoning,
+                    "is_reasoning_model": self.is_reasoning_model
                 }
             }
 
@@ -299,6 +706,213 @@ class OpenAILLMService(BaseLLMService):
                 }
             }
 
+    async def deep_research(
+        self,
+        input_data: Union[str, List[Dict[str, str]], Any],
+        research_type: Optional[str] = None,
+        search_enabled: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Deep research task - designed for deep research models, using the OpenAI Responses API
+
+        Args:
+            input_data: Research query or question
+            research_type: Research type (academic, market, competitive, etc.)
+            search_enabled: Whether to enable web search
+
+        Returns:
+            Dict containing research results
+        """
+        if not self.supports_deep_research:
+            # Fallback to regular chat for non-deep-research models
+            logger.info(f"Model {self.model_name} doesn't support deep research, falling back to regular chat")
+            return await self.chat(input_data)
+
+        try:
+            # Prepare messages with research context
+            messages = self._prepare_messages(input_data)
+
+            # Add research-specific system prompt if research_type is specified
+            if research_type and messages:
+                research_prompts = {
+                    "academic": "You are conducting academic research. Please provide thorough, well-sourced analysis with proper citations and methodical reasoning.",
+                    "market": "You are conducting market research. Focus on market trends, competitive analysis, and business insights.",
+                    "competitive": "You are conducting competitive analysis. Compare and contrast different approaches, solutions, or entities.",
+                    "technical": "You are conducting technical research. Provide detailed technical analysis with implementation considerations."
+                }
+
+                if research_type in research_prompts:
+                    # Insert system message at the beginning
+                    system_msg = {"role": "system", "content": research_prompts[research_type]}
+                    if messages[0].get("role") == "system":
+                        messages[0]["content"] = research_prompts[research_type] + "\n\n" + messages[0]["content"]
+                    else:
+                        messages.insert(0, system_msg)
+
+            # Prepare request kwargs for Responses API
+            provider_config = self.get_provider_config()
+            kwargs = {
+                "model": self.model_name,
+                "input": messages  # Responses API uses 'input' instead of 'messages'
+            }
+
+            # Responses API uses max_output_tokens instead of max_completion_tokens
+            max_tokens_value = provider_config.get("max_tokens", 4096)
+            kwargs["max_output_tokens"] = max_tokens_value
+
+            # Deep research models require web_search_preview tool when search is enabled
+            if search_enabled:
+                kwargs["tools"] = [
+                    {
+                        "type": "web_search_preview"
+                    }
+                ]
+
+            # Add any additional bound tools
+            tool_schemas = await self._prepare_tools_for_request()
+            if tool_schemas:
+                if "tools" not in kwargs:
+                    kwargs["tools"] = []
+                kwargs["tools"].extend(tool_schemas)
+
+            # Check if streaming is enabled
+            if self.streaming:
+                # Use streaming mode for deep research
+                logger.info(f"Using Responses API streaming for deep research model {self.model_name}")
+                kwargs["stream"] = True
+
+                content_chunks = []
+                stream = await self.client.responses.create(**kwargs)
+
+                async for event in stream:
+                    if event.type == 'response.output_text.delta':
+                        if event.delta:
+                            content_chunks.append(event.delta)
+
+                message_content = "".join(content_chunks)
+
+                # Track estimated usage for streaming
+                messages = self._prepare_messages(input_data)
+                self._track_streaming_usage(messages, message_content)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+            else:
+                # Use non-streaming mode for deep research
+                logger.info(f"Using Responses API for deep research model {self.model_name}")
+                response = await self.client.responses.create(**kwargs)
+
+                # Extract the response content from Responses API format
+                if hasattr(response, 'output_text'):
+                    # Modern Responses API format
+                    message_content = response.output_text
+                    usage_info = getattr(response, 'usage', None)
+                elif hasattr(response, 'body') and hasattr(response.body, 'response'):
+                    # Legacy Responses API format
+                    message_content = response.body.response
+                    usage_info = getattr(response.body, 'usage', None)
+                elif hasattr(response, 'choices') and response.choices:
+                    # Fallback to standard format
+                    message_content = response.choices[0].message.content
+                    usage_info = getattr(response, 'usage', None)
+                else:
+                    # Handle unexpected format
+                    message_content = str(response)
+                    usage_info = None
+
+                # Update usage tracking if available
+                if usage_info:
+                    self._update_token_usage(usage_info)
+                    await self._track_billing(usage_info)
+
+                # Format response
+                formatted_response = self._format_response(message_content or "", input_data)
+
+            return {
+                "result": formatted_response,
+                "research_type": research_type,
+                "search_enabled": search_enabled,
+                "success": True,
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "supports_deep_research": self.supports_deep_research,
+                    "reasoning_model": self.is_reasoning_model,
+                    "api_used": "responses"
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"Deep research failed: {e}")
+            return {
+                "result": None,
+                "success": False,
+                "error": str(e),
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": self.provider_name,
+                    "api_used": "responses"
+                }
+            }
+
     async def close(self):
         """Close the backend client"""
-        await self.client.close()
+        await self.client.close()
+
+    def _get_streaming_billing_info(self) -> Dict[str, Any]:
+        """Get billing information for streaming requests"""
+        try:
+            # Check if service has model_manager with billing_tracker
+            if hasattr(self, 'model_manager') and hasattr(self.model_manager, 'billing_tracker'):
+                billing_tracker = self.model_manager.billing_tracker
+
+                # Get the latest usage record for this model
+                model_records = [
+                    record for record in billing_tracker.usage_records
+                    if record.model_id == self.model_name
+                ]
+
+                if model_records:
+                    # Get the most recent record
+                    latest_record = max(model_records, key=lambda r: r.timestamp)
+
+                    return {
+                        "cost_usd": latest_record.cost_usd,
+                        "input_tokens": latest_record.input_tokens,
+                        "output_tokens": latest_record.output_tokens,
+                        "total_tokens": latest_record.total_tokens,
+                        "operation": latest_record.operation,
+                        "timestamp": latest_record.timestamp,
+                        "currency": "USD"
+                    }
+
+            # Fallback: use last token usage with estimated cost
+            last_usage = self.get_last_token_usage()
+            estimated_cost = 0.0
+
+            if hasattr(self, 'model_manager'):
+                estimated_cost = self.model_manager.calculate_cost(
+                    provider=self.provider_name,
+                    model_name=self.model_name,
+                    input_tokens=last_usage.get("prompt_tokens", 0),
+                    output_tokens=last_usage.get("completion_tokens", 0)
+                )
+
+            return {
+                "cost_usd": estimated_cost,
+                "input_tokens": last_usage.get("prompt_tokens", 0),
+                "output_tokens": last_usage.get("completion_tokens", 0),
+                "total_tokens": last_usage.get("total_tokens", 0),
+                "operation": "chat",
+                "timestamp": None,
+                "currency": "USD",
+                "note": "Estimated from last token usage"
+            }
+
+        except Exception as e:
+            logger.warning(f"Failed to get streaming billing info: {e}")
+            return {
+                "cost_usd": 0.0,
+                "error": str(e),
+                "currency": "USD"
+            }