isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
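The most visible API change in this release is in isa_model/client.py: the ISAModelClient constructor drops the mode/api_url pair in favor of a single optional service_endpoint, the model_hint/provider_hint arguments become model/provider, and a dedicated invoke_stream() async generator is added alongside invoke(). The following is a minimal usage sketch assembled from the docstrings and signatures in the diff below; the endpoint URL and API key are placeholders, not documented defaults.

```python
import asyncio
from isa_model.client import ISAModelClient

async def main():
    # Local mode: with no service_endpoint the client uses the AIFactory directly
    client = ISAModelClient()

    # Chat defaults to streaming; result["stream"] is an async generator
    result = await client.invoke("Write a story", "chat", "text")
    if "stream" in result:
        async for chunk in result["stream"]:
            print(chunk, end="", flush=True)
    else:
        print(result["result"])

    # Token-level streaming via the new invoke_stream() generator
    async for token in client.invoke_stream("Hello!", "chat", "text"):
        print(token, end="", flush=True)

    # Remote mode: route the same calls through a deployed isa_model API (placeholder URL/key)
    remote = ISAModelClient(service_endpoint="https://models.example.com", api_key="sk-...")
    vision = await remote.invoke("image.jpg", "analyze", "vision")
    print(vision["result"])
    await remote.close()

asyncio.run(main())
```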
isa_model/client.py CHANGED
@@ -2,17 +2,84 @@
 # -*- coding: utf-8 -*-

 """
-ISA Model Client - Unified interface for all AI services
-Provides intelligent model selection and simplified API
+ISA Model Client - Unified AI Service Interface
+===============================================
+
+Description:
+Unified client interface for the ISA Model platform, providing intelligent model selection and simplified API calls.
+
+Key features:
+- Unified interface for multimodal AI services: text, vision, audio, image generation, embeddings
+- Automatic intelligent model selection: chooses the most suitable model based on task type and input data
+- Streaming response support: real-time streaming text generation for a better user experience
+- Remote/local services: supports both local service calls and remote API calls
+- Cost tracking: automatically calculates and tracks API call costs
+- Tool support: LangChain tool integration to extend model capabilities
+- Caching: service instance caching to improve performance
+
+Input interface:
+- input_data: input data of multiple types (text, image paths, audio files, byte data, etc.)
+- task: task type (chat, analyze, generate_speech, transcribe, etc.)
+- service_type: service type (text, vision, audio, image, embedding)
+- model: optional model name (selected intelligently if not specified)
+- provider: optional provider name (openai, ollama, replicate, etc.)
+
+Output format:
+- Unified response dictionary containing result and metadata
+- Streaming responses: include a stream async generator
+- Non-streaming responses: include the result data
+- metadata: includes model info, billing info, selection reason, etc.
+
+Core dependencies:
+- isa_model.inference.ai_factory: AI service factory
+- isa_model.core.services.intelligent_model_selector: intelligent model selector
+- aiohttp: HTTP client (remote API mode)
+- asyncio: async programming support
+
+Usage example:
+```python
+# Create the client
+client = ISAModelClient()
+
+# Streaming text generation
+result = await client.invoke("Write a story", "chat", "text")
+async for token in result["stream"]:
+    print(token, end="", flush=True)
+
+# Image analysis
+result = await client.invoke("image.jpg", "analyze", "vision")
+print(result["result"])
+
+# Speech synthesis
+result = await client.invoke("Hello world", "generate_speech", "audio")
+print(result["result"])
+```
+
+Architecture notes:
+- Singleton pattern: ensures configuration consistency
+- Async support: all operations are asynchronous
+- Error handling: unified error handling and response format
+- Extensibility: supports new service providers and models
+
+Suggested improvements:
+1. Add request retries: handle unstable networks
+2. Add request rate limiting: avoid exceeding API limits
+3. Improve the caching strategy: support LRU caching and TTL expiry
+4. Add monitoring metrics: record latency, success rate, and similar metrics
+5. Support batching: improve throughput for large volumes of requests
+6. Add configuration validation: verify API keys and configuration at startup
 """

 import logging
 import asyncio
+import time
+import uuid
 from typing import Any, Dict, Optional, List, Union
 from pathlib import Path
-import aiohttp
+from datetime import datetime, timezone

 from isa_model.inference.ai_factory import AIFactory
+from isa_model.core.logging import get_inference_logger, generate_request_id

 try:
     from isa_model.core.services.intelligent_model_selector import IntelligentModelSelector, get_model_selector
@@ -36,41 +103,104 @@ class ISAModelClient:
         response = await client.invoke("audio.mp3", "transcribe", "audio")
     """

+    # Consolidated task mappings for all service types
+    TASK_MAPPINGS = {
+        "vision": {
+            # Core tasks (direct mapping)
+            "analyze": "analyze",
+            "describe": "describe",
+            "extract": "extract",
+            "detect": "detect",
+            "classify": "classify",
+            "compare": "compare",
+
+            # Common aliases (backward compatibility)
+            "analyze_image": "analyze",
+            "describe_image": "describe",
+            "extract_text": "extract",
+            "extract_table": "extract",
+            "detect_objects": "detect",
+            "detect_ui": "detect",
+            "detect_ui_elements": "detect",
+            "get_coordinates": "detect",
+            "ocr": "extract",
+            "ui_analysis": "analyze",
+            "navigation": "analyze"
+        },
+        "audio": {
+            "generate_speech": "synthesize",
+            "text_to_speech": "synthesize",
+            "tts": "synthesize",
+            "transcribe": "transcribe",
+            "speech_to_text": "transcribe",
+            "stt": "transcribe",
+            "translate": "translate",
+            "detect_language": "detect_language"
+        },
+        "text": {
+            "chat": "chat",
+            "generate": "generate",
+            "complete": "complete",
+            "translate": "translate",
+            "summarize": "summarize",
+            "analyze": "analyze",
+            "extract": "extract",
+            "classify": "classify"
+        },
+        "image": {
+            "generate_image": "generate",
+            "generate": "generate",
+            "img2img": "img2img",
+            "image_to_image": "img2img",
+            "generate_batch": "generate_batch"
+        },
+        "embedding": {
+            "create_embedding": "embed",
+            "embed": "embed",
+            "embed_batch": "embed_batch",
+            "chunk_and_embed": "chunk_and_embed",
+            "similarity": "similarity",
+            "find_similar": "find_similar",
+            "rerank": "rerank",
+            "rerank_documents": "rerank_documents",
+            "document_ranking": "document_ranking"
+        }
+    }
+
+    # Service type configuration
+    SUPPORTED_SERVICE_TYPES = {"vision", "audio", "text", "image", "embedding"}
+
     def __init__(self,
                  config: Optional[Dict[str, Any]] = None,
-                 mode: str = "local",
-                 api_url: Optional[str] = None,
+                 service_endpoint: Optional[str] = None,
                  api_key: Optional[str] = None):
         """Initialize ISA Model Client

         Args:
             config: Optional configuration override
-            mode: "local" for direct AI Factory, "api" for HTTP API calls
-            api_url: API base URL (required if mode="api")
-            api_key: API key for authentication (optional)
+            service_endpoint: Optional service endpoint URL (if None, uses local AI Factory)
+            api_key: Optional API key for authentication (can also be set via ISA_API_KEY env var)
         """
         self.config = config or {}
-        self.mode = mode
-        self.api_url = api_url.rstrip('/') if api_url else None
-        self.api_key = api_key
-
-        # Setup HTTP headers for API mode
-        if self.mode == "api":
-            if not self.api_url:
-                raise ValueError("api_url is required when mode='api'")
-
-            self.headers = {
-                "Content-Type": "application/json",
-                "User-Agent": "ISA-Model-Client/1.0.0"
-            }
-            if self.api_key:
-                self.headers["Authorization"] = f"Bearer {self.api_key}"
+        self.service_endpoint = service_endpoint
+
+        # Handle API key authentication
+        import os
+        self.api_key = api_key or os.getenv("ISA_API_KEY")
+        if self.api_key:
+            logger.info("API key provided for authentication")
+        else:
+            logger.debug("No API key provided - using anonymous access")

-        # Initialize AI Factory for local mode
-        if self.mode == "local":
+        # Initialize AI Factory for direct service access (when service_endpoint is None)
+        if not self.service_endpoint:
             self.ai_factory = AIFactory.get_instance()
         else:
             self.ai_factory = None
+            logger.info(f"Using remote service endpoint: {self.service_endpoint}")
+
+        # HTTP client for remote API calls
+        self._http_session = None

         # Initialize intelligent model selector
         self.model_selector = None
@@ -87,169 +217,474 @@ class ISAModelClient:
         # Cache for frequently used services
         self._service_cache: Dict[str, Any] = {}

+        # Initialize inference logger
+        self.inference_logger = get_inference_logger()
+
         logger.info("ISA Model Client initialized")

-    async def stream(
+    async def _get_http_session(self):
+        """Get or create HTTP session for remote API calls"""
+        if self._http_session is None:
+            import aiohttp
+            headers = {}
+
+            # Add API key authentication if available
+            if self.api_key:
+                headers["Authorization"] = f"Bearer {self.api_key}"
+                headers["X-API-Key"] = self.api_key
+
+            self._http_session = aiohttp.ClientSession(headers=headers)
+
+        return self._http_session
+
+    async def _make_api_request(self, endpoint: str, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Make HTTP request to remote API endpoint"""
+        if not self.service_endpoint:
+            raise ValueError("Service endpoint not configured for remote API calls")
+
+        session = await self._get_http_session()
+        url = f"{self.service_endpoint.rstrip('/')}/{endpoint.lstrip('/')}"
+
+        try:
+            async with session.post(url, json=data) as response:
+                if response.status == 401:
+                    raise Exception("Authentication required or invalid API key")
+                elif response.status == 403:
+                    raise Exception("Insufficient permissions")
+                elif not response.ok:
+                    error_detail = await response.text()
+                    raise Exception(f"API request failed ({response.status}): {error_detail}")
+
+                return await response.json()
+
+        except Exception as e:
+            logger.error(f"Remote API request failed: {e}")
+            raise
+
+    async def close(self):
+        """Close HTTP session and cleanup resources"""
+        if self._http_session:
+            await self._http_session.close()
+            self._http_session = None
+
+    async def _invoke_remote_api(
         self,
-        input_data: Union[str, bytes, Path, Dict[str, Any]],
-        task: str,
+        input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
+        task: str,
         service_type: str,
-        model_hint: Optional[str] = None,
-        provider_hint: Optional[str] = None,
+        model: Optional[str] = None,
+        provider: Optional[str] = None,
+        stream: Optional[bool] = None,
         **kwargs
-    ):
-        """
-        Streaming invoke method that yields tokens in real-time
-
-        Args:
-            input_data: Input data (text for LLM streaming)
-            task: Task to perform
-            service_type: Type of service (only "text" supports streaming)
-            model_hint: Optional model preference
-            provider_hint: Optional provider preference
-            **kwargs: Additional parameters
-
-        Yields:
-            Individual tokens as they arrive from the model
-
-        Example:
-            async for token in client.stream("Hello world", "chat", "text"):
-                print(token, end="", flush=True)
-        """
-        if service_type != "text":
-            raise ValueError("Streaming is only supported for text/LLM services")
-
+    ) -> Dict[str, Any]:
+        """Invoke remote API endpoint"""
         try:
-            if self.mode == "api":
-                async for token in self._stream_api(input_data, task, service_type, model_hint, provider_hint, **kwargs):
-                    yield token
+            # Prepare request data for unified API
+            request_data = {
+                "task": task,
+                "service_type": service_type,
+                **kwargs
+            }
+
+            # Add model and provider if specified
+            if model:
+                request_data["model"] = model
+            if provider:
+                request_data["provider"] = provider
+            # For remote API, disable streaming to get JSON response
+            request_data["stream"] = False
+
+            # Handle different input data types
+            if isinstance(input_data, (str, Path)):
+                request_data["input_data"] = str(input_data)
+            elif isinstance(input_data, (dict, list)):
+                request_data["input_data"] = input_data
             else:
-                async for token in self._stream_local(input_data, task, service_type, model_hint, provider_hint, **kwargs):
-                    yield token
+                # For binary data, convert to base64
+                import base64
+                if isinstance(input_data, bytes):
+                    request_data["input_data"] = base64.b64encode(input_data).decode()
+                    request_data["data_type"] = "base64"
+                else:
+                    request_data["input_data"] = str(input_data)
+
+            # Make API request
+            response = await self._make_api_request("api/v1/invoke", request_data)
+
+            return response
+
         except Exception as e:
-            logger.error(f"Failed to stream {task} on {service_type}: {e}")
-            raise
-
+            logger.error(f"Remote API invocation failed: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "metadata": {
+                    "task": task,
+                    "service_type": service_type,
+                    "endpoint": "remote"
+                }
+            }
+
     async def invoke(
         self,
-        input_data: Union[str, bytes, Path, Dict[str, Any]],
+        input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
         task: str,
         service_type: str,
-        model_hint: Optional[str] = None,
-        provider_hint: Optional[str] = None,
-        stream: bool = False,
-        tools: Optional[List[Any]] = None,
+        model: Optional[str] = None,
+        provider: Optional[str] = None,
+        stream: Optional[bool] = None,
+        show_reasoning: Optional[bool] = False,
+        output_format: Optional[str] = None,
+        json_schema: Optional[Dict] = None,
+        repair_attempts: Optional[int] = 3,
         **kwargs
-    ) -> Union[Dict[str, Any], object]:
+    ) -> Dict[str, Any]:
         """
         Unified invoke method with intelligent model selection

         Args:
-            input_data: Input data (image path, text, audio, etc.)
-            task: Task to perform (analyze_image, generate_speech, transcribe, etc.)
-            service_type: Type of service (vision, audio, text, image, embedding)
-            model_hint: Optional model preference
-            provider_hint: Optional provider preference
-            stream: Enable streaming for text services (returns AsyncGenerator)
-            tools: Optional list of tools for function calling (only for text services)
-            **kwargs: Additional task-specific parameters
+            input_data: Input data (str, LangChain messages, image path, audio, etc.)
+            task: Task to perform (chat, analyze_image, generate_speech, transcribe, etc.)
+            service_type: Type of service (text, vision, audio, image, embedding)
+            model: Model name (if None, uses intelligent selection)
+            provider: Provider name (if None, uses intelligent selection)
+            stream: Enable streaming for text tasks (default True for chat/generate tasks, supports tools)
+            show_reasoning: Show reasoning process for O4 models (uses Responses API)
+            **kwargs: Additional task-specific parameters (including tools for LangChain)

         Returns:
-            If stream=False: Unified response dictionary with result and metadata
-            If stream=True: AsyncGenerator yielding tokens (only for text services)
+            Unified response dictionary with result and metadata
+            For streaming: result["stream"] contains async generator
+            For non-streaming: result["result"] contains the response

         Examples:
-            # Vision tasks
-            await client.invoke("image.jpg", "analyze_image", "vision")
-            await client.invoke("screenshot.png", "detect_ui_elements", "vision")
-            await client.invoke("document.pdf", "extract_table", "vision")
-
-            # Audio tasks
-            await client.invoke("Hello world", "generate_speech", "audio")
-            await client.invoke("audio.mp3", "transcribe", "audio")
-
-            # Text tasks
-            await client.invoke("Translate this text", "translate", "text")
-            await client.invoke("What is AI?", "chat", "text")
+            # Text tasks with streaming (default for chat)
+            result = await client.invoke("Write a story", "chat", "text")
+            if "stream" in result:
+                async for chunk in result["stream"]:
+                    print(chunk, end="", flush=True)
+            else:
+                print(result["result"])

-            # Streaming text
-            async for token in await client.invoke("Hello", "chat", "text", stream=True):
-                print(token, end="", flush=True)
+            # Text tasks with tools (also supports streaming)
+            result = await client.invoke("What's the weather?", "chat", "text", tools=[get_weather])
+            if "stream" in result:
+                async for chunk in result["stream"]:
+                    print(chunk, end="", flush=True)
+            else:
+                print(result["result"])

-            # Text with tools
-            await client.invoke("What's 5+3?", "chat", "text", tools=[calculator_function])
+            # Vision tasks (always non-streaming)
+            result = await client.invoke("image.jpg", "analyze", "vision")
+            print(result["result"])

-            # Streaming with tools
-            async for token in await client.invoke("What's 5+3?", "chat", "text", stream=True, tools=[calculator_function]):
-                print(token, end="")
+            # Audio tasks
+            result = await client.invoke("Hello world", "generate_speech", "audio")
+            print(result["result"])

             # Image generation
-            await client.invoke("A beautiful sunset", "generate_image", "image")
+            result = await client.invoke("A beautiful sunset", "generate_image", "image")
+            print(result["result"])

             # Embedding
-            await client.invoke("Text to embed", "create_embedding", "embedding")
+            result = await client.invoke("Text to embed", "create_embedding", "embedding")
+            print(result["result"])
         """
         try:
-            # Handle streaming case
-            if stream:
-                if service_type != "text":
-                    raise ValueError("Streaming is only supported for text services")
-
-                if self.mode == "api":
-                    return self._stream_api(
-                        input_data=input_data,
-                        task=task,
-                        service_type=service_type,
-                        model_hint=model_hint,
-                        provider_hint=provider_hint,
-                        tools=tools,
-                        **kwargs
-                    )
+            # If using remote service endpoint, make API call
+            if self.service_endpoint:
+                return await self._invoke_remote_api(
+                    input_data=input_data,
+                    task=task,
+                    service_type=service_type,
+                    model=model,
+                    provider=provider,
+                    stream=stream,
+                    **kwargs
+                )
+
+            # Set default streaming for text tasks
+            if stream is None and service_type == "text":
+                if task in ["chat", "generate"]:
+                    stream = True  # Enable streaming for chat and generate tasks
                 else:
-                    return self._stream_local(
-                        input_data=input_data,
-                        task=task,
-                        service_type=service_type,
-                        model_hint=model_hint,
-                        provider_hint=provider_hint,
-                        tools=tools,
-                        **kwargs
-                    )
+                    stream = False  # Disable for other text tasks

-            # Route to appropriate mode for non-streaming
-            if self.mode == "api":
-                return await self._invoke_api(
+            # If streaming is enabled for text tasks, return streaming response
+            if stream and service_type == "text":
+                return await self._invoke_service_streaming(
                     input_data=input_data,
                     task=task,
                     service_type=service_type,
-                    model_hint=model_hint,
-                    provider_hint=provider_hint,
-                    tools=tools,
+                    model_hint=model,
+                    provider_hint=provider,
+                    show_reasoning=show_reasoning,  # Explicitly pass show_reasoning
+                    output_format=output_format,
+                    json_schema=json_schema,
+                    repair_attempts=repair_attempts,
                     **kwargs
                 )
             else:
-                return await self._invoke_local(
+                # Use regular non-streaming service
+                return await self._invoke_service(
                     input_data=input_data,
                     task=task,
                     service_type=service_type,
-                    model_hint=model_hint,
-                    provider_hint=provider_hint,
-                    tools=tools,
+                    model_hint=model,
+                    provider_hint=provider,
+                    stream=False,  # Force non-streaming
+                    output_format=output_format,
+                    json_schema=json_schema,
+                    repair_attempts=repair_attempts,
                     **kwargs
                 )

         except Exception as e:
-            logger.error(f"Failed to invoke {task} on {service_type}: {e}")
-            return {
-                "success": False,
-                "error": str(e),
-                "metadata": {
-                    "task": task,
-                    "service_type": service_type,
-                    "input_type": type(input_data).__name__
-                }
-            }
+            return self._handle_error(e, {
+                "operation": "invoke",
+                "task": task,
+                "service_type": service_type,
+                "input_type": type(input_data).__name__
+            })
+
+    async def invoke_stream(
+        self,
+        input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
+        task: str,
+        service_type: str,
+        model: Optional[str] = None,
+        provider: Optional[str] = None,
+        return_metadata: bool = False,
+        **kwargs
+    ):
+        """
+        Unified streaming invoke method - returns async generator for real-time token streaming
+
+        Args:
+            input_data: Input data (str, LangChain messages, image path, audio, etc.)
+            task: Task to perform (chat, analyze_image, generate_speech, transcribe, etc.)
+            service_type: Type of service (text, vision, audio, image, embedding)
+            model: Model name (if None, uses intelligent selection)
+            provider: Provider name (if None, uses intelligent selection)
+            return_metadata: If True, yields ('metadata', metadata_dict) as final item
+            **kwargs: Additional task-specific parameters (including tools for LangChain)
+
+        Returns:
+            For text services: AsyncGenerator[Union[str, Tuple[str, Dict]], None] - yields tokens as they arrive
+            - Normal items: token strings
+            - Final item (if return_metadata=True): ('metadata', metadata_dict) with billing info
+            For other services: Raises ValueError (streaming not supported)
+
+        Examples:
+            # Simple streaming
+            async for token in client.invoke_stream("Hello!", "chat", "text"):
+                print(token, end='', flush=True)
+
+            # Streaming with metadata
+            async for item in client.invoke_stream("Hello!", "chat", "text", return_metadata=True):
+                if isinstance(item, tuple) and item[0] == 'metadata':
+                    print(f"\nBilling: {item[1]['billing']}")
+                else:
+                    print(item, end='', flush=True)
+        """
+        try:
+            # Only text services support streaming
+            if service_type != "text":
+                raise ValueError(f"Streaming not supported for service type: {service_type}")
+
+            # Tools are supported with streaming
+
+            # Step 1: Select best model for this task
+            selected_model = await self._select_model(
+                input_data=input_data,
+                task=task,
+                service_type=service_type,
+                model_hint=model,
+                provider_hint=provider
+            )
+
+            # Step 2: Get appropriate service
+            service, _ = await self._get_service(
+                service_type=service_type,
+                model_name=selected_model["model_id"],
+                provider=selected_model["provider"],
+                task=task,
+                use_cache=False  # Don't cache for streaming to avoid state issues
+            )
+
+            # Step 3: Ensure service supports streaming
+            if not hasattr(service, 'astream'):
+                raise ValueError(f"Service {selected_model['provider']}/{selected_model['model_id']} does not support streaming")
+
+            # Step 4: Enable streaming on the service
+            if hasattr(service, 'streaming'):
+                service.streaming = True
+
+            # Step 5: Stream tokens and collect for billing
+            content_chunks = []
+            async for token in service.astream(input_data):
+                content_chunks.append(token)
+                # Only yield string tokens for streaming (filter out dict/objects)
+                if isinstance(token, str):
+                    yield token
+
+            # Step 6: After streaming is complete, calculate billing info and optionally return metadata
+            try:
+                await asyncio.sleep(0.01)  # Small delay to ensure billing tracking completes
+
+                # Get billing info (similar to _invoke_service)
+                billing_info = self._get_billing_info(service, selected_model["model_id"])
+
+                # Log billing info for tracking
+                logger.info(f"Streaming completed - Model: {selected_model['model_id']}, "
+                            f"Tokens: {billing_info.get('total_tokens', 'N/A')}, "
+                            f"Cost: ${billing_info.get('cost_usd', 0):.4f}")
+
+                # Return metadata if requested
+                if return_metadata:
+                    metadata = {
+                        "model_used": selected_model["model_id"],
+                        "provider": selected_model["provider"],
+                        "task": task,
+                        "service_type": service_type,
+                        "selection_reason": selected_model.get("reason", "Default selection"),
+                        "billing": billing_info,
+                        "streaming": True,
+                        "tokens_streamed": len(content_chunks),
+                        "content_length": len("".join(str(chunk) if isinstance(chunk, str) else "" for chunk in content_chunks))
+                    }
+                    yield ('metadata', metadata)
+
+            except Exception as billing_error:
+                logger.warning(f"Failed to track billing for streaming: {billing_error}")
+                if return_metadata:
+                    # Return fallback metadata even if billing fails
+                    fallback_metadata = {
+                        "model_used": selected_model["model_id"],
+                        "provider": selected_model["provider"],
+                        "task": task,
+                        "service_type": service_type,
+                        "selection_reason": selected_model.get("reason", "Default selection"),
+                        "billing": {
+                            "cost_usd": 0.0,
+                            "error": str(billing_error),
+                            "currency": "USD"
+                        },
+                        "streaming": True,
+                        "tokens_streamed": len(content_chunks),
+                        "content_length": len("".join(str(chunk) if isinstance(chunk, str) else "" for chunk in content_chunks))
+                    }
+                    yield ('metadata', fallback_metadata)
+
+        except Exception as e:
+            logger.error(f"Streaming invoke failed: {e}")
+            raise
+
+    def _is_rate_limit_error(self, error: Exception) -> bool:
+        """Check if an error is due to rate limiting"""
+        error_str = str(error).lower()
+
+        # Check for common rate limit indicators
+        rate_limit_indicators = [
+            'rate limit',
+            'rate_limit',
+            'ratelimit',
+            'too many requests',
+            'quota exceeded',
+            'limit exceeded',
+            'throttled',
+            '429'
+        ]
+
+        return any(indicator in error_str for indicator in rate_limit_indicators)
+
+    async def _invoke_with_fallback(
+        self,
+        service_type: str,
+        task: str,
+        input_data: Any,
+        selected_model: Dict[str, Any],
+        **kwargs
+    ) -> Any:
+        """Invoke service with automatic fallback on rate limit"""
+        try:
+            # First attempt with selected model
+            return await self._invoke_service_direct(service_type, task, input_data, selected_model, **kwargs)
+        except Exception as e:
+            # Check if this is a rate limit error
+            if self._is_rate_limit_error(e):
+                logger.warning(f"Rate limit detected for {selected_model['provider']}: {e}")
+
+                # Try to get fallback model using intelligent model selector
+                if INTELLIGENT_SELECTOR_AVAILABLE and self.model_selector:
+                    try:
+                        fallback_selection = self.model_selector.get_rate_limit_fallback(
+                            service_type,
+                            selected_model['provider']
+                        )
+
+                        if fallback_selection.get('success') and fallback_selection.get('is_fallback'):
+                            fallback_model = fallback_selection['selected_model']
+                            logger.info(f"Switching to fallback: {fallback_model['provider']}/{fallback_model['model_id']}")
+
+                            # Retry with fallback model
+                            return await self._invoke_service_direct(service_type, task, input_data, fallback_model, **kwargs)
+                    except Exception as fallback_error:
+                        logger.error(f"Fallback also failed: {fallback_error}")
+                        raise e  # Raise original rate limit error
+
+            # Re-raise the original error if not rate limit or fallback failed
+            raise

+    async def _invoke_service_direct(
+        self,
+        service_type: str,
+        task: str,
+        input_data: Any,
+        model_config: Dict[str, Any],
+        **kwargs
+    ) -> Any:
+        """Direct service invocation without fallback logic"""
+        # Get appropriate service
+        factory = AIFactory.get_instance()
+
+        # Create service with the specified model
+        if service_type == "text":
+            service = factory.get_llm(model_config["model_id"], model_config["provider"])
+        elif service_type == "vision":
+            service = factory.get_vision(model_config["model_id"], model_config["provider"])
+        elif service_type == "audio":
+            service = factory.get_audio(model_config["model_id"], model_config["provider"])
+        elif service_type == "image":
+            service = factory.get_image(model_config["model_id"], model_config["provider"])
+        elif service_type == "embedding":
+            service = factory.get_embed(model_config["model_id"], model_config["provider"])
+        else:
+            raise ValueError(f"Unsupported service type: {service_type}")
+
+        # Invoke the service
+        if service_type == "text":
+            show_reasoning = kwargs.pop('show_reasoning', False)
+
+            # Check if service supports show_reasoning parameter (mainly OpenAI services)
+            if model_config["provider"] == "openai":
+                result = await service.invoke(
+                    input_data=input_data,
+                    task=task,
+                    show_reasoning=show_reasoning,
+                    **kwargs
+                )
+            else:
+                # For other providers like yyds, don't pass show_reasoning
+                result = await service.invoke(
+                    input_data=input_data,
+                    task=task,
+                    **kwargs
+                )
+            return result
+        else:
+            return await service.invoke(input_data=input_data, task=task, **kwargs)
+
     async def _select_model(
         self,
         input_data: Any,
@@ -268,8 +703,26 @@ class ISAModelClient:
                 "reason": "User specified"
             }

+        # If model_hint provided but no provider_hint, handle special cases
+        if model_hint:
+            # Special handling for hybrid service
+            if model_hint == "hybrid":
+                return {
+                    "model_id": model_hint,
+                    "provider": "hybrid",
+                    "reason": "Hybrid service requested"
+                }
+            # If only model_hint provided, use default provider for that service type
+            elif provider_hint is None:
+                default_provider = self._get_default_provider(service_type)
+                return {
+                    "model_id": model_hint,
+                    "provider": default_provider,
+                    "reason": "Model specified with default provider"
+                }
+
         # Use intelligent model selector if available
-        if INTELLIGENT_SELECTOR_AVAILABLE:
+        if INTELLIGENT_SELECTOR_AVAILABLE and get_model_selector:
             try:
                 # Initialize model selector if not already done
                 if self.model_selector is None:
@@ -304,6 +757,17 @@ class ISAModelClient:
             # Fallback to default model selection
             return self._get_default_model(service_type, task, provider_hint)

+    def _get_default_provider(self, service_type: str) -> str:
+        """Get default provider for service type"""
+        defaults = {
+            "vision": "openai",
+            "audio": "openai",
+            "text": "openai",
+            "image": "replicate",
+            "embedding": "openai"
+        }
+        return defaults.get(service_type, "openai")
+
     def _get_default_model(
         self,
         service_type: str,
@@ -314,16 +778,17 @@ class ISAModelClient:

         defaults = {
             "vision": {
-                "model_id": "gpt-4o-mini",
+                "model_id": "gpt-4.1-nano",
                 "provider": "openai"
             },
             "audio": {
                 "tts": {"model_id": "tts-1", "provider": "openai"},
                 "stt": {"model_id": "whisper-1", "provider": "openai"},
+                "realtime": {"model_id": "gpt-4o-realtime-preview-2024-10-01", "provider": "openai"},
                 "default": {"model_id": "whisper-1", "provider": "openai"}
             },
             "text": {
-                "model_id": "gpt-4.1-mini",
+                "model_id": "gpt-4.1-nano",
                 "provider": "openai"
             },
             "image": {
@@ -331,19 +796,33 @@ class ISAModelClient:
                 "provider": "replicate"
             },
             "embedding": {
-                "model_id": "text-embedding-3-small",
-                "provider": "openai"
+                "embed": {"model_id": "text-embedding-3-small", "provider": "openai"},
+                "rerank": {"model_id": "isa-jina-reranker-v2-service", "provider": "isa"},
+                "default": {"model_id": "text-embedding-3-small", "provider": "openai"}
             }
         }

         # Handle audio service type with task-specific models
         if service_type == "audio":
-            if "speech" in task or "tts" in task:
+            # Realtime audio tasks
+            if any(realtime_task in task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+                default = defaults["audio"]["realtime"]
+            # Traditional TTS tasks
+            elif "speech" in task or "tts" in task or task in ["synthesize", "text_to_speech", "generate_speech"]:
                 default = defaults["audio"]["tts"]
-            elif "transcribe" in task or "stt" in task:
+            # Traditional STT tasks
+            elif "transcribe" in task or "stt" in task or task in ["speech_to_text", "transcription"]:
                 default = defaults["audio"]["stt"]
             else:
                 default = defaults["audio"]["default"]
+        # Handle embedding service type with task-specific models
+        elif service_type == "embedding":
+            if "rerank" in task:
+                default = defaults["embedding"]["rerank"]
+            elif "embed" in task:
+                default = defaults["embedding"]["embed"]
+            else:
+                default = defaults["embedding"]["default"]
         else:
             default = defaults.get(service_type, defaults["vision"])

@@ -363,59 +842,80 @@
         model_name: str,
         provider: str,
         task: str,
-        tools: Optional[List[Any]] = None
-    ) -> Any:
-        """Get appropriate service instance"""
+        use_cache: bool = True
+    ) -> tuple[Any, str]:
+        """Get appropriate service instance and return actual model used"""

-        cache_key = f"{service_type}_{provider}_{model_name}"
+        cache_key = f"{service_type}_{provider}_{model_name}_{task}"
+        actual_model_used = model_name  # Track the actual model used

-        # Check cache first
-        if cache_key in self._service_cache:
-            service = self._service_cache[cache_key]
-            # If tools are needed, bind them to the service
-            if tools and service_type == "text":
-                return service.bind_tools(tools)
-            return service
+        # Check cache first (if caching is enabled)
+        if use_cache and cache_key in self._service_cache:
+            cached_service, cached_model = self._service_cache[cache_key]
+            return cached_service, cached_model

         try:
+            # Validate service type
+            self._validate_service_type(service_type)
+
             # Route to appropriate AIFactory method
             if service_type == "vision":
                 service = self.ai_factory.get_vision(model_name, provider)
-
+                actual_model_used = model_name
             elif service_type == "audio":
-                if "speech" in task or "tts" in task:
-                    service = self.ai_factory.get_tts(model_name, provider)
-                elif "transcribe" in task or "stt" in task:
-                    service = self.ai_factory.get_stt(model_name, provider)
+                # Realtime audio tasks
+                if any(realtime_task in task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+                    # Use realtime model
+                    realtime_model = "gpt-4o-realtime-preview-2024-10-01" if model_name == "tts-1" or model_name == "whisper-1" else model_name
+                    service = self.ai_factory.get_realtime(realtime_model, provider)
+                    actual_model_used = realtime_model
+                # Traditional TTS tasks
+                elif "speech" in task or "tts" in task or task in ["synthesize", "text_to_speech", "generate_speech"]:
+                    # Use TTS model
+                    tts_model = "tts-1" if model_name == "whisper-1" else model_name
+                    service = self.ai_factory.get_tts(tts_model, provider)
+                    actual_model_used = tts_model
+                # Traditional STT tasks
+                elif "transcribe" in task or "stt" in task or task in ["speech_to_text", "transcription"]:
+                    # Use STT model
+                    stt_model = "whisper-1" if model_name == "tts-1" else model_name
+                    service = self.ai_factory.get_stt(stt_model, provider)
+                    actual_model_used = stt_model
+                # Default to STT for backward compatibility
                 else:
-                    # Default to STT for unknown audio tasks
-                    service = self.ai_factory.get_stt(model_name, provider)
-
+                    # Use STT model by default
+                    stt_model = "whisper-1" if model_name == "tts-1" else model_name
+                    service = self.ai_factory.get_stt(stt_model, provider)
+                    actual_model_used = stt_model
             elif service_type == "text":
                 service = self.ai_factory.get_llm(model_name, provider)
-
+                actual_model_used = model_name
             elif service_type == "image":
                 service = self.ai_factory.get_img("t2i", model_name, provider)
-
+                actual_model_used = model_name
             elif service_type == "embedding":
                 service = self.ai_factory.get_embed(model_name, provider)
+                actual_model_used = model_name

-            else:
-                raise ValueError(f"Unsupported service type: {service_type}")
-
-            # Cache the service
-            self._service_cache[cache_key] = service
-
-            # If tools are needed, bind them to the service
-            if tools and service_type == "text":
-                return service.bind_tools(tools)
-
-            return service
+            # Cache the service and actual model (if caching is enabled)
+            if use_cache:
+                self._service_cache[cache_key] = (service, actual_model_used)
+            return service, actual_model_used

         except Exception as e:
             logger.error(f"Failed to get service {service_type}/{provider}/{model_name}: {e}")
             raise

+    def _validate_service_type(self, service_type: str) -> None:
+        """Validate service type is supported"""
+        if service_type not in self.SUPPORTED_SERVICE_TYPES:
+            raise ValueError(f"Unsupported service type: {service_type}")
+
+    def _map_task(self, task: str, service_type: str) -> str:
+        """Map common task names to unified task names"""
+        task_mapping = self.TASK_MAPPINGS.get(service_type, {})
+        return task_mapping.get(task, task)
+
     async def _execute_task(
         self,
         service: Any,
@@ -427,166 +927,119 @@ class ISAModelClient:
427
927
  """Execute the task using the appropriate service"""
428
928
 
429
929
  try:
930
+ self._validate_service_type(service_type)
931
+ unified_task = self._map_task(task, service_type)
932
+
430
933
  if service_type == "vision":
431
- return await self._execute_vision_task(service, input_data, task, **kwargs)
934
+ return await service.invoke(
935
+ image=input_data,
936
+ task=unified_task,
937
+ **kwargs
938
+ )
432
939
 
433
940
  elif service_type == "audio":
434
- return await self._execute_audio_task(service, input_data, task, **kwargs)
941
+ # Realtime audio tasks
942
+ if any(realtime_task in unified_task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
943
+ # For realtime text_chat and audio_chat, pass text parameter
944
+ if unified_task in ["text_chat", "audio_chat"]:
945
+ if isinstance(input_data, str):
946
+ kwargs['text'] = input_data
947
+ elif isinstance(input_data, bytes):
948
+ kwargs['audio_data'] = input_data
949
+ return await service.invoke(
950
+ task=unified_task,
951
+ **kwargs
952
+ )
953
+ # Traditional TTS tasks
954
+ elif unified_task in ["synthesize", "text_to_speech", "tts", "generate_speech"]:
955
+ return await service.invoke(
956
+ text=input_data,
957
+ task=unified_task,
958
+ **kwargs
959
+ )
960
+ # Traditional STT tasks
961
+ else:
962
+ return await service.invoke(
963
+ audio_input=input_data,
964
+ task=unified_task,
965
+ **kwargs
966
+ )
435
967
 
436
968
  elif service_type == "text":
437
- return await self._execute_text_task(service, input_data, task, **kwargs)
969
+ # Extract show_reasoning from kwargs if present
970
+ show_reasoning = kwargs.pop('show_reasoning', False)
971
+
972
+ # Check if service provider supports show_reasoning
973
+ # Only OpenAI services support this parameter
974
+ if hasattr(service, 'provider_name') and service.provider_name == 'openai':
975
+ result = await service.invoke(
976
+ input_data=input_data,
977
+ task=unified_task,
978
+ show_reasoning=show_reasoning,
979
+ **kwargs
980
+ )
981
+ else:
982
+ # For other providers like yyds, don't pass show_reasoning
983
+ result = await service.invoke(
984
+ input_data=input_data,
985
+ task=unified_task,
986
+ **kwargs
987
+ )
988
+
989
+ logger.debug(f"Service result type: {type(result)}")
990
+ logger.debug(f"Service result: {result}")
991
+
992
+ # Check if this is a formatted result from invoke method
993
+ if isinstance(result, dict) and 'formatted' in result:
994
+ # This is a formatted result from the new invoke method
995
+ logger.debug(f"Returning formatted result: {result}")
996
+ return result
997
+ elif isinstance(result, dict) and 'message' in result:
998
+ # This is a traditional message result
999
+ message = result['message']
1000
+ logger.debug(f"Extracted message type: {type(message)}")
1001
+ logger.debug(f"Extracted message length: {len(str(message)) if message else 0}")
1002
+
1003
+ # Handle AIMessage objects from LangChain
1004
+ if hasattr(message, 'content'):
1005
+ # Check if there are tool_calls
1006
+ if hasattr(message, 'tool_calls') and message.tool_calls:
1007
+ logger.debug(f"AIMessage contains tool_calls: {len(message.tool_calls)}")
1008
+ # Return a dict with both content and tool_calls
1009
+ return {
1010
+ "content": message.content if message.content else "",
1011
+ "tool_calls": message.tool_calls
1012
+ }
1013
+ else:
1014
+ content = message.content
1015
+ logger.debug(f"Extracted content from AIMessage: {len(content) if content else 0} chars")
1016
+ return content
1017
+ else:
1018
+ # Direct string message
1019
+ logger.debug(f"Returning direct message: {len(str(message)) if message else 0} chars")
1020
+ return message
1021
+ else:
1022
+ logger.debug(f"Returning result directly: {result}")
1023
+ return result

  elif service_type == "image":
- return await self._execute_image_task(service, input_data, task, **kwargs)
+ return await service.invoke(
+ prompt=input_data,
+ task=unified_task,
+ **kwargs
+ )

  elif service_type == "embedding":
- return await self._execute_embedding_task(service, input_data, task, **kwargs)
-
- else:
- raise ValueError(f"Unsupported service type: {service_type}")
+ return await service.invoke(
+ input_data=input_data,
+ task=unified_task,
+ **kwargs
+ )

  except Exception as e:
  logger.error(f"Task execution failed: {e}")
  raise
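The rewritten text branch above can hand back three different shapes: a pre-formatted dict (when the service already applied structured-output formatting), a dict with "content" and "tool_calls" (when the LangChain AIMessage carried tool calls), or a plain string. A minimal, hypothetical caller-side normalization, sketched by the editor for illustration and not part of the package:

def normalize_text_result(result):
    """Illustrative only: collapse the three return shapes of the text branch."""
    if isinstance(result, dict) and "tool_calls" in result:
        # AIMessage with tool calls: content plus the tool call list
        return result.get("content", ""), result["tool_calls"]
    if isinstance(result, dict) and "formatted" in result:
        # Structured-output result passed through unchanged
        return result, []
    # Plain string content (or any other passthrough value)
    return result, []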

- async def _execute_vision_task(self, service, input_data, task, **kwargs):
- """Execute vision-related tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "analyze_image": "analyze_image",
- "detect_ui_elements": "detect_ui",
- "extract_table": "extract_table",
- "extract_text": "extract_text",
- "ocr": "extract_text",
- "describe": "analyze_image"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method with proper parameters
- return await service.invoke(
- image=input_data,
- task=unified_task,
- **kwargs
- )
-
- async def _execute_audio_task(self, service, input_data, task, **kwargs):
- """Execute audio-related tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "generate_speech": "synthesize",
- "text_to_speech": "synthesize",
- "tts": "synthesize",
- "transcribe": "transcribe",
- "speech_to_text": "transcribe",
- "stt": "transcribe",
- "translate": "translate",
- "detect_language": "detect_language"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method with correct parameter name based on task type
- if unified_task in ["synthesize", "text_to_speech", "tts"]:
- # TTS services expect 'text' parameter
- return await service.invoke(
- text=input_data,
- task=unified_task,
- **kwargs
- )
- else:
- # STT services expect 'audio_input' parameter
- return await service.invoke(
- audio_input=input_data,
- task=unified_task,
- **kwargs
- )
-
- async def _execute_text_task(self, service, input_data, task, **kwargs):
- """Execute text-related tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "chat": "chat",
- "generate": "generate",
- "complete": "complete",
- "translate": "translate",
- "summarize": "summarize",
- "analyze": "analyze",
- "extract": "extract",
- "classify": "classify"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method
- result = await service.invoke(
- input_data=input_data,
- task=unified_task,
- **kwargs
- )
-
- # Handle the new response format from LLM services
- # LLM services now return {"message": ..., "success": ..., "metadata": ...}
- if isinstance(result, dict) and "message" in result:
- # Extract the message content (convert AIMessage to string)
- message = result["message"]
- if hasattr(message, 'content'):
- # Handle langchain AIMessage objects
- return message.content
- elif isinstance(message, str):
- return message
- else:
- # Fallback: convert to string
- return str(message)
-
- # Fallback for other service types or legacy format
- return result
-
- async def _execute_image_task(self, service, input_data, task, **kwargs):
- """Execute image generation tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "generate_image": "generate",
- "generate": "generate",
- "img2img": "img2img",
- "image_to_image": "img2img",
- "generate_batch": "generate_batch"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method
- return await service.invoke(
- prompt=input_data,
- task=unified_task,
- **kwargs
- )
-
- async def _execute_embedding_task(self, service, input_data, task, **kwargs):
- """Execute embedding tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "create_embedding": "embed",
- "embed": "embed",
- "embed_batch": "embed_batch",
- "chunk_and_embed": "chunk_and_embed",
- "similarity": "similarity",
- "find_similar": "find_similar"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method
- return await service.invoke(
- input_data=input_data,
- task=unified_task,
- **kwargs
- )

  def clear_cache(self):
  """Clear service cache"""
@@ -602,7 +1055,7 @@ class ISAModelClient:
  Returns:
  List of available models with metadata
  """
- if INTELLIGENT_SELECTOR_AVAILABLE:
+ if INTELLIGENT_SELECTOR_AVAILABLE and get_model_selector:
  try:
  if self.model_selector is None:
  self.model_selector = await get_model_selector(self.config)
@@ -636,7 +1089,7 @@ class ISAModelClient:

  for service_type, provider, model in test_services:
  try:
- await self._get_service(service_type, model, provider, "test")
+ service, _ = await self._get_service(service_type, model, provider, "test")
  health_status["services"][f"{service_type}_{provider}"] = "healthy"
  except Exception as e:
  health_status["services"][f"{service_type}_{provider}"] = f"error: {str(e)}"
@@ -649,17 +1102,35 @@ class ISAModelClient:
  "error": str(e)
  }

- async def _invoke_local(
+ def _handle_error(self, e: Exception, context: Dict[str, Any]) -> Dict[str, Any]:
+ """Handle errors consistently across methods"""
+ error_msg = f"Failed to {context.get('operation', 'execute')} {context.get('task', '')} on {context.get('service_type', '')}: {e}"
+ logger.error(error_msg)
+ return {
+ "success": False,
+ "error": str(e),
+ "metadata": context
+ }
+
+ async def _invoke_service_streaming(
  self,
- input_data: Union[str, bytes, Path, Dict[str, Any]],
+ input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
  task: str,
  service_type: str,
  model_hint: Optional[str] = None,
  provider_hint: Optional[str] = None,
- tools: Optional[List[Any]] = None,
+ output_format: Optional[str] = None,
+ json_schema: Optional[Dict] = None,
+ repair_attempts: Optional[int] = 3,
  **kwargs
  ) -> Dict[str, Any]:
- """Local invoke using AI Factory (original logic)"""
+ """Service invoke that returns streaming response with async generator"""
+
+ # Generate unique request ID for logging
+ request_id = generate_request_id()
+ start_time = datetime.now(timezone.utc)
+ execution_start_time = time.time()
+
  try:
  # Step 1: Select best model for this task
  selected_model = await self._select_model(
@@ -671,310 +1142,421 @@ class ISAModelClient:
  )

  # Step 2: Get appropriate service
- service = await self._get_service(
+ service, actual_model_used = await self._get_service(
  service_type=service_type,
  model_name=selected_model["model_id"],
  provider=selected_model["provider"],
  task=task,
- tools=tools
+ use_cache=False # Don't cache for streaming to avoid state issues
  )
+ # Update selected model with actual model used
+ selected_model["model_id"] = actual_model_used

- # Step 3: Execute task with unified interface
- result = await self._execute_task(
- service=service,
- input_data=input_data,
- task=task,
- service_type=service_type,
- **kwargs
- )
+ # Step 3: Handle tools for LLM services (bind tools if provided)
+ tools = kwargs.pop("tools", None)
+ if service_type == "text" and tools:
+ service, _ = await self._get_service(
+ service_type=service_type,
+ model_name=selected_model["model_id"],
+ provider=selected_model["provider"],
+ task=task,
+ use_cache=False
+ )
+ service = service.bind_tools(tools)
+
+ # Step 4: Ensure service supports streaming
+ if not hasattr(service, 'astream'):
+ raise ValueError(f"Service {selected_model['provider']}/{selected_model['model_id']} does not support streaming")

- # Step 4: Return unified response
+ # Step 5: Enable streaming on the service
+ if hasattr(service, 'streaming'):
+ service.streaming = True
+
+ # Step 6: Create async generator wrapper that yields tokens
+ async def stream_generator():
+ # Pass show_reasoning parameter if available for LLM services
+ if service_type == "text" and hasattr(service, 'astream'):
+ show_reasoning = kwargs.get('show_reasoning', False)
+ logger.debug(f"Stream generator: show_reasoning={show_reasoning}")
+ # Only pass show_reasoning to OpenAI providers
+ if 'show_reasoning' in kwargs and hasattr(service, 'provider_name') and service.provider_name == 'openai':
+ async for token in service.astream(input_data, show_reasoning=show_reasoning):
+ yield token
+ else:
+ async for token in service.astream(input_data):
+ yield token
+ else:
+ async for token in service.astream(input_data):
+ yield token
+
+ # Return response with stream generator and metadata
  return {
  "success": True,
- "result": result,
+ "stream": stream_generator(),
  "metadata": {
  "model_used": selected_model["model_id"],
  "provider": selected_model["provider"],
  "task": task,
  "service_type": service_type,
- "selection_reason": selected_model.get("reason", "Default selection")
+ "selection_reason": selected_model.get("reason", "Default selection"),
+ "streaming": True
  }
  }
  except Exception as e:
- logger.error(f"Local invoke failed: {e}")
+ logger.error(f"Streaming service invoke failed: {e}")
  raise
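For context, the streaming path above returns a dict whose "stream" key is the async generator built by stream_generator(). A short, hypothetical consumer sketch (editor's illustration, assuming an already-constructed ISAModelClient named client):

import asyncio

async def demo_stream(client):
    response = await client._invoke_service_streaming(
        input_data="Hello", task="chat", service_type="text")
    # metadata mirrors the dict assembled above, with "streaming": True
    assert response["success"] and response["metadata"]["streaming"]
    async for token in response["stream"]:
        print(token, end="", flush=True)

# asyncio.run(demo_stream(client))  # requires a configured client instance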
-
- async def _invoke_api(
+
+ async def _invoke_service(
  self,
- input_data: Union[str, bytes, Path, Dict[str, Any]],
+ input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
  task: str,
  service_type: str,
  model_hint: Optional[str] = None,
  provider_hint: Optional[str] = None,
+ stream: Optional[bool] = None,
+ output_format: Optional[str] = None,
+ json_schema: Optional[Dict] = None,
+ repair_attempts: Optional[int] = 3,
  **kwargs
  ) -> Dict[str, Any]:
- """API invoke using HTTP requests"""
+ """Direct service invoke - passes LangChain objects and tools directly to services"""

- # Handle file inputs
- if isinstance(input_data, Path):
- return await self._invoke_api_file(
- file_path=input_data,
- task=task,
+ # Generate unique request ID for logging
+ request_id = generate_request_id()
+ start_time = datetime.now(timezone.utc)
+ execution_start_time = time.time()
+
+ try:
+ # Step 1: Select best model for this task
+ selected_model = await self._select_model(
+ input_data=input_data,
+ task=task,
  service_type=service_type,
  model_hint=model_hint,
- provider_hint=provider_hint,
- **kwargs
+ provider_hint=provider_hint
  )
-
- # Handle binary data
- if isinstance(input_data, bytes):
- return await self._invoke_api_binary(
- data=input_data,
+
+ # Step 1.5: Log inference start
+ self.inference_logger.log_inference_start(
+ request_id=request_id,
+ service_type=service_type,
  task=task,
+ provider=selected_model["provider"],
+ model_name=selected_model["model_id"],
+ input_data=input_data if self.inference_logger.log_detailed_requests else None,
+ is_streaming=stream or False,
+ custom_metadata={
+ "selection_reason": selected_model.get("reason", "Default selection"),
+ "has_tools": "tools" in kwargs
+ }
+ )
+
+ # Step 2: Get appropriate service
+ service, actual_model_used = await self._get_service(
  service_type=service_type,
- model_hint=model_hint,
- provider_hint=provider_hint,
- **kwargs
+ model_name=selected_model["model_id"],
+ provider=selected_model["provider"],
+ task=task
  )
-
- # Handle text/JSON data
- payload = {
- "input_data": input_data,
- "task": task,
- "service_type": service_type,
- "model_hint": model_hint,
- "provider_hint": provider_hint,
- "parameters": kwargs
- }
-
- async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
+ # Update selected model with actual model used
+ selected_model["model_id"] = actual_model_used
+
+ # Step 3: Handle tools for LLM services (bind tools if provided)
+ tools = kwargs.pop("tools", None)
+ if service_type == "text" and tools:
+ service, _ = await self._get_service(
+ service_type=service_type,
+ model_name=selected_model["model_id"],
+ provider=selected_model["provider"],
+ task=task,
+ use_cache=False
+ )
+ service = service.bind_tools(tools)
+ # Note: streaming is still supported with tools
+
+ # Step 4: Set streaming for text services
+ if service_type == "text" and stream is not None:
+ if hasattr(service, 'streaming'):
+ service.streaming = stream
+
+ # Step 5: Execute task with unified interface
+ # Pass JSON formatting parameters to the service
+ task_kwargs = kwargs.copy()
+ if service_type == "text":
+ if output_format:
+ task_kwargs["output_format"] = output_format
+ if json_schema:
+ task_kwargs["json_schema"] = json_schema
+ if repair_attempts is not None:
+ task_kwargs["repair_attempts"] = repair_attempts
+
+ # Try to execute with rate limit detection
  try:
- async with session.post(
- f"{self.api_url}/api/v1/invoke",
- json=payload,
- headers=self.headers
- ) as response:
-
- if response.status == 200:
- return await response.json()
- else:
- error_data = await response.text()
- raise Exception(f"API error {response.status}: {error_data}")
-
+ result = await self._execute_task(
+ service=service,
+ input_data=input_data,
+ task=task,
+ service_type=service_type,
+ **task_kwargs
+ )
  except Exception as e:
- logger.error(f"API invoke failed: {e}")
- raise
-
- async def _invoke_api_file(
- self,
- file_path: Path,
- task: str,
- service_type: str,
- model_hint: Optional[str] = None,
- provider_hint: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
- """API file upload"""
-
- if not file_path.exists():
- raise FileNotFoundError(f"File not found: {file_path}")
-
- data = aiohttp.FormData()
- data.add_field('task', task)
- data.add_field('service_type', service_type)
-
- if model_hint:
- data.add_field('model_hint', model_hint)
- if provider_hint:
- data.add_field('provider_hint', provider_hint)
-
- data.add_field('file',
- open(file_path, 'rb'),
- filename=file_path.name,
- content_type='application/octet-stream')
-
- headers = {k: v for k, v in self.headers.items() if k != "Content-Type"}
-
- async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
- try:
- async with session.post(
- f"{self.api_url}/api/v1/invoke-file",
- data=data,
- headers=headers
- ) as response:
+ # Check if this is a rate limit error and we can fallback
+ if self._is_rate_limit_error(e) and service_type == "text":
+ # Ensure model selector is initialized
+ if not self.model_selector:
+ self.model_selector = await get_model_selector(self.config)

- if response.status == 200:
- return await response.json()
- else:
- error_data = await response.text()
- raise Exception(f"API error {response.status}: {error_data}")
-
- except Exception as e:
- logger.error(f"API file upload failed: {e}")
- raise
-
- async def _invoke_api_binary(
- self,
- data: bytes,
- task: str,
- service_type: str,
- model_hint: Optional[str] = None,
- provider_hint: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
- """API binary upload"""
-
- form_data = aiohttp.FormData()
- form_data.add_field('task', task)
- form_data.add_field('service_type', service_type)
-
- if model_hint:
- form_data.add_field('model_hint', model_hint)
- if provider_hint:
- form_data.add_field('provider_hint', provider_hint)
-
- form_data.add_field('file',
- data,
- filename='data.bin',
- content_type='application/octet-stream')
-
- headers = {k: v for k, v in self.headers.items() if k != "Content-Type"}
-
- async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
- try:
- async with session.post(
- f"{self.api_url}/api/v1/invoke-file",
- data=form_data,
- headers=headers
- ) as response:
+ # Get fallback model selection
+ fallback_selection = self.model_selector.get_rate_limit_fallback(
+ service_type=service_type,
+ original_provider=selected_model["provider"]
+ )

- if response.status == 200:
- return await response.json()
- else:
- error_data = await response.text()
- raise Exception(f"API error {response.status}: {error_data}")
+ if fallback_selection.get('success'):
+ fallback_model = fallback_selection.get('selected_model', {})
+ logger.info(f"Rate limit hit, switching to fallback: {fallback_model}")

- except Exception as e:
- logger.error(f"API binary upload failed: {e}")
- raise
-
- async def _stream_local(
- self,
- input_data: Union[str, bytes, Path, Dict[str, Any]],
- task: str,
- service_type: str,
- model_hint: Optional[str] = None,
- provider_hint: Optional[str] = None,
- tools: Optional[List[Any]] = None,
- **kwargs
- ):
- """Local streaming using AI Factory"""
- # Step 1: Select best model for this task
- selected_model = await self._select_model(
- input_data=input_data,
- task=task,
- service_type=service_type,
- model_hint=model_hint,
- provider_hint=provider_hint
- )
-
- # Step 2: Get appropriate service
- service = await self._get_service(
- service_type=service_type,
- model_name=selected_model["model_id"],
- provider=selected_model["provider"],
- task=task,
- tools=tools
- )
-
- # Step 3: Yield tokens from the stream
- async for token in service.astream(input_data):
- yield token
-
- async def _stream_api(
- self,
- input_data: Union[str, bytes, Path, Dict[str, Any]],
- task: str,
- service_type: str,
- model_hint: Optional[str] = None,
- provider_hint: Optional[str] = None,
- **kwargs
- ):
- """API streaming using Server-Sent Events (SSE)"""
-
- # Only support text streaming for now
- if not isinstance(input_data, (str, dict)):
- raise ValueError("API streaming only supports text input")
-
- payload = {
- "input_data": input_data,
- "task": task,
- "service_type": service_type,
- "model_hint": model_hint,
- "provider_hint": provider_hint,
- "stream": True,
- "parameters": kwargs
- }
-
- async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
- try:
- async with session.post(
- f"{self.api_url}/api/v1/stream",
- json=payload,
- headers=self.headers
- ) as response:
-
- if response.status == 200:
- # Parse SSE stream
- async for line in response.content:
- if line:
- line_str = line.decode().strip()
- if line_str.startswith("data: "):
- try:
- # Parse SSE data
- import json
- json_str = line_str[6:] # Remove "data: " prefix
- data = json.loads(json_str)
-
- if data.get("type") == "token" and "token" in data:
- yield data["token"]
- elif data.get("type") == "completion":
- # End of stream
- break
- elif data.get("type") == "error":
- raise Exception(f"Server error: {data.get('error')}")
-
- except json.JSONDecodeError:
- # Skip malformed lines
- continue
- else:
- error_data = await response.text()
- raise Exception(f"API streaming error {response.status}: {error_data}")
+ # Get fallback service
+ fallback_service, fallback_model_used = await self._get_service(
+ service_type=service_type,
+ model_name=fallback_model["model_id"],
+ provider=fallback_model["provider"],
+ task=task
+ )

- except Exception as e:
- logger.error(f"API streaming failed: {e}")
- raise
+ # Update selected model for metadata
+ selected_model = fallback_model
+ selected_model["model_id"] = fallback_model_used
+ selected_model["reason"] = "Rate limit fallback"
+
+ # Retry with fallback service
+ result = await self._execute_task(
+ service=fallback_service,
+ input_data=input_data,
+ task=task,
+ service_type=service_type,
+ **task_kwargs
+ )
+ else:
+ # No fallback available, re-raise original error
+ raise
+ else:
+ # Not a rate limit error or no fallback, re-raise
+ raise
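_is_rate_limit_error() is called in the fallback logic above but its body is not part of this hunk; a hypothetical string-based check with the same role might look like the sketch below (editor's illustration, not the package's implementation):

def _is_rate_limit_error(self, e: Exception) -> bool:
    # Hypothetical heuristic: treat HTTP 429 / quota wording as a rate limit hit
    text = str(e).lower()
    return "rate limit" in text or "429" in text or "too many requests" in text or "quota" in text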
+
+ # Step 6: Wait for billing tracking to complete, then get billing information
+ await asyncio.sleep(0.01) # Small delay to ensure billing tracking completes
+ billing_info = self._get_billing_info(service, selected_model["model_id"])
+
+ # Step 6.5: Calculate execution time and log completion
+ execution_time_ms = int((time.time() - execution_start_time) * 1000)
+
+ # Log inference completion
+ self.inference_logger.log_inference_complete(
+ request_id=request_id,
+ status="completed",
+ execution_time_ms=execution_time_ms,
+ input_tokens=billing_info.get("input_tokens"),
+ output_tokens=billing_info.get("output_tokens"),
+ estimated_cost_usd=billing_info.get("cost_usd"),
+ output_data=result if self.inference_logger.log_detailed_requests else None,
+ custom_metadata={
+ "billing_operation": billing_info.get("operation"),
+ "timestamp": billing_info.get("timestamp")
+ }
+ )
+
+ # Log detailed token usage if available
+ if billing_info.get("input_tokens") and billing_info.get("output_tokens"):
+ self.inference_logger.log_token_usage(
+ request_id=request_id,
+ provider=selected_model["provider"],
+ model_name=selected_model["model_id"],
+ prompt_tokens=billing_info.get("input_tokens"),
+ completion_tokens=billing_info.get("output_tokens"),
+ prompt_cost_usd=billing_info.get("cost_usd", 0) * 0.6 if billing_info.get("cost_usd") else None, # Rough estimate
+ completion_cost_usd=billing_info.get("cost_usd", 0) * 0.4 if billing_info.get("cost_usd") else None
+ )
+
+ # Handle formatting - check if result is already formatted
+ formatted_result = result
+ if service_type == "text" and output_format:
+ # Check if result is already formatted by the service
+ if isinstance(result, dict) and result.get("formatted"):
+ # Result is already formatted by the service
+ formatted_result = result.get("result", result)
+ billing_info["formatting"] = {
+ "output_format": output_format,
+ "format_success": True,
+ "format_method": "service_level",
+ "format_errors": result.get("format_errors", []),
+ "repaired": False,
+ "pre_formatted": True
+ }
+ else:
+ # Apply formatting at client level (fallback)
+ try:
+ service, _ = await self._get_service(
+ service_type=service_type,
+ model_name=selected_model["model_id"],
+ provider=selected_model["provider"],
+ task=task
+ )
+ if hasattr(service, 'format_structured_output'):
+ formatting_result = service.format_structured_output(
+ response=result,
+ output_format=output_format,
+ schema=json_schema,
+ repair_attempts=repair_attempts or 3
+ )
+ # Update result and add formatting metadata
+ if formatting_result.get("success") and formatting_result.get("data") is not None:
+ # Extract the actual formatted data
+ formatted_data = formatting_result["data"]
+
+ # For JSON output, ensure we return clean data
+ if output_format == "json" and isinstance(formatted_data, dict):
+ formatted_result = formatted_data
+ else:
+ formatted_result = formatted_data
+ else:
+ # Keep original result if formatting failed
+ formatted_result = result
+
+ # Add formatting info to metadata
+ billing_info["formatting"] = {
+ "output_format": output_format,
+ "format_success": formatting_result.get("success", False),
+ "format_method": formatting_result.get("method"),
+ "format_errors": formatting_result.get("errors", []),
+ "repaired": formatting_result.get("repaired", False),
+ "pre_formatted": False
+ }
+
+ except Exception as format_error:
+ logger.warning(f"Failed to apply output formatting: {format_error}")
+ # Continue with unformatted result
+ formatted_result = result
+ billing_info["formatting"] = {
+ "output_format": output_format,
+ "format_success": False,
+ "format_error": str(format_error)
+ }
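To illustrate how the output_format / json_schema / repair_attempts parameters handled above surface in a response, here is a hypothetical call sketched by the editor (the prompt and schema are invented; client is an already-constructed ISAModelClient):

async def structured_output_example(client):
    response = await client._invoke_service(
        input_data="Summarize the release notes as JSON",
        task="chat",
        service_type="text",
        output_format="json",
        json_schema={"type": "object", "properties": {"summary": {"type": "string"}}},
    )
    data = response["result"]  # parsed dict when formatting succeeded
    formatting = response["metadata"]["billing"]["formatting"]  # format_success, format_method, ...
    return data, formatting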
+
+ # Return unified response
+ response = {
+ "success": True,
+ "result": formatted_result,
+ "metadata": {
+ "request_id": request_id, # Include request ID for tracking
+ "model_used": selected_model["model_id"],
+ "provider": selected_model["provider"],
+ "task": task,
+ "service_type": service_type,
+ "selection_reason": selected_model.get("reason", "Default selection"),
+ "execution_time_ms": execution_time_ms,
+ "billing": billing_info
+ }
+ }
+
+ return response
+ except Exception as e:
+ # Calculate execution time even for errors
+ execution_time_ms = int((time.time() - execution_start_time) * 1000)
+
+ # Log inference error
+ error_type = type(e).__name__
+ error_message = str(e)
+
+ self.inference_logger.log_inference_complete(
+ request_id=request_id,
+ status="failed",
+ execution_time_ms=execution_time_ms,
+ error_message=error_message,
+ error_code=error_type,
+ custom_metadata={
+ "error_location": "client._invoke_service"
+ }
+ )
+
+ # Also log to the error table
+ self.inference_logger.log_error(
+ request_id=request_id,
+ error_type=error_type,
+ error_message=error_message,
+ provider=model_hint or "unknown",
+ model_name=provider_hint or "unknown"
+ )
+
+ logger.error(f"Service invoke failed: {e}")
+ raise
+
+ def _get_billing_info(self, service: Any, model_id: str) -> Dict[str, Any]:
+ """Extract billing information from service after task execution"""
+ try:
+ # Check if service has model_manager with billing_tracker
+ if hasattr(service, 'model_manager') and hasattr(service.model_manager, 'billing_tracker'):
+ billing_tracker = service.model_manager.billing_tracker
+
+ # Get the latest usage record for this model
+ model_records = [
+ record for record in billing_tracker.usage_records
+ if record.model_id == model_id
+ ]
+
+ if model_records:
+ # Get the most recent record
+ latest_record = max(model_records, key=lambda r: r.timestamp)
+
+ return {
+ "cost_usd": latest_record.cost_usd,
+ "input_tokens": latest_record.input_tokens,
+ "output_tokens": latest_record.output_tokens,
+ "total_tokens": latest_record.total_tokens,
+ "operation": latest_record.operation,
+ "timestamp": latest_record.timestamp,
+ "currency": "USD"
+ }
+
+ # Fallback: no billing info available
+ return {
+ "cost_usd": 0.0,
+ "input_tokens": None,
+ "output_tokens": None,
+ "total_tokens": None,
+ "operation": None,
+ "timestamp": None,
+ "currency": "USD",
+ "note": "Billing information not available"
+ }
+
+ except Exception as e:
+ logger.warning(f"Failed to get billing info: {e}")
+ return {
+ "cost_usd": 0.0,
+ "error": str(e),
+ "currency": "USD"
+ }
+
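The billing dict assembled by _get_billing_info() above ends up under response["metadata"]["billing"]; a small, hypothetical helper for reading it back out of a completed response (editor's illustration, not part of the package):

def summarize_cost(response):
    billing = response["metadata"]["billing"]
    tokens = (billing.get("input_tokens") or 0) + (billing.get("output_tokens") or 0)
    return f"{tokens} tokens, {billing.get('cost_usd', 0.0):.6f} {billing.get('currency', 'USD')}"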


  # Convenience function for quick access
  def create_client(
  config: Optional[Dict[str, Any]] = None,
- mode: str = "local",
- api_url: Optional[str] = None,
+ service_endpoint: Optional[str] = None,
  api_key: Optional[str] = None
  ) -> ISAModelClient:
  """Create ISA Model Client instance

  Args:
  config: Optional configuration
- mode: "local" for direct AI Factory, "api" for HTTP API calls
- api_url: API base URL (required if mode="api")
- api_key: API key for authentication (optional)
+ service_endpoint: Optional service endpoint URL (if None, uses local AI Factory)
+ api_key: Optional API key for authentication (can also be set via ISA_API_KEY env var)

  Returns:
  ISAModelClient instance
  """
- return ISAModelClient(config=config, mode=mode, api_url=api_url, api_key=api_key)
+ return ISAModelClient(config=config, service_endpoint=service_endpoint, api_key=api_key)
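The new create_client() signature above drops the mode/api_url pair in favor of a single optional service_endpoint; a hypothetical usage sketch by the editor (the endpoint URL and key are placeholders, and the key can also come from the ISA_API_KEY environment variable per the docstring):

local_client = create_client()  # no endpoint: direct local AI Factory
remote_client = create_client(
    service_endpoint="http://localhost:8000",  # placeholder URL
    api_key="sk-example",                      # placeholder key
)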


  # Export for easy import