isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
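The most visible API change in this release is in isa_model/client.py: the ISAModelClient constructor drops the mode/api_url pair in favor of a single optional service_endpoint, the model_hint/provider_hint arguments become model/provider, and a dedicated invoke_stream() async generator is added alongside invoke(). The following is a minimal usage sketch assembled from the docstrings and signatures in the diff below; the endpoint URL and API key are placeholders, not documented defaults.

```python
import asyncio
from isa_model.client import ISAModelClient

async def main():
    # Local mode: with no service_endpoint the client uses the AIFactory directly
    client = ISAModelClient()

    # Chat defaults to streaming; result["stream"] is an async generator
    result = await client.invoke("Write a story", "chat", "text")
    if "stream" in result:
        async for chunk in result["stream"]:
            print(chunk, end="", flush=True)
    else:
        print(result["result"])

    # Token-level streaming via the new invoke_stream() generator
    async for token in client.invoke_stream("Hello!", "chat", "text"):
        print(token, end="", flush=True)

    # Remote mode: route the same calls through a deployed isa_model API (placeholder URL/key)
    remote = ISAModelClient(service_endpoint="https://models.example.com", api_key="sk-...")
    vision = await remote.invoke("image.jpg", "analyze", "vision")
    print(vision["result"])
    await remote.close()

asyncio.run(main())
```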
isa_model/client.py CHANGED
@@ -2,17 +2,84 @@
 # -*- coding: utf-8 -*-

 """
-ISA Model Client - Unified interface for all AI services
-Provides intelligent model selection and simplified API
+ISA Model Client - Unified AI Service Interface
+===============================================
+
+Description:
+Unified client interface for the ISA Model platform, providing intelligent model selection and simplified API calls.
+
+Key features:
+- Unified interface for multimodal AI services: text, vision, audio, image generation, embeddings
+- Automatic intelligent model selection: chooses the most suitable model based on task type and input data
+- Streaming response support: real-time streaming text generation for a better user experience
+- Remote/local services: supports both local service calls and remote API calls
+- Cost tracking: automatically calculates and tracks API call costs
+- Tool support: LangChain tool integration to extend model capabilities
+- Caching: service instance caching to improve performance
+
+Input interface:
+- input_data: input data of multiple types (text, image paths, audio files, byte data, etc.)
+- task: task type (chat, analyze, generate_speech, transcribe, etc.)
+- service_type: service type (text, vision, audio, image, embedding)
+- model: optional model name (selected intelligently if not specified)
+- provider: optional provider name (openai, ollama, replicate, etc.)
+
+Output format:
+- Unified response dictionary containing result and metadata
+- Streaming responses: include a stream async generator
+- Non-streaming responses: include the result data
+- metadata: includes model info, billing info, selection reason, etc.
+
+Core dependencies:
+- isa_model.inference.ai_factory: AI service factory
+- isa_model.core.services.intelligent_model_selector: intelligent model selector
+- aiohttp: HTTP client (remote API mode)
+- asyncio: async programming support
+
+Usage example:
+```python
+# Create the client
+client = ISAModelClient()
+
+# Streaming text generation
+result = await client.invoke("Write a story", "chat", "text")
+async for token in result["stream"]:
+    print(token, end="", flush=True)
+
+# Image analysis
+result = await client.invoke("image.jpg", "analyze", "vision")
+print(result["result"])
+
+# Speech synthesis
+result = await client.invoke("Hello world", "generate_speech", "audio")
+print(result["result"])
+```
+
+Architecture notes:
+- Singleton pattern: ensures configuration consistency
+- Async support: all operations are asynchronous
+- Error handling: unified error handling and response format
+- Extensibility: supports new service providers and models
+
+Suggested improvements:
+1. Add request retries: handle unstable networks
+2. Add request rate limiting: avoid exceeding API limits
+3. Improve the caching strategy: support LRU caching and TTL expiry
+4. Add monitoring metrics: record latency, success rate, and similar metrics
+5. Support batching: improve throughput for large volumes of requests
+6. Add configuration validation: verify API keys and configuration at startup
 """

 import logging
 import asyncio
+import time
+import uuid
 from typing import Any, Dict, Optional, List, Union
 from pathlib import Path
-import aiohttp
+from datetime import datetime, timezone

 from isa_model.inference.ai_factory import AIFactory
+from isa_model.core.logging import get_inference_logger, generate_request_id

 try:
     from isa_model.core.services.intelligent_model_selector import IntelligentModelSelector, get_model_selector
@@ -36,41 +103,104 @@ class ISAModelClient:
         response = await client.invoke("audio.mp3", "transcribe", "audio")
     """

+    # Consolidated task mappings for all service types
+    TASK_MAPPINGS = {
+        "vision": {
+            # Core tasks (direct mapping)
+            "analyze": "analyze",
+            "describe": "describe",
+            "extract": "extract",
+            "detect": "detect",
+            "classify": "classify",
+            "compare": "compare",
+
+            # Common aliases (backward compatibility)
+            "analyze_image": "analyze",
+            "describe_image": "describe",
+            "extract_text": "extract",
+            "extract_table": "extract",
+            "detect_objects": "detect",
+            "detect_ui": "detect",
+            "detect_ui_elements": "detect",
+            "get_coordinates": "detect",
+            "ocr": "extract",
+            "ui_analysis": "analyze",
+            "navigation": "analyze"
+        },
+        "audio": {
+            "generate_speech": "synthesize",
+            "text_to_speech": "synthesize",
+            "tts": "synthesize",
+            "transcribe": "transcribe",
+            "speech_to_text": "transcribe",
+            "stt": "transcribe",
+            "translate": "translate",
+            "detect_language": "detect_language"
+        },
+        "text": {
+            "chat": "chat",
+            "generate": "generate",
+            "complete": "complete",
+            "translate": "translate",
+            "summarize": "summarize",
+            "analyze": "analyze",
+            "extract": "extract",
+            "classify": "classify"
+        },
+        "image": {
+            "generate_image": "generate",
+            "generate": "generate",
+            "img2img": "img2img",
+            "image_to_image": "img2img",
+            "generate_batch": "generate_batch"
+        },
+        "embedding": {
+            "create_embedding": "embed",
+            "embed": "embed",
+            "embed_batch": "embed_batch",
+            "chunk_and_embed": "chunk_and_embed",
+            "similarity": "similarity",
+            "find_similar": "find_similar",
+            "rerank": "rerank",
+            "rerank_documents": "rerank_documents",
+            "document_ranking": "document_ranking"
+        }
+    }
+
+    # Service type configuration
+    SUPPORTED_SERVICE_TYPES = {"vision", "audio", "text", "image", "embedding"}
+
     def __init__(self,
                  config: Optional[Dict[str, Any]] = None,
-                 mode: str = "local",
-                 api_url: Optional[str] = None,
+                 service_endpoint: Optional[str] = None,
                  api_key: Optional[str] = None):
         """Initialize ISA Model Client

         Args:
             config: Optional configuration override
-            mode: "local" for direct AI Factory, "api" for HTTP API calls
-            api_url: API base URL (required if mode="api")
-            api_key: API key for authentication (optional)
+            service_endpoint: Optional service endpoint URL (if None, uses local AI Factory)
+            api_key: Optional API key for authentication (can also be set via ISA_API_KEY env var)
         """
         self.config = config or {}
-        self.mode = mode
-        self.api_url = api_url.rstrip('/') if api_url else None
-        self.api_key = api_key
-
-        # Setup HTTP headers for API mode
-        if self.mode == "api":
-            if not self.api_url:
-                raise ValueError("api_url is required when mode='api'")
-
-            self.headers = {
-                "Content-Type": "application/json",
-                "User-Agent": "ISA-Model-Client/1.0.0"
-            }
-            if self.api_key:
-                self.headers["Authorization"] = f"Bearer {self.api_key}"
+        self.service_endpoint = service_endpoint
+
+        # Handle API key authentication
+        import os
+        self.api_key = api_key or os.getenv("ISA_API_KEY")
+        if self.api_key:
+            logger.info("API key provided for authentication")
+        else:
+            logger.debug("No API key provided - using anonymous access")

-        # Initialize AI Factory for local mode
-        if self.mode == "local":
+        # Initialize AI Factory for direct service access (when service_endpoint is None)
+        if not self.service_endpoint:
             self.ai_factory = AIFactory.get_instance()
         else:
             self.ai_factory = None
+            logger.info(f"Using remote service endpoint: {self.service_endpoint}")
+
+        # HTTP client for remote API calls
+        self._http_session = None

         # Initialize intelligent model selector
         self.model_selector = None
@@ -87,169 +217,474 @@ class ISAModelClient:
         # Cache for frequently used services
         self._service_cache: Dict[str, Any] = {}

+        # Initialize inference logger
+        self.inference_logger = get_inference_logger()
+
         logger.info("ISA Model Client initialized")

-    async def stream(
+    async def _get_http_session(self):
+        """Get or create HTTP session for remote API calls"""
+        if self._http_session is None:
+            import aiohttp
+            headers = {}
+
+            # Add API key authentication if available
+            if self.api_key:
+                headers["Authorization"] = f"Bearer {self.api_key}"
+                headers["X-API-Key"] = self.api_key
+
+            self._http_session = aiohttp.ClientSession(headers=headers)
+
+        return self._http_session
+
+    async def _make_api_request(self, endpoint: str, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Make HTTP request to remote API endpoint"""
+        if not self.service_endpoint:
+            raise ValueError("Service endpoint not configured for remote API calls")
+
+        session = await self._get_http_session()
+        url = f"{self.service_endpoint.rstrip('/')}/{endpoint.lstrip('/')}"
+
+        try:
+            async with session.post(url, json=data) as response:
+                if response.status == 401:
+                    raise Exception("Authentication required or invalid API key")
+                elif response.status == 403:
+                    raise Exception("Insufficient permissions")
+                elif not response.ok:
+                    error_detail = await response.text()
+                    raise Exception(f"API request failed ({response.status}): {error_detail}")
+
+                return await response.json()
+
+        except Exception as e:
+            logger.error(f"Remote API request failed: {e}")
+            raise
+
+    async def close(self):
+        """Close HTTP session and cleanup resources"""
+        if self._http_session:
+            await self._http_session.close()
+            self._http_session = None
+
+    async def _invoke_remote_api(
         self,
-        input_data: Union[str, bytes, Path, Dict[str, Any]],
-        task: str,
+        input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
+        task: str,
         service_type: str,
-        model_hint: Optional[str] = None,
-        provider_hint: Optional[str] = None,
+        model: Optional[str] = None,
+        provider: Optional[str] = None,
+        stream: Optional[bool] = None,
         **kwargs
-    ):
-        """
-        Streaming invoke method that yields tokens in real-time
-
-        Args:
-            input_data: Input data (text for LLM streaming)
-            task: Task to perform
-            service_type: Type of service (only "text" supports streaming)
-            model_hint: Optional model preference
-            provider_hint: Optional provider preference
-            **kwargs: Additional parameters
-
-        Yields:
-            Individual tokens as they arrive from the model
-
-        Example:
-            async for token in client.stream("Hello world", "chat", "text"):
-                print(token, end="", flush=True)
-        """
-        if service_type != "text":
-            raise ValueError("Streaming is only supported for text/LLM services")
-
+    ) -> Dict[str, Any]:
+        """Invoke remote API endpoint"""
         try:
-            if self.mode == "api":
-                async for token in self._stream_api(input_data, task, service_type, model_hint, provider_hint, **kwargs):
-                    yield token
+            # Prepare request data for unified API
+            request_data = {
+                "task": task,
+                "service_type": service_type,
+                **kwargs
+            }
+
+            # Add model and provider if specified
+            if model:
+                request_data["model"] = model
+            if provider:
+                request_data["provider"] = provider
+            # For remote API, disable streaming to get JSON response
+            request_data["stream"] = False
+
+            # Handle different input data types
+            if isinstance(input_data, (str, Path)):
+                request_data["input_data"] = str(input_data)
+            elif isinstance(input_data, (dict, list)):
+                request_data["input_data"] = input_data
             else:
-                async for token in self._stream_local(input_data, task, service_type, model_hint, provider_hint, **kwargs):
-                    yield token
+                # For binary data, convert to base64
+                import base64
+                if isinstance(input_data, bytes):
+                    request_data["input_data"] = base64.b64encode(input_data).decode()
+                    request_data["data_type"] = "base64"
+                else:
+                    request_data["input_data"] = str(input_data)
+
+            # Make API request
+            response = await self._make_api_request("api/v1/invoke", request_data)
+
+            return response
+
         except Exception as e:
-            logger.error(f"Failed to stream {task} on {service_type}: {e}")
-            raise
-
+            logger.error(f"Remote API invocation failed: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "metadata": {
+                    "task": task,
+                    "service_type": service_type,
+                    "endpoint": "remote"
+                }
+            }
+
     async def invoke(
         self,
-        input_data: Union[str, bytes, Path, Dict[str, Any]],
+        input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
         task: str,
         service_type: str,
-        model_hint: Optional[str] = None,
-        provider_hint: Optional[str] = None,
-        stream: bool = False,
-        tools: Optional[List[Any]] = None,
+        model: Optional[str] = None,
+        provider: Optional[str] = None,
+        stream: Optional[bool] = None,
+        show_reasoning: Optional[bool] = False,
+        output_format: Optional[str] = None,
+        json_schema: Optional[Dict] = None,
+        repair_attempts: Optional[int] = 3,
         **kwargs
-    ) -> Union[Dict[str, Any], object]:
+    ) -> Dict[str, Any]:
         """
         Unified invoke method with intelligent model selection

         Args:
-            input_data: Input data (image path, text, audio, etc.)
-            task: Task to perform (analyze_image, generate_speech, transcribe, etc.)
-            service_type: Type of service (vision, audio, text, image, embedding)
-            model_hint: Optional model preference
-            provider_hint: Optional provider preference
-            stream: Enable streaming for text services (returns AsyncGenerator)
-            tools: Optional list of tools for function calling (only for text services)
-            **kwargs: Additional task-specific parameters
+            input_data: Input data (str, LangChain messages, image path, audio, etc.)
+            task: Task to perform (chat, analyze_image, generate_speech, transcribe, etc.)
+            service_type: Type of service (text, vision, audio, image, embedding)
+            model: Model name (if None, uses intelligent selection)
+            provider: Provider name (if None, uses intelligent selection)
+            stream: Enable streaming for text tasks (default True for chat/generate tasks, supports tools)
+            show_reasoning: Show reasoning process for O4 models (uses Responses API)
+            **kwargs: Additional task-specific parameters (including tools for LangChain)

         Returns:
-            If stream=False: Unified response dictionary with result and metadata
-            If stream=True: AsyncGenerator yielding tokens (only for text services)
+            Unified response dictionary with result and metadata
+            For streaming: result["stream"] contains async generator
+            For non-streaming: result["result"] contains the response

         Examples:
-            # Vision tasks
-            await client.invoke("image.jpg", "analyze_image", "vision")
-            await client.invoke("screenshot.png", "detect_ui_elements", "vision")
-            await client.invoke("document.pdf", "extract_table", "vision")
-
-            # Audio tasks
-            await client.invoke("Hello world", "generate_speech", "audio")
-            await client.invoke("audio.mp3", "transcribe", "audio")
-
-            # Text tasks
-            await client.invoke("Translate this text", "translate", "text")
-            await client.invoke("What is AI?", "chat", "text")
+            # Text tasks with streaming (default for chat)
+            result = await client.invoke("Write a story", "chat", "text")
+            if "stream" in result:
+                async for chunk in result["stream"]:
+                    print(chunk, end="", flush=True)
+            else:
+                print(result["result"])

-            # Streaming text
-            async for token in await client.invoke("Hello", "chat", "text", stream=True):
-                print(token, end="", flush=True)
+            # Text tasks with tools (also supports streaming)
+            result = await client.invoke("What's the weather?", "chat", "text", tools=[get_weather])
+            if "stream" in result:
+                async for chunk in result["stream"]:
+                    print(chunk, end="", flush=True)
+            else:
+                print(result["result"])

-            # Text with tools
-            await client.invoke("What's 5+3?", "chat", "text", tools=[calculator_function])
+            # Vision tasks (always non-streaming)
+            result = await client.invoke("image.jpg", "analyze", "vision")
+            print(result["result"])

-            # Streaming with tools
-            async for token in await client.invoke("What's 5+3?", "chat", "text", stream=True, tools=[calculator_function]):
-                print(token, end="")
+            # Audio tasks
+            result = await client.invoke("Hello world", "generate_speech", "audio")
+            print(result["result"])

             # Image generation
-            await client.invoke("A beautiful sunset", "generate_image", "image")
+            result = await client.invoke("A beautiful sunset", "generate_image", "image")
+            print(result["result"])

             # Embedding
-            await client.invoke("Text to embed", "create_embedding", "embedding")
+            result = await client.invoke("Text to embed", "create_embedding", "embedding")
+            print(result["result"])
         """
         try:
-            # Handle streaming case
-            if stream:
-                if service_type != "text":
-                    raise ValueError("Streaming is only supported for text services")
-
-                if self.mode == "api":
-                    return self._stream_api(
-                        input_data=input_data,
-                        task=task,
-                        service_type=service_type,
-                        model_hint=model_hint,
-                        provider_hint=provider_hint,
-                        tools=tools,
-                        **kwargs
-                    )
+            # If using remote service endpoint, make API call
+            if self.service_endpoint:
+                return await self._invoke_remote_api(
+                    input_data=input_data,
+                    task=task,
+                    service_type=service_type,
+                    model=model,
+                    provider=provider,
+                    stream=stream,
+                    **kwargs
+                )
+
+            # Set default streaming for text tasks
+            if stream is None and service_type == "text":
+                if task in ["chat", "generate"]:
+                    stream = True  # Enable streaming for chat and generate tasks
                 else:
-                    return self._stream_local(
-                        input_data=input_data,
-                        task=task,
-                        service_type=service_type,
-                        model_hint=model_hint,
-                        provider_hint=provider_hint,
-                        tools=tools,
-                        **kwargs
-                    )
+                    stream = False  # Disable for other text tasks

-            # Route to appropriate mode for non-streaming
-            if self.mode == "api":
-                return await self._invoke_api(
+            # If streaming is enabled for text tasks, return streaming response
+            if stream and service_type == "text":
+                return await self._invoke_service_streaming(
                     input_data=input_data,
                     task=task,
                     service_type=service_type,
-                    model_hint=model_hint,
-                    provider_hint=provider_hint,
-                    tools=tools,
+                    model_hint=model,
+                    provider_hint=provider,
+                    show_reasoning=show_reasoning,  # Explicitly pass show_reasoning
+                    output_format=output_format,
+                    json_schema=json_schema,
+                    repair_attempts=repair_attempts,
                     **kwargs
                 )
             else:
-                return await self._invoke_local(
+                # Use regular non-streaming service
+                return await self._invoke_service(
                     input_data=input_data,
                     task=task,
                     service_type=service_type,
-                    model_hint=model_hint,
-                    provider_hint=provider_hint,
-                    tools=tools,
+                    model_hint=model,
+                    provider_hint=provider,
+                    stream=False,  # Force non-streaming
+                    output_format=output_format,
+                    json_schema=json_schema,
+                    repair_attempts=repair_attempts,
                     **kwargs
                 )

         except Exception as e:
-            logger.error(f"Failed to invoke {task} on {service_type}: {e}")
-            return {
-                "success": False,
-                "error": str(e),
-                "metadata": {
-                    "task": task,
-                    "service_type": service_type,
-                    "input_type": type(input_data).__name__
-                }
-            }
+            return self._handle_error(e, {
+                "operation": "invoke",
+                "task": task,
+                "service_type": service_type,
+                "input_type": type(input_data).__name__
+            })
+
+    async def invoke_stream(
+        self,
+        input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
+        task: str,
+        service_type: str,
+        model: Optional[str] = None,
+        provider: Optional[str] = None,
+        return_metadata: bool = False,
+        **kwargs
+    ):
+        """
+        Unified streaming invoke method - returns async generator for real-time token streaming
+
+        Args:
+            input_data: Input data (str, LangChain messages, image path, audio, etc.)
+            task: Task to perform (chat, analyze_image, generate_speech, transcribe, etc.)
+            service_type: Type of service (text, vision, audio, image, embedding)
+            model: Model name (if None, uses intelligent selection)
+            provider: Provider name (if None, uses intelligent selection)
+            return_metadata: If True, yields ('metadata', metadata_dict) as final item
+            **kwargs: Additional task-specific parameters (including tools for LangChain)
+
+        Returns:
+            For text services: AsyncGenerator[Union[str, Tuple[str, Dict]], None] - yields tokens as they arrive
+            - Normal items: token strings
+            - Final item (if return_metadata=True): ('metadata', metadata_dict) with billing info
+            For other services: Raises ValueError (streaming not supported)
+
+        Examples:
+            # Simple streaming
+            async for token in client.invoke_stream("Hello!", "chat", "text"):
+                print(token, end='', flush=True)
+
+            # Streaming with metadata
+            async for item in client.invoke_stream("Hello!", "chat", "text", return_metadata=True):
+                if isinstance(item, tuple) and item[0] == 'metadata':
+                    print(f"\nBilling: {item[1]['billing']}")
+                else:
+                    print(item, end='', flush=True)
+        """
+        try:
+            # Only text services support streaming
+            if service_type != "text":
+                raise ValueError(f"Streaming not supported for service type: {service_type}")
+
+            # Tools are supported with streaming
+
+            # Step 1: Select best model for this task
+            selected_model = await self._select_model(
+                input_data=input_data,
+                task=task,
+                service_type=service_type,
+                model_hint=model,
+                provider_hint=provider
+            )
+
+            # Step 2: Get appropriate service
+            service, _ = await self._get_service(
+                service_type=service_type,
+                model_name=selected_model["model_id"],
+                provider=selected_model["provider"],
+                task=task,
+                use_cache=False  # Don't cache for streaming to avoid state issues
+            )
+
+            # Step 3: Ensure service supports streaming
+            if not hasattr(service, 'astream'):
+                raise ValueError(f"Service {selected_model['provider']}/{selected_model['model_id']} does not support streaming")
+
+            # Step 4: Enable streaming on the service
+            if hasattr(service, 'streaming'):
+                service.streaming = True
+
+            # Step 5: Stream tokens and collect for billing
+            content_chunks = []
+            async for token in service.astream(input_data):
+                content_chunks.append(token)
+                # Only yield string tokens for streaming (filter out dict/objects)
+                if isinstance(token, str):
+                    yield token
+
+            # Step 6: After streaming is complete, calculate billing info and optionally return metadata
+            try:
+                await asyncio.sleep(0.01)  # Small delay to ensure billing tracking completes
+
+                # Get billing info (similar to _invoke_service)
+                billing_info = self._get_billing_info(service, selected_model["model_id"])
+
+                # Log billing info for tracking
+                logger.info(f"Streaming completed - Model: {selected_model['model_id']}, "
+                            f"Tokens: {billing_info.get('total_tokens', 'N/A')}, "
+                            f"Cost: ${billing_info.get('cost_usd', 0):.4f}")
+
+                # Return metadata if requested
+                if return_metadata:
+                    metadata = {
+                        "model_used": selected_model["model_id"],
+                        "provider": selected_model["provider"],
+                        "task": task,
+                        "service_type": service_type,
+                        "selection_reason": selected_model.get("reason", "Default selection"),
+                        "billing": billing_info,
+                        "streaming": True,
+                        "tokens_streamed": len(content_chunks),
+                        "content_length": len("".join(str(chunk) if isinstance(chunk, str) else "" for chunk in content_chunks))
+                    }
+                    yield ('metadata', metadata)
+
+            except Exception as billing_error:
+                logger.warning(f"Failed to track billing for streaming: {billing_error}")
+                if return_metadata:
+                    # Return fallback metadata even if billing fails
+                    fallback_metadata = {
+                        "model_used": selected_model["model_id"],
+                        "provider": selected_model["provider"],
+                        "task": task,
+                        "service_type": service_type,
+                        "selection_reason": selected_model.get("reason", "Default selection"),
+                        "billing": {
+                            "cost_usd": 0.0,
+                            "error": str(billing_error),
+                            "currency": "USD"
+                        },
+                        "streaming": True,
+                        "tokens_streamed": len(content_chunks),
+                        "content_length": len("".join(str(chunk) if isinstance(chunk, str) else "" for chunk in content_chunks))
+                    }
+                    yield ('metadata', fallback_metadata)
+
+        except Exception as e:
+            logger.error(f"Streaming invoke failed: {e}")
+            raise
+
+    def _is_rate_limit_error(self, error: Exception) -> bool:
+        """Check if an error is due to rate limiting"""
+        error_str = str(error).lower()
+
+        # Check for common rate limit indicators
+        rate_limit_indicators = [
+            'rate limit',
+            'rate_limit',
+            'ratelimit',
+            'too many requests',
+            'quota exceeded',
+            'limit exceeded',
+            'throttled',
+            '429'
+        ]
+
+        return any(indicator in error_str for indicator in rate_limit_indicators)
+
+    async def _invoke_with_fallback(
+        self,
+        service_type: str,
+        task: str,
+        input_data: Any,
+        selected_model: Dict[str, Any],
+        **kwargs
+    ) -> Any:
+        """Invoke service with automatic fallback on rate limit"""
+        try:
+            # First attempt with selected model
+            return await self._invoke_service_direct(service_type, task, input_data, selected_model, **kwargs)
+        except Exception as e:
+            # Check if this is a rate limit error
+            if self._is_rate_limit_error(e):
+                logger.warning(f"Rate limit detected for {selected_model['provider']}: {e}")
+
+                # Try to get fallback model using intelligent model selector
+                if INTELLIGENT_SELECTOR_AVAILABLE and self.model_selector:
+                    try:
+                        fallback_selection = self.model_selector.get_rate_limit_fallback(
+                            service_type,
+                            selected_model['provider']
+                        )
+
+                        if fallback_selection.get('success') and fallback_selection.get('is_fallback'):
+                            fallback_model = fallback_selection['selected_model']
+                            logger.info(f"Switching to fallback: {fallback_model['provider']}/{fallback_model['model_id']}")
+
+                            # Retry with fallback model
+                            return await self._invoke_service_direct(service_type, task, input_data, fallback_model, **kwargs)
+                    except Exception as fallback_error:
+                        logger.error(f"Fallback also failed: {fallback_error}")
+                        raise e  # Raise original rate limit error
+
+            # Re-raise the original error if not rate limit or fallback failed
+            raise

+    async def _invoke_service_direct(
+        self,
+        service_type: str,
+        task: str,
+        input_data: Any,
+        model_config: Dict[str, Any],
+        **kwargs
+    ) -> Any:
+        """Direct service invocation without fallback logic"""
+        # Get appropriate service
+        factory = AIFactory.get_instance()
+
+        # Create service with the specified model
+        if service_type == "text":
+            service = factory.get_llm(model_config["model_id"], model_config["provider"])
+        elif service_type == "vision":
+            service = factory.get_vision(model_config["model_id"], model_config["provider"])
+        elif service_type == "audio":
+            service = factory.get_audio(model_config["model_id"], model_config["provider"])
+        elif service_type == "image":
+            service = factory.get_image(model_config["model_id"], model_config["provider"])
+        elif service_type == "embedding":
+            service = factory.get_embed(model_config["model_id"], model_config["provider"])
+        else:
+            raise ValueError(f"Unsupported service type: {service_type}")
+
+        # Invoke the service
+        if service_type == "text":
+            show_reasoning = kwargs.pop('show_reasoning', False)
+
+            # Check if service supports show_reasoning parameter (mainly OpenAI services)
+            if model_config["provider"] == "openai":
+                result = await service.invoke(
+                    input_data=input_data,
+                    task=task,
+                    show_reasoning=show_reasoning,
+                    **kwargs
+                )
+            else:
+                # For other providers like yyds, don't pass show_reasoning
+                result = await service.invoke(
+                    input_data=input_data,
+                    task=task,
+                    **kwargs
+                )
+            return result
+        else:
+            return await service.invoke(input_data=input_data, task=task, **kwargs)
+
     async def _select_model(
         self,
         input_data: Any,
@@ -268,8 +703,26 @@ class ISAModelClient:
                 "reason": "User specified"
             }

+        # If model_hint provided but no provider_hint, handle special cases
+        if model_hint:
+            # Special handling for hybrid service
+            if model_hint == "hybrid":
+                return {
+                    "model_id": model_hint,
+                    "provider": "hybrid",
+                    "reason": "Hybrid service requested"
+                }
+            # If only model_hint provided, use default provider for that service type
+            elif provider_hint is None:
+                default_provider = self._get_default_provider(service_type)
+                return {
+                    "model_id": model_hint,
+                    "provider": default_provider,
+                    "reason": "Model specified with default provider"
+                }
+
         # Use intelligent model selector if available
-        if INTELLIGENT_SELECTOR_AVAILABLE:
+        if INTELLIGENT_SELECTOR_AVAILABLE and get_model_selector:
             try:
                 # Initialize model selector if not already done
                 if self.model_selector is None:
@@ -304,6 +757,17 @@ class ISAModelClient:
             # Fallback to default model selection
             return self._get_default_model(service_type, task, provider_hint)

+    def _get_default_provider(self, service_type: str) -> str:
+        """Get default provider for service type"""
+        defaults = {
+            "vision": "openai",
+            "audio": "openai",
+            "text": "openai",
+            "image": "replicate",
+            "embedding": "openai"
+        }
+        return defaults.get(service_type, "openai")
+
     def _get_default_model(
         self,
         service_type: str,
@@ -314,16 +778,17 @@ class ISAModelClient:

         defaults = {
             "vision": {
-                "model_id": "gpt-4o-mini",
+                "model_id": "gpt-4.1-nano",
                 "provider": "openai"
             },
             "audio": {
                 "tts": {"model_id": "tts-1", "provider": "openai"},
                 "stt": {"model_id": "whisper-1", "provider": "openai"},
+                "realtime": {"model_id": "gpt-4o-realtime-preview-2024-10-01", "provider": "openai"},
                 "default": {"model_id": "whisper-1", "provider": "openai"}
             },
             "text": {
-                "model_id": "gpt-4.1-mini",
+                "model_id": "gpt-4.1-nano",
                 "provider": "openai"
             },
             "image": {
@@ -331,19 +796,33 @@ class ISAModelClient:
                 "provider": "replicate"
             },
             "embedding": {
-                "model_id": "text-embedding-3-small",
-                "provider": "openai"
+                "embed": {"model_id": "text-embedding-3-small", "provider": "openai"},
+                "rerank": {"model_id": "isa-jina-reranker-v2-service", "provider": "isa"},
+                "default": {"model_id": "text-embedding-3-small", "provider": "openai"}
             }
         }

         # Handle audio service type with task-specific models
         if service_type == "audio":
-            if "speech" in task or "tts" in task:
+            # Realtime audio tasks
+            if any(realtime_task in task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+                default = defaults["audio"]["realtime"]
+            # Traditional TTS tasks
+            elif "speech" in task or "tts" in task or task in ["synthesize", "text_to_speech", "generate_speech"]:
                 default = defaults["audio"]["tts"]
-            elif "transcribe" in task or "stt" in task:
+            # Traditional STT tasks
+            elif "transcribe" in task or "stt" in task or task in ["speech_to_text", "transcription"]:
                 default = defaults["audio"]["stt"]
             else:
                 default = defaults["audio"]["default"]
+        # Handle embedding service type with task-specific models
+        elif service_type == "embedding":
+            if "rerank" in task:
+                default = defaults["embedding"]["rerank"]
+            elif "embed" in task:
+                default = defaults["embedding"]["embed"]
+            else:
+                default = defaults["embedding"]["default"]
         else:
             default = defaults.get(service_type, defaults["vision"])

@@ -363,59 +842,80 @@
         model_name: str,
         provider: str,
         task: str,
-        tools: Optional[List[Any]] = None
-    ) -> Any:
-        """Get appropriate service instance"""
+        use_cache: bool = True
+    ) -> tuple[Any, str]:
+        """Get appropriate service instance and return actual model used"""

-        cache_key = f"{service_type}_{provider}_{model_name}"
+        cache_key = f"{service_type}_{provider}_{model_name}_{task}"
+        actual_model_used = model_name  # Track the actual model used

-        # Check cache first
-        if cache_key in self._service_cache:
-            service = self._service_cache[cache_key]
-            # If tools are needed, bind them to the service
-            if tools and service_type == "text":
-                return service.bind_tools(tools)
-            return service
+        # Check cache first (if caching is enabled)
+        if use_cache and cache_key in self._service_cache:
+            cached_service, cached_model = self._service_cache[cache_key]
+            return cached_service, cached_model

         try:
+            # Validate service type
+            self._validate_service_type(service_type)
+
             # Route to appropriate AIFactory method
             if service_type == "vision":
                 service = self.ai_factory.get_vision(model_name, provider)
-
+                actual_model_used = model_name
             elif service_type == "audio":
-                if "speech" in task or "tts" in task:
-                    service = self.ai_factory.get_tts(model_name, provider)
-                elif "transcribe" in task or "stt" in task:
-                    service = self.ai_factory.get_stt(model_name, provider)
+                # Realtime audio tasks
+                if any(realtime_task in task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+                    # Use realtime model
+                    realtime_model = "gpt-4o-realtime-preview-2024-10-01" if model_name == "tts-1" or model_name == "whisper-1" else model_name
+                    service = self.ai_factory.get_realtime(realtime_model, provider)
+                    actual_model_used = realtime_model
+                # Traditional TTS tasks
+                elif "speech" in task or "tts" in task or task in ["synthesize", "text_to_speech", "generate_speech"]:
+                    # Use TTS model
+                    tts_model = "tts-1" if model_name == "whisper-1" else model_name
+                    service = self.ai_factory.get_tts(tts_model, provider)
+                    actual_model_used = tts_model
+                # Traditional STT tasks
+                elif "transcribe" in task or "stt" in task or task in ["speech_to_text", "transcription"]:
+                    # Use STT model
+                    stt_model = "whisper-1" if model_name == "tts-1" else model_name
+                    service = self.ai_factory.get_stt(stt_model, provider)
+                    actual_model_used = stt_model
+                # Default to STT for backward compatibility
                 else:
-                    # Default to STT for unknown audio tasks
-                    service = self.ai_factory.get_stt(model_name, provider)
-
+                    # Use STT model by default
+                    stt_model = "whisper-1" if model_name == "tts-1" else model_name
+                    service = self.ai_factory.get_stt(stt_model, provider)
+                    actual_model_used = stt_model
             elif service_type == "text":
                 service = self.ai_factory.get_llm(model_name, provider)
-
+                actual_model_used = model_name
             elif service_type == "image":
                 service = self.ai_factory.get_img("t2i", model_name, provider)
-
+                actual_model_used = model_name
             elif service_type == "embedding":
                 service = self.ai_factory.get_embed(model_name, provider)
+                actual_model_used = model_name

-            else:
-                raise ValueError(f"Unsupported service type: {service_type}")
-
-            # Cache the service
-            self._service_cache[cache_key] = service
-
-            # If tools are needed, bind them to the service
-            if tools and service_type == "text":
-                return service.bind_tools(tools)
-
-            return service
+            # Cache the service and actual model (if caching is enabled)
+            if use_cache:
+                self._service_cache[cache_key] = (service, actual_model_used)
+            return service, actual_model_used

         except Exception as e:
             logger.error(f"Failed to get service {service_type}/{provider}/{model_name}: {e}")
             raise

+    def _validate_service_type(self, service_type: str) -> None:
+        """Validate service type is supported"""
+        if service_type not in self.SUPPORTED_SERVICE_TYPES:
+            raise ValueError(f"Unsupported service type: {service_type}")
+
+    def _map_task(self, task: str, service_type: str) -> str:
+        """Map common task names to unified task names"""
+        task_mapping = self.TASK_MAPPINGS.get(service_type, {})
+        return task_mapping.get(task, task)
+
     async def _execute_task(
         self,
         service: Any,
@@ -427,166 +927,119 @@ class ISAModelClient:
427
927
  """Execute the task using the appropriate service"""
428
928
 
429
929
  try:
930
+ self._validate_service_type(service_type)
931
+ unified_task = self._map_task(task, service_type)
932
+
430
933
  if service_type == "vision":
431
- return await self._execute_vision_task(service, input_data, task, **kwargs)
934
+ return await service.invoke(
935
+ image=input_data,
936
+ task=unified_task,
937
+ **kwargs
938
+ )
432
939
 
433
940
  elif service_type == "audio":
434
- return await self._execute_audio_task(service, input_data, task, **kwargs)
941
+ # Realtime audio tasks
942
+ if any(realtime_task in unified_task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
943
+ # For realtime text_chat and audio_chat, pass text parameter
944
+ if unified_task in ["text_chat", "audio_chat"]:
945
+ if isinstance(input_data, str):
946
+ kwargs['text'] = input_data
947
+ elif isinstance(input_data, bytes):
948
+ kwargs['audio_data'] = input_data
949
+ return await service.invoke(
950
+ task=unified_task,
951
+ **kwargs
952
+ )
953
+ # Traditional TTS tasks
954
+ elif unified_task in ["synthesize", "text_to_speech", "tts", "generate_speech"]:
955
+ return await service.invoke(
956
+ text=input_data,
957
+ task=unified_task,
958
+ **kwargs
959
+ )
960
+ # Traditional STT tasks
961
+ else:
962
+ return await service.invoke(
963
+ audio_input=input_data,
964
+ task=unified_task,
965
+ **kwargs
966
+ )
435
967
 
436
968
  elif service_type == "text":
437
- return await self._execute_text_task(service, input_data, task, **kwargs)
969
+ # Extract show_reasoning from kwargs if present
970
+ show_reasoning = kwargs.pop('show_reasoning', False)
971
+
972
+ # Check if service provider supports show_reasoning
973
+ # Only OpenAI services support this parameter
974
+ if hasattr(service, 'provider_name') and service.provider_name == 'openai':
975
+ result = await service.invoke(
976
+ input_data=input_data,
977
+ task=unified_task,
978
+ show_reasoning=show_reasoning,
979
+ **kwargs
980
+ )
981
+ else:
982
+ # For other providers like yyds, don't pass show_reasoning
983
+ result = await service.invoke(
984
+ input_data=input_data,
985
+ task=unified_task,
986
+ **kwargs
987
+ )
988
+
989
+ logger.debug(f"Service result type: {type(result)}")
990
+ logger.debug(f"Service result: {result}")
991
+
992
+ # Check if this is a formatted result from invoke method
993
+ if isinstance(result, dict) and 'formatted' in result:
994
+ # This is a formatted result from the new invoke method
995
+ logger.debug(f"Returning formatted result: {result}")
996
+ return result
997
+ elif isinstance(result, dict) and 'message' in result:
998
+ # This is a traditional message result
999
+ message = result['message']
1000
+ logger.debug(f"Extracted message type: {type(message)}")
1001
+ logger.debug(f"Extracted message length: {len(str(message)) if message else 0}")
1002
+
1003
+ # Handle AIMessage objects from LangChain
1004
+ if hasattr(message, 'content'):
1005
+ # Check if there are tool_calls
1006
+ if hasattr(message, 'tool_calls') and message.tool_calls:
1007
+ logger.debug(f"AIMessage contains tool_calls: {len(message.tool_calls)}")
1008
+ # Return a dict with both content and tool_calls
1009
+ return {
1010
+ "content": message.content if message.content else "",
1011
+ "tool_calls": message.tool_calls
1012
+ }
1013
+ else:
1014
+ content = message.content
1015
+ logger.debug(f"Extracted content from AIMessage: {len(content) if content else 0} chars")
1016
+ return content
1017
+ else:
1018
+ # Direct string message
1019
+ logger.debug(f"Returning direct message: {len(str(message)) if message else 0} chars")
1020
+ return message
1021
+ else:
1022
+ logger.debug(f"Returning result directly: {result}")
1023
+ return result

  elif service_type == "image":
- return await self._execute_image_task(service, input_data, task, **kwargs)
+ return await service.invoke(
+ prompt=input_data,
+ task=unified_task,
+ **kwargs
+ )

  elif service_type == "embedding":
- return await self._execute_embedding_task(service, input_data, task, **kwargs)
-
- else:
- raise ValueError(f"Unsupported service type: {service_type}")
+ return await service.invoke(
+ input_data=input_data,
+ task=unified_task,
+ **kwargs
+ )

  except Exception as e:
  logger.error(f"Task execution failed: {e}")
  raise
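The rewritten text branch above can hand back three different shapes: a pre-formatted dict (when the service already applied structured-output formatting), a dict with "content" and "tool_calls" (when the LangChain AIMessage carried tool calls), or a plain string. A minimal, hypothetical caller-side normalization, sketched by the editor for illustration and not part of the package:

def normalize_text_result(result):
    """Illustrative only: collapse the three return shapes of the text branch."""
    if isinstance(result, dict) and "tool_calls" in result:
        # AIMessage with tool calls: content plus the tool call list
        return result.get("content", ""), result["tool_calls"]
    if isinstance(result, dict) and "formatted" in result:
        # Structured-output result passed through unchanged
        return result, []
    # Plain string content (or any other passthrough value)
    return result, []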

- async def _execute_vision_task(self, service, input_data, task, **kwargs):
- """Execute vision-related tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "analyze_image": "analyze_image",
- "detect_ui_elements": "detect_ui",
- "extract_table": "extract_table",
- "extract_text": "extract_text",
- "ocr": "extract_text",
- "describe": "analyze_image"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method with proper parameters
- return await service.invoke(
- image=input_data,
- task=unified_task,
- **kwargs
- )
-
- async def _execute_audio_task(self, service, input_data, task, **kwargs):
- """Execute audio-related tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "generate_speech": "synthesize",
- "text_to_speech": "synthesize",
- "tts": "synthesize",
- "transcribe": "transcribe",
- "speech_to_text": "transcribe",
- "stt": "transcribe",
- "translate": "translate",
- "detect_language": "detect_language"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method with correct parameter name based on task type
- if unified_task in ["synthesize", "text_to_speech", "tts"]:
- # TTS services expect 'text' parameter
- return await service.invoke(
- text=input_data,
- task=unified_task,
- **kwargs
- )
- else:
- # STT services expect 'audio_input' parameter
- return await service.invoke(
- audio_input=input_data,
- task=unified_task,
- **kwargs
- )
-
- async def _execute_text_task(self, service, input_data, task, **kwargs):
- """Execute text-related tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "chat": "chat",
- "generate": "generate",
- "complete": "complete",
- "translate": "translate",
- "summarize": "summarize",
- "analyze": "analyze",
- "extract": "extract",
- "classify": "classify"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method
- result = await service.invoke(
- input_data=input_data,
- task=unified_task,
- **kwargs
- )
-
- # Handle the new response format from LLM services
- # LLM services now return {"message": ..., "success": ..., "metadata": ...}
- if isinstance(result, dict) and "message" in result:
- # Extract the message content (convert AIMessage to string)
- message = result["message"]
- if hasattr(message, 'content'):
- # Handle langchain AIMessage objects
- return message.content
- elif isinstance(message, str):
- return message
- else:
- # Fallback: convert to string
- return str(message)
-
- # Fallback for other service types or legacy format
- return result
-
- async def _execute_image_task(self, service, input_data, task, **kwargs):
- """Execute image generation tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "generate_image": "generate",
- "generate": "generate",
- "img2img": "img2img",
- "image_to_image": "img2img",
- "generate_batch": "generate_batch"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method
- return await service.invoke(
- prompt=input_data,
- task=unified_task,
- **kwargs
- )
-
- async def _execute_embedding_task(self, service, input_data, task, **kwargs):
- """Execute embedding tasks using unified invoke method"""
-
- # Map common task names to unified task names
- task_mapping = {
- "create_embedding": "embed",
- "embed": "embed",
- "embed_batch": "embed_batch",
- "chunk_and_embed": "chunk_and_embed",
- "similarity": "similarity",
- "find_similar": "find_similar"
- }
-
- unified_task = task_mapping.get(task, task)
-
- # Use unified invoke method
- return await service.invoke(
- input_data=input_data,
- task=unified_task,
- **kwargs
- )

  def clear_cache(self):
  """Clear service cache"""
@@ -602,7 +1055,7 @@ class ISAModelClient:
  Returns:
  List of available models with metadata
  """
- if INTELLIGENT_SELECTOR_AVAILABLE:
+ if INTELLIGENT_SELECTOR_AVAILABLE and get_model_selector:
  try:
  if self.model_selector is None:
  self.model_selector = await get_model_selector(self.config)
@@ -636,7 +1089,7 @@ class ISAModelClient:

  for service_type, provider, model in test_services:
  try:
- await self._get_service(service_type, model, provider, "test")
+ service, _ = await self._get_service(service_type, model, provider, "test")
  health_status["services"][f"{service_type}_{provider}"] = "healthy"
  except Exception as e:
  health_status["services"][f"{service_type}_{provider}"] = f"error: {str(e)}"
@@ -649,17 +1102,35 @@ class ISAModelClient:
  "error": str(e)
  }

- async def _invoke_local(
+ def _handle_error(self, e: Exception, context: Dict[str, Any]) -> Dict[str, Any]:
+ """Handle errors consistently across methods"""
+ error_msg = f"Failed to {context.get('operation', 'execute')} {context.get('task', '')} on {context.get('service_type', '')}: {e}"
+ logger.error(error_msg)
+ return {
+ "success": False,
+ "error": str(e),
+ "metadata": context
+ }
+
+ async def _invoke_service_streaming(
  self,
- input_data: Union[str, bytes, Path, Dict[str, Any]],
+ input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
  task: str,
  service_type: str,
  model_hint: Optional[str] = None,
  provider_hint: Optional[str] = None,
- tools: Optional[List[Any]] = None,
+ output_format: Optional[str] = None,
+ json_schema: Optional[Dict] = None,
+ repair_attempts: Optional[int] = 3,
  **kwargs
  ) -> Dict[str, Any]:
- """Local invoke using AI Factory (original logic)"""
+ """Service invoke that returns streaming response with async generator"""
+
+ # Generate unique request ID for logging
+ request_id = generate_request_id()
+ start_time = datetime.now(timezone.utc)
+ execution_start_time = time.time()
+
  try:
  # Step 1: Select best model for this task
  selected_model = await self._select_model(
@@ -671,310 +1142,421 @@ class ISAModelClient:
  )

  # Step 2: Get appropriate service
- service = await self._get_service(
+ service, actual_model_used = await self._get_service(
  service_type=service_type,
  model_name=selected_model["model_id"],
  provider=selected_model["provider"],
  task=task,
- tools=tools
+ use_cache=False # Don't cache for streaming to avoid state issues
  )
+ # Update selected model with actual model used
+ selected_model["model_id"] = actual_model_used

- # Step 3: Execute task with unified interface
- result = await self._execute_task(
- service=service,
- input_data=input_data,
- task=task,
- service_type=service_type,
- **kwargs
- )
+ # Step 3: Handle tools for LLM services (bind tools if provided)
+ tools = kwargs.pop("tools", None)
+ if service_type == "text" and tools:
+ service, _ = await self._get_service(
+ service_type=service_type,
+ model_name=selected_model["model_id"],
+ provider=selected_model["provider"],
+ task=task,
+ use_cache=False
+ )
+ service = service.bind_tools(tools)
+
+ # Step 4: Ensure service supports streaming
+ if not hasattr(service, 'astream'):
+ raise ValueError(f"Service {selected_model['provider']}/{selected_model['model_id']} does not support streaming")

- # Step 4: Return unified response
+ # Step 5: Enable streaming on the service
+ if hasattr(service, 'streaming'):
+ service.streaming = True
+
+ # Step 6: Create async generator wrapper that yields tokens
+ async def stream_generator():
+ # Pass show_reasoning parameter if available for LLM services
+ if service_type == "text" and hasattr(service, 'astream'):
+ show_reasoning = kwargs.get('show_reasoning', False)
+ logger.debug(f"Stream generator: show_reasoning={show_reasoning}")
+ # Only pass show_reasoning to OpenAI providers
+ if 'show_reasoning' in kwargs and hasattr(service, 'provider_name') and service.provider_name == 'openai':
+ async for token in service.astream(input_data, show_reasoning=show_reasoning):
+ yield token
+ else:
+ async for token in service.astream(input_data):
+ yield token
+ else:
+ async for token in service.astream(input_data):
+ yield token
+
+ # Return response with stream generator and metadata
  return {
  "success": True,
- "result": result,
+ "stream": stream_generator(),
  "metadata": {
  "model_used": selected_model["model_id"],
  "provider": selected_model["provider"],
  "task": task,
  "service_type": service_type,
- "selection_reason": selected_model.get("reason", "Default selection")
+ "selection_reason": selected_model.get("reason", "Default selection"),
+ "streaming": True
  }
  }
  except Exception as e:
- logger.error(f"Local invoke failed: {e}")
+ logger.error(f"Streaming service invoke failed: {e}")
  raise
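For context, the streaming path above returns a dict whose "stream" key is the async generator built by stream_generator(). A short, hypothetical consumer sketch (editor's illustration, assuming an already-constructed ISAModelClient named client):

import asyncio

async def demo_stream(client):
    response = await client._invoke_service_streaming(
        input_data="Hello", task="chat", service_type="text")
    # metadata mirrors the dict assembled above, with "streaming": True
    assert response["success"] and response["metadata"]["streaming"]
    async for token in response["stream"]:
        print(token, end="", flush=True)

# asyncio.run(demo_stream(client))  # requires a configured client instance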
-
- async def _invoke_api(
+
+ async def _invoke_service(
  self,
- input_data: Union[str, bytes, Path, Dict[str, Any]],
+ input_data: Union[str, bytes, Path, Dict[str, Any], List[Any]],
  task: str,
  service_type: str,
  model_hint: Optional[str] = None,
  provider_hint: Optional[str] = None,
+ stream: Optional[bool] = None,
+ output_format: Optional[str] = None,
+ json_schema: Optional[Dict] = None,
+ repair_attempts: Optional[int] = 3,
  **kwargs
  ) -> Dict[str, Any]:
- """API invoke using HTTP requests"""
+ """Direct service invoke - passes LangChain objects and tools directly to services"""

- # Handle file inputs
- if isinstance(input_data, Path):
- return await self._invoke_api_file(
- file_path=input_data,
- task=task,
+ # Generate unique request ID for logging
+ request_id = generate_request_id()
+ start_time = datetime.now(timezone.utc)
+ execution_start_time = time.time()
+
+ try:
+ # Step 1: Select best model for this task
+ selected_model = await self._select_model(
+ input_data=input_data,
+ task=task,
  service_type=service_type,
  model_hint=model_hint,
- provider_hint=provider_hint,
- **kwargs
+ provider_hint=provider_hint
  )
-
- # Handle binary data
- if isinstance(input_data, bytes):
- return await self._invoke_api_binary(
- data=input_data,
+
+ # Step 1.5: Log inference start
+ self.inference_logger.log_inference_start(
+ request_id=request_id,
+ service_type=service_type,
  task=task,
+ provider=selected_model["provider"],
+ model_name=selected_model["model_id"],
+ input_data=input_data if self.inference_logger.log_detailed_requests else None,
+ is_streaming=stream or False,
+ custom_metadata={
+ "selection_reason": selected_model.get("reason", "Default selection"),
+ "has_tools": "tools" in kwargs
+ }
+ )
+
+ # Step 2: Get appropriate service
+ service, actual_model_used = await self._get_service(
  service_type=service_type,
- model_hint=model_hint,
- provider_hint=provider_hint,
- **kwargs
+ model_name=selected_model["model_id"],
+ provider=selected_model["provider"],
+ task=task
  )
-
- # Handle text/JSON data
- payload = {
- "input_data": input_data,
- "task": task,
- "service_type": service_type,
- "model_hint": model_hint,
- "provider_hint": provider_hint,
- "parameters": kwargs
- }
-
- async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
+ # Update selected model with actual model used
+ selected_model["model_id"] = actual_model_used
+
+ # Step 3: Handle tools for LLM services (bind tools if provided)
+ tools = kwargs.pop("tools", None)
+ if service_type == "text" and tools:
+ service, _ = await self._get_service(
+ service_type=service_type,
+ model_name=selected_model["model_id"],
+ provider=selected_model["provider"],
+ task=task,
+ use_cache=False
+ )
+ service = service.bind_tools(tools)
+ # Note: streaming is still supported with tools
+
+ # Step 4: Set streaming for text services
+ if service_type == "text" and stream is not None:
+ if hasattr(service, 'streaming'):
+ service.streaming = stream
+
+ # Step 5: Execute task with unified interface
+ # Pass JSON formatting parameters to the service
+ task_kwargs = kwargs.copy()
+ if service_type == "text":
+ if output_format:
+ task_kwargs["output_format"] = output_format
+ if json_schema:
+ task_kwargs["json_schema"] = json_schema
+ if repair_attempts is not None:
+ task_kwargs["repair_attempts"] = repair_attempts
+
+ # Try to execute with rate limit detection
  try:
- async with session.post(
- f"{self.api_url}/api/v1/invoke",
- json=payload,
- headers=self.headers
- ) as response:
-
- if response.status == 200:
- return await response.json()
- else:
- error_data = await response.text()
- raise Exception(f"API error {response.status}: {error_data}")
-
+ result = await self._execute_task(
+ service=service,
+ input_data=input_data,
+ task=task,
+ service_type=service_type,
+ **task_kwargs
+ )
  except Exception as e:
- logger.error(f"API invoke failed: {e}")
- raise
-
- async def _invoke_api_file(
- self,
- file_path: Path,
- task: str,
- service_type: str,
- model_hint: Optional[str] = None,
- provider_hint: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
- """API file upload"""
-
- if not file_path.exists():
- raise FileNotFoundError(f"File not found: {file_path}")
-
- data = aiohttp.FormData()
- data.add_field('task', task)
- data.add_field('service_type', service_type)
-
- if model_hint:
- data.add_field('model_hint', model_hint)
- if provider_hint:
- data.add_field('provider_hint', provider_hint)
-
- data.add_field('file',
- open(file_path, 'rb'),
- filename=file_path.name,
- content_type='application/octet-stream')
-
- headers = {k: v for k, v in self.headers.items() if k != "Content-Type"}
-
- async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
- try:
- async with session.post(
- f"{self.api_url}/api/v1/invoke-file",
- data=data,
- headers=headers
- ) as response:
+ # Check if this is a rate limit error and we can fallback
+ if self._is_rate_limit_error(e) and service_type == "text":
+ # Ensure model selector is initialized
+ if not self.model_selector:
+ self.model_selector = await get_model_selector(self.config)

- if response.status == 200:
- return await response.json()
- else:
- error_data = await response.text()
- raise Exception(f"API error {response.status}: {error_data}")
-
- except Exception as e:
- logger.error(f"API file upload failed: {e}")
- raise
-
- async def _invoke_api_binary(
- self,
- data: bytes,
- task: str,
- service_type: str,
- model_hint: Optional[str] = None,
- provider_hint: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
- """API binary upload"""
-
- form_data = aiohttp.FormData()
- form_data.add_field('task', task)
- form_data.add_field('service_type', service_type)
-
- if model_hint:
- form_data.add_field('model_hint', model_hint)
- if provider_hint:
- form_data.add_field('provider_hint', provider_hint)
-
- form_data.add_field('file',
- data,
- filename='data.bin',
- content_type='application/octet-stream')
-
- headers = {k: v for k, v in self.headers.items() if k != "Content-Type"}
-
- async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
- try:
- async with session.post(
- f"{self.api_url}/api/v1/invoke-file",
- data=form_data,
- headers=headers
- ) as response:
+ # Get fallback model selection
+ fallback_selection = self.model_selector.get_rate_limit_fallback(
+ service_type=service_type,
+ original_provider=selected_model["provider"]
+ )

- if response.status == 200:
- return await response.json()
- else:
- error_data = await response.text()
- raise Exception(f"API error {response.status}: {error_data}")
+ if fallback_selection.get('success'):
+ fallback_model = fallback_selection.get('selected_model', {})
+ logger.info(f"Rate limit hit, switching to fallback: {fallback_model}")

- except Exception as e:
- logger.error(f"API binary upload failed: {e}")
- raise
-
- async def _stream_local(
- self,
- input_data: Union[str, bytes, Path, Dict[str, Any]],
- task: str,
- service_type: str,
- model_hint: Optional[str] = None,
- provider_hint: Optional[str] = None,
- tools: Optional[List[Any]] = None,
- **kwargs
- ):
- """Local streaming using AI Factory"""
- # Step 1: Select best model for this task
- selected_model = await self._select_model(
- input_data=input_data,
- task=task,
- service_type=service_type,
- model_hint=model_hint,
- provider_hint=provider_hint
- )
-
- # Step 2: Get appropriate service
- service = await self._get_service(
- service_type=service_type,
- model_name=selected_model["model_id"],
- provider=selected_model["provider"],
- task=task,
- tools=tools
- )
-
- # Step 3: Yield tokens from the stream
- async for token in service.astream(input_data):
- yield token
-
- async def _stream_api(
- self,
- input_data: Union[str, bytes, Path, Dict[str, Any]],
- task: str,
- service_type: str,
- model_hint: Optional[str] = None,
- provider_hint: Optional[str] = None,
- **kwargs
- ):
- """API streaming using Server-Sent Events (SSE)"""
-
- # Only support text streaming for now
- if not isinstance(input_data, (str, dict)):
- raise ValueError("API streaming only supports text input")
-
- payload = {
- "input_data": input_data,
- "task": task,
- "service_type": service_type,
- "model_hint": model_hint,
- "provider_hint": provider_hint,
- "stream": True,
- "parameters": kwargs
- }
-
- async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=300)) as session:
- try:
- async with session.post(
- f"{self.api_url}/api/v1/stream",
- json=payload,
- headers=self.headers
- ) as response:
-
- if response.status == 200:
- # Parse SSE stream
- async for line in response.content:
- if line:
- line_str = line.decode().strip()
- if line_str.startswith("data: "):
- try:
- # Parse SSE data
- import json
- json_str = line_str[6:] # Remove "data: " prefix
- data = json.loads(json_str)
-
- if data.get("type") == "token" and "token" in data:
- yield data["token"]
- elif data.get("type") == "completion":
- # End of stream
- break
- elif data.get("type") == "error":
- raise Exception(f"Server error: {data.get('error')}")
-
- except json.JSONDecodeError:
- # Skip malformed lines
- continue
- else:
- error_data = await response.text()
- raise Exception(f"API streaming error {response.status}: {error_data}")
+ # Get fallback service
+ fallback_service, fallback_model_used = await self._get_service(
+ service_type=service_type,
+ model_name=fallback_model["model_id"],
+ provider=fallback_model["provider"],
+ task=task
+ )

- except Exception as e:
- logger.error(f"API streaming failed: {e}")
- raise
+ # Update selected model for metadata
+ selected_model = fallback_model
+ selected_model["model_id"] = fallback_model_used
+ selected_model["reason"] = "Rate limit fallback"
+
+ # Retry with fallback service
+ result = await self._execute_task(
+ service=fallback_service,
+ input_data=input_data,
+ task=task,
+ service_type=service_type,
+ **task_kwargs
+ )
+ else:
+ # No fallback available, re-raise original error
+ raise
+ else:
+ # Not a rate limit error or no fallback, re-raise
+ raise
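_is_rate_limit_error() is called in the fallback logic above but its body is not part of this hunk; a hypothetical string-based check with the same role might look like the sketch below (editor's illustration, not the package's implementation):

def _is_rate_limit_error(self, e: Exception) -> bool:
    # Hypothetical heuristic: treat HTTP 429 / quota wording as a rate limit hit
    text = str(e).lower()
    return "rate limit" in text or "429" in text or "too many requests" in text or "quota" in text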
+
+ # Step 6: Wait for billing tracking to complete, then get billing information
+ await asyncio.sleep(0.01) # Small delay to ensure billing tracking completes
+ billing_info = self._get_billing_info(service, selected_model["model_id"])
+
+ # Step 6.5: Calculate execution time and log completion
+ execution_time_ms = int((time.time() - execution_start_time) * 1000)
+
+ # Log inference completion
+ self.inference_logger.log_inference_complete(
+ request_id=request_id,
+ status="completed",
+ execution_time_ms=execution_time_ms,
+ input_tokens=billing_info.get("input_tokens"),
+ output_tokens=billing_info.get("output_tokens"),
+ estimated_cost_usd=billing_info.get("cost_usd"),
+ output_data=result if self.inference_logger.log_detailed_requests else None,
+ custom_metadata={
+ "billing_operation": billing_info.get("operation"),
+ "timestamp": billing_info.get("timestamp")
+ }
+ )
+
+ # Log detailed token usage if available
+ if billing_info.get("input_tokens") and billing_info.get("output_tokens"):
+ self.inference_logger.log_token_usage(
+ request_id=request_id,
+ provider=selected_model["provider"],
+ model_name=selected_model["model_id"],
+ prompt_tokens=billing_info.get("input_tokens"),
+ completion_tokens=billing_info.get("output_tokens"),
+ prompt_cost_usd=billing_info.get("cost_usd", 0) * 0.6 if billing_info.get("cost_usd") else None, # Rough estimate
+ completion_cost_usd=billing_info.get("cost_usd", 0) * 0.4 if billing_info.get("cost_usd") else None
+ )
+
+ # Handle formatting - check if result is already formatted
+ formatted_result = result
+ if service_type == "text" and output_format:
+ # Check if result is already formatted by the service
+ if isinstance(result, dict) and result.get("formatted"):
+ # Result is already formatted by the service
+ formatted_result = result.get("result", result)
+ billing_info["formatting"] = {
+ "output_format": output_format,
+ "format_success": True,
+ "format_method": "service_level",
+ "format_errors": result.get("format_errors", []),
+ "repaired": False,
+ "pre_formatted": True
+ }
+ else:
+ # Apply formatting at client level (fallback)
+ try:
+ service, _ = await self._get_service(
+ service_type=service_type,
+ model_name=selected_model["model_id"],
+ provider=selected_model["provider"],
+ task=task
+ )
+ if hasattr(service, 'format_structured_output'):
+ formatting_result = service.format_structured_output(
+ response=result,
+ output_format=output_format,
+ schema=json_schema,
+ repair_attempts=repair_attempts or 3
+ )
+ # Update result and add formatting metadata
+ if formatting_result.get("success") and formatting_result.get("data") is not None:
+ # Extract the actual formatted data
+ formatted_data = formatting_result["data"]
+
+ # For JSON output, ensure we return clean data
+ if output_format == "json" and isinstance(formatted_data, dict):
+ formatted_result = formatted_data
+ else:
+ formatted_result = formatted_data
+ else:
+ # Keep original result if formatting failed
+ formatted_result = result
+
+ # Add formatting info to metadata
+ billing_info["formatting"] = {
+ "output_format": output_format,
+ "format_success": formatting_result.get("success", False),
+ "format_method": formatting_result.get("method"),
+ "format_errors": formatting_result.get("errors", []),
+ "repaired": formatting_result.get("repaired", False),
+ "pre_formatted": False
+ }
+
+ except Exception as format_error:
+ logger.warning(f"Failed to apply output formatting: {format_error}")
+ # Continue with unformatted result
+ formatted_result = result
+ billing_info["formatting"] = {
+ "output_format": output_format,
+ "format_success": False,
+ "format_error": str(format_error)
+ }
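To illustrate how the output_format / json_schema / repair_attempts parameters handled above surface in a response, here is a hypothetical call sketched by the editor (the prompt and schema are invented; client is an already-constructed ISAModelClient):

async def structured_output_example(client):
    response = await client._invoke_service(
        input_data="Summarize the release notes as JSON",
        task="chat",
        service_type="text",
        output_format="json",
        json_schema={"type": "object", "properties": {"summary": {"type": "string"}}},
    )
    data = response["result"]  # parsed dict when formatting succeeded
    formatting = response["metadata"]["billing"]["formatting"]  # format_success, format_method, ...
    return data, formatting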
+
+ # Return unified response
+ response = {
+ "success": True,
+ "result": formatted_result,
+ "metadata": {
+ "request_id": request_id, # Include request ID for tracking
+ "model_used": selected_model["model_id"],
+ "provider": selected_model["provider"],
+ "task": task,
+ "service_type": service_type,
+ "selection_reason": selected_model.get("reason", "Default selection"),
+ "execution_time_ms": execution_time_ms,
+ "billing": billing_info
+ }
+ }
+
+ return response
+ except Exception as e:
+ # Calculate execution time even for errors
+ execution_time_ms = int((time.time() - execution_start_time) * 1000)
+
+ # Log inference error
+ error_type = type(e).__name__
+ error_message = str(e)
+
+ self.inference_logger.log_inference_complete(
+ request_id=request_id,
+ status="failed",
+ execution_time_ms=execution_time_ms,
+ error_message=error_message,
+ error_code=error_type,
+ custom_metadata={
+ "error_location": "client._invoke_service"
+ }
+ )
+
+ # Also log to the error table
+ self.inference_logger.log_error(
+ request_id=request_id,
+ error_type=error_type,
+ error_message=error_message,
+ provider=model_hint or "unknown",
+ model_name=provider_hint or "unknown"
+ )
+
+ logger.error(f"Service invoke failed: {e}")
+ raise
+
+ def _get_billing_info(self, service: Any, model_id: str) -> Dict[str, Any]:
+ """Extract billing information from service after task execution"""
+ try:
+ # Check if service has model_manager with billing_tracker
+ if hasattr(service, 'model_manager') and hasattr(service.model_manager, 'billing_tracker'):
+ billing_tracker = service.model_manager.billing_tracker
+
+ # Get the latest usage record for this model
+ model_records = [
+ record for record in billing_tracker.usage_records
+ if record.model_id == model_id
+ ]
+
+ if model_records:
+ # Get the most recent record
+ latest_record = max(model_records, key=lambda r: r.timestamp)
+
+ return {
+ "cost_usd": latest_record.cost_usd,
+ "input_tokens": latest_record.input_tokens,
+ "output_tokens": latest_record.output_tokens,
+ "total_tokens": latest_record.total_tokens,
+ "operation": latest_record.operation,
+ "timestamp": latest_record.timestamp,
+ "currency": "USD"
+ }
+
+ # Fallback: no billing info available
+ return {
+ "cost_usd": 0.0,
+ "input_tokens": None,
+ "output_tokens": None,
+ "total_tokens": None,
+ "operation": None,
+ "timestamp": None,
+ "currency": "USD",
+ "note": "Billing information not available"
+ }
+
+ except Exception as e:
+ logger.warning(f"Failed to get billing info: {e}")
+ return {
+ "cost_usd": 0.0,
+ "error": str(e),
+ "currency": "USD"
+ }
+
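The billing dict assembled by _get_billing_info() above ends up under response["metadata"]["billing"]; a small, hypothetical helper for reading it back out of a completed response (editor's illustration, not part of the package):

def summarize_cost(response):
    billing = response["metadata"]["billing"]
    tokens = (billing.get("input_tokens") or 0) + (billing.get("output_tokens") or 0)
    return f"{tokens} tokens, {billing.get('cost_usd', 0.0):.6f} {billing.get('currency', 'USD')}"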


  # Convenience function for quick access
  def create_client(
  config: Optional[Dict[str, Any]] = None,
- mode: str = "local",
- api_url: Optional[str] = None,
+ service_endpoint: Optional[str] = None,
  api_key: Optional[str] = None
  ) -> ISAModelClient:
  """Create ISA Model Client instance

  Args:
  config: Optional configuration
- mode: "local" for direct AI Factory, "api" for HTTP API calls
- api_url: API base URL (required if mode="api")
- api_key: API key for authentication (optional)
+ service_endpoint: Optional service endpoint URL (if None, uses local AI Factory)
+ api_key: Optional API key for authentication (can also be set via ISA_API_KEY env var)

  Returns:
  ISAModelClient instance
  """
- return ISAModelClient(config=config, mode=mode, api_url=api_url, api_key=api_key)
+ return ISAModelClient(config=config, service_endpoint=service_endpoint, api_key=api_key)
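The new create_client() signature above drops the mode/api_url pair in favor of a single optional service_endpoint; a hypothetical usage sketch by the editor (the endpoint URL and key are placeholders, and the key can also come from the ISA_API_KEY environment variable per the docstring):

local_client = create_client()  # no endpoint: direct local AI Factory
remote_client = create_client(
    service_endpoint="http://localhost:8000",  # placeholder URL
    api_key="sk-example",                      # placeholder key
)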


  # Export for easy import