isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/client.py CHANGED
@@ -72,10 +72,14 @@ print(result["result"])
  
  import logging
  import asyncio
+ import time
+ import uuid
  from typing import Any, Dict, Optional, List, Union
  from pathlib import Path
+ from datetime import datetime, timezone
  
  from isa_model.inference.ai_factory import AIFactory
+ from isa_model.core.logging import get_inference_logger, generate_request_id
  
  try:
  from isa_model.core.services.intelligent_model_selector import IntelligentModelSelector, get_model_selector
@@ -213,6 +217,9 @@ class ISAModelClient:
  # Cache for frequently used services
  self._service_cache: Dict[str, Any] = {}
  
+ # Initialize inference logger
+ self.inference_logger = get_inference_logger()
+
  logger.info("ISA Model Client initialized")
  
  async def _get_http_session(self):
@@ -327,6 +334,9 @@ class ISAModelClient:
  provider: Optional[str] = None,
  stream: Optional[bool] = None,
  show_reasoning: Optional[bool] = False,
+ output_format: Optional[str] = None,
+ json_schema: Optional[Dict] = None,
+ repair_attempts: Optional[int] = 3,
  **kwargs
  ) -> Dict[str, Any]:
  """
@@ -409,6 +419,9 @@ class ISAModelClient:
  model_hint=model,
  provider_hint=provider,
  show_reasoning=show_reasoning, # Explicitly pass show_reasoning
+ output_format=output_format,
+ json_schema=json_schema,
+ repair_attempts=repair_attempts,
  **kwargs
  )
  else:
@@ -420,6 +433,9 @@ class ISAModelClient:
  model_hint=model,
  provider_hint=provider,
  stream=False, # Force non-streaming
+ output_format=output_format,
+ json_schema=json_schema,
+ repair_attempts=repair_attempts,
  **kwargs
  )
  
@@ -488,7 +504,7 @@ class ISAModelClient:
  )
  
  # Step 2: Get appropriate service
- service = await self._get_service(
+ service, _ = await self._get_service(
  service_type=service_type,
  model_name=selected_model["model_id"],
  provider=selected_model["provider"],
@@ -508,7 +524,9 @@ class ISAModelClient:
  content_chunks = []
  async for token in service.astream(input_data):
  content_chunks.append(token)
- yield token
+ # Only yield string tokens for streaming (filter out dict/objects)
+ if isinstance(token, str):
+ yield token
  
  # Step 6: After streaming is complete, calculate billing info and optionally return metadata
  try:
@@ -533,7 +551,7 @@ class ISAModelClient:
  "billing": billing_info,
  "streaming": True,
  "tokens_streamed": len(content_chunks),
- "content_length": len("".join(content_chunks))
+ "content_length": len("".join(str(chunk) if isinstance(chunk, str) else "" for chunk in content_chunks))
  }
  yield ('metadata', metadata)
  
@@ -554,7 +572,7 @@ class ISAModelClient:
  },
  "streaming": True,
  "tokens_streamed": len(content_chunks),
- "content_length": len("".join(content_chunks))
+ "content_length": len("".join(str(chunk) if isinstance(chunk, str) else "" for chunk in content_chunks))
  }
  yield ('metadata', fallback_metadata)
  
@@ -562,6 +580,111 @@ class ISAModelClient:
  logger.error(f"Streaming invoke failed: {e}")
  raise
  
+ def _is_rate_limit_error(self, error: Exception) -> bool:
+ """Check if an error is due to rate limiting"""
+ error_str = str(error).lower()
+
+ # Check for common rate limit indicators
+ rate_limit_indicators = [
+ 'rate limit',
+ 'rate_limit',
+ 'ratelimit',
+ 'too many requests',
+ 'quota exceeded',
+ 'limit exceeded',
+ 'throttled',
+ '429'
+ ]
+
+ return any(indicator in error_str for indicator in rate_limit_indicators)
+
+ async def _invoke_with_fallback(
+ self,
+ service_type: str,
+ task: str,
+ input_data: Any,
+ selected_model: Dict[str, Any],
+ **kwargs
+ ) -> Any:
+ """Invoke service with automatic fallback on rate limit"""
+ try:
+ # First attempt with selected model
+ return await self._invoke_service_direct(service_type, task, input_data, selected_model, **kwargs)
+ except Exception as e:
+ # Check if this is a rate limit error
+ if self._is_rate_limit_error(e):
+ logger.warning(f"Rate limit detected for {selected_model['provider']}: {e}")
+
+ # Try to get fallback model using intelligent model selector
+ if INTELLIGENT_SELECTOR_AVAILABLE and self.model_selector:
+ try:
+ fallback_selection = self.model_selector.get_rate_limit_fallback(
+ service_type,
+ selected_model['provider']
+ )
+
+ if fallback_selection.get('success') and fallback_selection.get('is_fallback'):
+ fallback_model = fallback_selection['selected_model']
+ logger.info(f"Switching to fallback: {fallback_model['provider']}/{fallback_model['model_id']}")
+
+ # Retry with fallback model
+ return await self._invoke_service_direct(service_type, task, input_data, fallback_model, **kwargs)
+ except Exception as fallback_error:
+ logger.error(f"Fallback also failed: {fallback_error}")
+ raise e # Raise original rate limit error
+
+ # Re-raise the original error if not rate limit or fallback failed
+ raise
+
+ async def _invoke_service_direct(
+ self,
+ service_type: str,
+ task: str,
+ input_data: Any,
+ model_config: Dict[str, Any],
+ **kwargs
+ ) -> Any:
+ """Direct service invocation without fallback logic"""
+ # Get appropriate service
+ factory = AIFactory.get_instance()
+
+ # Create service with the specified model
+ if service_type == "text":
+ service = factory.get_llm(model_config["model_id"], model_config["provider"])
+ elif service_type == "vision":
+ service = factory.get_vision(model_config["model_id"], model_config["provider"])
+ elif service_type == "audio":
+ service = factory.get_audio(model_config["model_id"], model_config["provider"])
+ elif service_type == "image":
+ service = factory.get_image(model_config["model_id"], model_config["provider"])
+ elif service_type == "embedding":
+ service = factory.get_embed(model_config["model_id"], model_config["provider"])
+ else:
+ raise ValueError(f"Unsupported service type: {service_type}")
+
+ # Invoke the service
+ if service_type == "text":
+ show_reasoning = kwargs.pop('show_reasoning', False)
+
+ # Check if service supports show_reasoning parameter (mainly OpenAI services)
+ if model_config["provider"] == "openai":
+ result = await service.invoke(
+ input_data=input_data,
+ task=task,
+ show_reasoning=show_reasoning,
+ **kwargs
+ )
+ else:
+ # For other providers like yyds, don't pass show_reasoning
+ result = await service.invoke(
+ input_data=input_data,
+ task=task,
+ **kwargs
+ )
+ return result
+ else:
+ return await service.invoke(input_data=input_data, task=task, **kwargs)
+
  async def _select_model(
  self,
  input_data: Any,
@@ -661,6 +784,7 @@ class ISAModelClient:
  "audio": {
  "tts": {"model_id": "tts-1", "provider": "openai"},
  "stt": {"model_id": "whisper-1", "provider": "openai"},
+ "realtime": {"model_id": "gpt-4o-realtime-preview-2024-10-01", "provider": "openai"},
  "default": {"model_id": "whisper-1", "provider": "openai"}
  },
  "text": {
@@ -680,9 +804,14 @@ class ISAModelClient:
  
  # Handle audio service type with task-specific models
  if service_type == "audio":
- if "speech" in task or "tts" in task:
+ # Realtime audio tasks
+ if any(realtime_task in task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+ default = defaults["audio"]["realtime"]
+ # Traditional TTS tasks
+ elif "speech" in task or "tts" in task or task in ["synthesize", "text_to_speech", "generate_speech"]:
  default = defaults["audio"]["tts"]
- elif "transcribe" in task or "stt" in task:
+ # Traditional STT tasks
+ elif "transcribe" in task or "stt" in task or task in ["speech_to_text", "transcription"]:
  default = defaults["audio"]["stt"]
  else:
  default = defaults["audio"]["default"]
@@ -714,14 +843,16 @@ class ISAModelClient:
  provider: str,
  task: str,
  use_cache: bool = True
- ) -> Any:
- """Get appropriate service instance"""
+ ) -> tuple[Any, str]:
+ """Get appropriate service instance and return actual model used"""
  
- cache_key = f"{service_type}_{provider}_{model_name}"
+ cache_key = f"{service_type}_{provider}_{model_name}_{task}"
+ actual_model_used = model_name # Track the actual model used
  
  # Check cache first (if caching is enabled)
  if use_cache and cache_key in self._service_cache:
- return self._service_cache[cache_key]
+ cached_service, cached_model = self._service_cache[cache_key]
+ return cached_service, cached_model
  
  try:
  # Validate service type
@@ -730,24 +861,46 @@ class ISAModelClient:
  # Route to appropriate AIFactory method
  if service_type == "vision":
  service = self.ai_factory.get_vision(model_name, provider)
+ actual_model_used = model_name
  elif service_type == "audio":
- if "speech" in task or "tts" in task:
- service = self.ai_factory.get_tts(model_name, provider)
- elif "transcribe" in task or "stt" in task:
- service = self.ai_factory.get_stt(model_name, provider)
+ # Realtime audio tasks
+ if any(realtime_task in task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+ # Use realtime model
+ realtime_model = "gpt-4o-realtime-preview-2024-10-01" if model_name == "tts-1" or model_name == "whisper-1" else model_name
+ service = self.ai_factory.get_realtime(realtime_model, provider)
+ actual_model_used = realtime_model
+ # Traditional TTS tasks
+ elif "speech" in task or "tts" in task or task in ["synthesize", "text_to_speech", "generate_speech"]:
+ # Use TTS model
+ tts_model = "tts-1" if model_name == "whisper-1" else model_name
+ service = self.ai_factory.get_tts(tts_model, provider)
+ actual_model_used = tts_model
+ # Traditional STT tasks
+ elif "transcribe" in task or "stt" in task or task in ["speech_to_text", "transcription"]:
+ # Use STT model
+ stt_model = "whisper-1" if model_name == "tts-1" else model_name
+ service = self.ai_factory.get_stt(stt_model, provider)
+ actual_model_used = stt_model
+ # Default to STT for backward compatibility
  else:
- service = self.ai_factory.get_stt(model_name, provider)
+ # Use STT model by default
+ stt_model = "whisper-1" if model_name == "tts-1" else model_name
+ service = self.ai_factory.get_stt(stt_model, provider)
+ actual_model_used = stt_model
  elif service_type == "text":
  service = self.ai_factory.get_llm(model_name, provider)
+ actual_model_used = model_name
  elif service_type == "image":
  service = self.ai_factory.get_img("t2i", model_name, provider)
+ actual_model_used = model_name
  elif service_type == "embedding":
  service = self.ai_factory.get_embed(model_name, provider)
+ actual_model_used = model_name
  
- # Cache the service (if caching is enabled)
+ # Cache the service and actual model (if caching is enabled)
  if use_cache:
- self._service_cache[cache_key] = service
- return service
+ self._service_cache[cache_key] = (service, actual_model_used)
+ return service, actual_model_used
  
  except Exception as e:
  logger.error(f"Failed to get service {service_type}/{provider}/{model_name}: {e}")
@@ -785,12 +938,26 @@ class ISAModelClient:
  )
  
  elif service_type == "audio":
- if unified_task in ["synthesize", "text_to_speech", "tts"]:
+ # Realtime audio tasks
+ if any(realtime_task in unified_task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+ # For realtime text_chat and audio_chat, pass text parameter
+ if unified_task in ["text_chat", "audio_chat"]:
+ if isinstance(input_data, str):
+ kwargs['text'] = input_data
+ elif isinstance(input_data, bytes):
+ kwargs['audio_data'] = input_data
+ return await service.invoke(
+ task=unified_task,
+ **kwargs
+ )
+ # Traditional TTS tasks
+ elif unified_task in ["synthesize", "text_to_speech", "tts", "generate_speech"]:
  return await service.invoke(
  text=input_data,
  task=unified_task,
  **kwargs
  )
+ # Traditional STT tasks
  else:
  return await service.invoke(
  audio_input=input_data,
  task=unified_task,
@@ -801,22 +968,58 @@ class ISAModelClient:
  elif service_type == "text":
  # Extract show_reasoning from kwargs if present
  show_reasoning = kwargs.pop('show_reasoning', False)
- result = await service.invoke(
- input_data=input_data,
- task=unified_task,
- show_reasoning=show_reasoning,
- **kwargs
- )
+
+ # Check if service provider supports show_reasoning
+ # Only OpenAI services support this parameter
+ if hasattr(service, 'provider_name') and service.provider_name == 'openai':
+ result = await service.invoke(
+ input_data=input_data,
+ task=unified_task,
+ show_reasoning=show_reasoning,
+ **kwargs
+ )
+ else:
+ # For other providers like yyds, don't pass show_reasoning
+ result = await service.invoke(
+ input_data=input_data,
+ task=unified_task,
+ **kwargs
+ )
  
  logger.debug(f"Service result type: {type(result)}")
  logger.debug(f"Service result: {result}")
  
- if isinstance(result, dict) and 'message' in result:
+ # Check if this is a formatted result from invoke method
+ if isinstance(result, dict) and 'formatted' in result:
+ # This is a formatted result from the new invoke method
+ logger.debug(f"Returning formatted result: {result}")
+ return result
+ elif isinstance(result, dict) and 'message' in result:
+ # This is a traditional message result
  message = result['message']
  logger.debug(f"Extracted message type: {type(message)}")
- logger.debug(f"Extracted message: {message}")
- return message
+ logger.debug(f"Extracted message length: {len(str(message)) if message else 0}")
+
+ # Handle AIMessage objects from LangChain
+ if hasattr(message, 'content'):
+ # Check if there are tool_calls
+ if hasattr(message, 'tool_calls') and message.tool_calls:
+ logger.debug(f"AIMessage contains tool_calls: {len(message.tool_calls)}")
+ # Return a dict with both content and tool_calls
+ return {
+ "content": message.content if message.content else "",
+ "tool_calls": message.tool_calls
+ }
+ else:
+ content = message.content
+ logger.debug(f"Extracted content from AIMessage: {len(content) if content else 0} chars")
+ return content
+ else:
+ # Direct string message
+ logger.debug(f"Returning direct message: {len(str(message)) if message else 0} chars")
+ return message
  else:
+ logger.debug(f"Returning result directly: {result}")
  return result
  
  elif service_type == "image":
@@ -886,7 +1089,7 @@ class ISAModelClient:
  
  for service_type, provider, model in test_services:
  try:
- await self._get_service(service_type, model, provider, "test")
+ service, _ = await self._get_service(service_type, model, provider, "test")
  health_status["services"][f"{service_type}_{provider}"] = "healthy"
  except Exception as e:
  health_status["services"][f"{service_type}_{provider}"] = f"error: {str(e)}"
@@ -916,9 +1119,18 @@ class ISAModelClient:
  service_type: str,
  model_hint: Optional[str] = None,
  provider_hint: Optional[str] = None,
+ output_format: Optional[str] = None,
+ json_schema: Optional[Dict] = None,
+ repair_attempts: Optional[int] = 3,
  **kwargs
  ) -> Dict[str, Any]:
  """Service invoke that returns streaming response with async generator"""
+
+ # Generate unique request ID for logging
+ request_id = generate_request_id()
+ start_time = datetime.now(timezone.utc)
+ execution_start_time = time.time()
+
  try:
  # Step 1: Select best model for this task
  selected_model = await self._select_model(
@@ -930,18 +1142,20 @@ class ISAModelClient:
  )
  
  # Step 2: Get appropriate service
- service = await self._get_service(
+ service, actual_model_used = await self._get_service(
  service_type=service_type,
  model_name=selected_model["model_id"],
  provider=selected_model["provider"],
  task=task,
  use_cache=False # Don't cache for streaming to avoid state issues
  )
+ # Update selected model with actual model used
+ selected_model["model_id"] = actual_model_used
  
  # Step 3: Handle tools for LLM services (bind tools if provided)
  tools = kwargs.pop("tools", None)
  if service_type == "text" and tools:
- service = await self._get_service(
+ service, _ = await self._get_service(
  service_type=service_type,
  model_name=selected_model["model_id"],
  provider=selected_model["provider"],
@@ -964,7 +1178,8 @@ class ISAModelClient:
  if service_type == "text" and hasattr(service, 'astream'):
  show_reasoning = kwargs.get('show_reasoning', False)
  logger.debug(f"Stream generator: show_reasoning={show_reasoning}")
- if 'show_reasoning' in kwargs:
+ # Only pass show_reasoning to OpenAI providers
+ if 'show_reasoning' in kwargs and hasattr(service, 'provider_name') and service.provider_name == 'openai':
  async for token in service.astream(input_data, show_reasoning=show_reasoning):
  yield token
  else:
@@ -999,9 +1214,18 @@ class ISAModelClient:
  model_hint: Optional[str] = None,
  provider_hint: Optional[str] = None,
  stream: Optional[bool] = None,
+ output_format: Optional[str] = None,
+ json_schema: Optional[Dict] = None,
+ repair_attempts: Optional[int] = 3,
  **kwargs
  ) -> Dict[str, Any]:
  """Direct service invoke - passes LangChain objects and tools directly to services"""
+
+ # Generate unique request ID for logging
+ request_id = generate_request_id()
+ start_time = datetime.now(timezone.utc)
+ execution_start_time = time.time()
+
  try:
  # Step 1: Select best model for this task
  selected_model = await self._select_model(
@@ -1012,18 +1236,35 @@ class ISAModelClient:
  provider_hint=provider_hint
  )
  
+ # Step 1.5: Log inference start
+ self.inference_logger.log_inference_start(
+ request_id=request_id,
+ service_type=service_type,
+ task=task,
+ provider=selected_model["provider"],
+ model_name=selected_model["model_id"],
+ input_data=input_data if self.inference_logger.log_detailed_requests else None,
+ is_streaming=stream or False,
+ custom_metadata={
+ "selection_reason": selected_model.get("reason", "Default selection"),
+ "has_tools": "tools" in kwargs
+ }
+ )
+
  # Step 2: Get appropriate service
- service = await self._get_service(
+ service, actual_model_used = await self._get_service(
  service_type=service_type,
  model_name=selected_model["model_id"],
  provider=selected_model["provider"],
  task=task
  )
+ # Update selected model with actual model used
+ selected_model["model_id"] = actual_model_used
  
  # Step 3: Handle tools for LLM services (bind tools if provided)
  tools = kwargs.pop("tools", None)
  if service_type == "text" and tools:
- service = await self._get_service(
+ service, _ = await self._get_service(
  service_type=service_type,
  model_name=selected_model["model_id"],
  provider=selected_model["provider"],
@@ -1039,32 +1280,214 @@ class ISAModelClient:
  service.streaming = stream
  
  # Step 5: Execute task with unified interface
- result = await self._execute_task(
- service=service,
- input_data=input_data,
- task=task,
- service_type=service_type,
- **kwargs
- )
+ # Pass JSON formatting parameters to the service
+ task_kwargs = kwargs.copy()
+ if service_type == "text":
+ if output_format:
+ task_kwargs["output_format"] = output_format
+ if json_schema:
+ task_kwargs["json_schema"] = json_schema
+ if repair_attempts is not None:
+ task_kwargs["repair_attempts"] = repair_attempts
+
+ # Try to execute with rate limit detection
+ try:
+ result = await self._execute_task(
+ service=service,
+ input_data=input_data,
+ task=task,
+ service_type=service_type,
+ **task_kwargs
+ )
+ except Exception as e:
+ # Check if this is a rate limit error and we can fallback
+ if self._is_rate_limit_error(e) and service_type == "text":
+ # Ensure model selector is initialized
+ if not self.model_selector:
+ self.model_selector = await get_model_selector(self.config)
+
+ # Get fallback model selection
+ fallback_selection = self.model_selector.get_rate_limit_fallback(
+ service_type=service_type,
+ original_provider=selected_model["provider"]
+ )
+
+ if fallback_selection.get('success'):
+ fallback_model = fallback_selection.get('selected_model', {})
+ logger.info(f"Rate limit hit, switching to fallback: {fallback_model}")
+
+ # Get fallback service
+ fallback_service, fallback_model_used = await self._get_service(
+ service_type=service_type,
+ model_name=fallback_model["model_id"],
+ provider=fallback_model["provider"],
+ task=task
+ )
+
+ # Update selected model for metadata
+ selected_model = fallback_model
+ selected_model["model_id"] = fallback_model_used
+ selected_model["reason"] = "Rate limit fallback"
+
+ # Retry with fallback service
+ result = await self._execute_task(
+ service=fallback_service,
+ input_data=input_data,
+ task=task,
+ service_type=service_type,
+ **task_kwargs
+ )
+ else:
+ # No fallback available, re-raise original error
+ raise
+ else:
+ # Not a rate limit error or no fallback, re-raise
+ raise
  
  # Step 6: Wait for billing tracking to complete, then get billing information
  await asyncio.sleep(0.01) # Small delay to ensure billing tracking completes
  billing_info = self._get_billing_info(service, selected_model["model_id"])
  
+ # Step 6.5: Calculate execution time and log completion
+ execution_time_ms = int((time.time() - execution_start_time) * 1000)
+
+ # Log inference completion
+ self.inference_logger.log_inference_complete(
+ request_id=request_id,
+ status="completed",
+ execution_time_ms=execution_time_ms,
+ input_tokens=billing_info.get("input_tokens"),
+ output_tokens=billing_info.get("output_tokens"),
+ estimated_cost_usd=billing_info.get("cost_usd"),
+ output_data=result if self.inference_logger.log_detailed_requests else None,
+ custom_metadata={
+ "billing_operation": billing_info.get("operation"),
+ "timestamp": billing_info.get("timestamp")
+ }
+ )
+
+ # Log detailed token usage if available
+ if billing_info.get("input_tokens") and billing_info.get("output_tokens"):
+ self.inference_logger.log_token_usage(
+ request_id=request_id,
+ provider=selected_model["provider"],
+ model_name=selected_model["model_id"],
+ prompt_tokens=billing_info.get("input_tokens"),
+ completion_tokens=billing_info.get("output_tokens"),
+ prompt_cost_usd=billing_info.get("cost_usd", 0) * 0.6 if billing_info.get("cost_usd") else None, # Rough estimate
+ completion_cost_usd=billing_info.get("cost_usd", 0) * 0.4 if billing_info.get("cost_usd") else None
+ )
+
+ # Handle formatting - check if result is already formatted
+ formatted_result = result
+ if service_type == "text" and output_format:
+ # Check if result is already formatted by the service
+ if isinstance(result, dict) and result.get("formatted"):
+ # Result is already formatted by the service
+ formatted_result = result.get("result", result)
+ billing_info["formatting"] = {
+ "output_format": output_format,
+ "format_success": True,
+ "format_method": "service_level",
+ "format_errors": result.get("format_errors", []),
+ "repaired": False,
+ "pre_formatted": True
+ }
+ else:
+ # Apply formatting at client level (fallback)
+ try:
+ service, _ = await self._get_service(
+ service_type=service_type,
+ model_name=selected_model["model_id"],
+ provider=selected_model["provider"],
+ task=task
+ )
+ if hasattr(service, 'format_structured_output'):
+ formatting_result = service.format_structured_output(
+ response=result,
+ output_format=output_format,
+ schema=json_schema,
+ repair_attempts=repair_attempts or 3
+ )
+ # Update result and add formatting metadata
+ if formatting_result.get("success") and formatting_result.get("data") is not None:
+ # Extract the actual formatted data
+ formatted_data = formatting_result["data"]
+
+ # For JSON output, ensure we return clean data
+ if output_format == "json" and isinstance(formatted_data, dict):
+ formatted_result = formatted_data
+ else:
+ formatted_result = formatted_data
+ else:
+ # Keep original result if formatting failed
+ formatted_result = result
+
+ # Add formatting info to metadata
+ billing_info["formatting"] = {
+ "output_format": output_format,
+ "format_success": formatting_result.get("success", False),
+ "format_method": formatting_result.get("method"),
+ "format_errors": formatting_result.get("errors", []),
+ "repaired": formatting_result.get("repaired", False),
+ "pre_formatted": False
+ }
+
+ except Exception as format_error:
+ logger.warning(f"Failed to apply output formatting: {format_error}")
+ # Continue with unformatted result
+ formatted_result = result
+ billing_info["formatting"] = {
+ "output_format": output_format,
+ "format_success": False,
+ "format_error": str(format_error)
+ }
+
  # Return unified response
- return {
+ response = {
  "success": True,
- "result": result,
+ "result": formatted_result,
  "metadata": {
+ "request_id": request_id, # Include request ID for tracking
  "model_used": selected_model["model_id"],
  "provider": selected_model["provider"],
  "task": task,
  "service_type": service_type,
  "selection_reason": selected_model.get("reason", "Default selection"),
+ "execution_time_ms": execution_time_ms,
  "billing": billing_info
  }
  }
+
+ return response
  except Exception as e:
+ # Calculate execution time even for errors
+ execution_time_ms = int((time.time() - execution_start_time) * 1000)
+
+ # Log inference error
+ error_type = type(e).__name__
+ error_message = str(e)
+
+ self.inference_logger.log_inference_complete(
+ request_id=request_id,
+ status="failed",
+ execution_time_ms=execution_time_ms,
+ error_message=error_message,
+ error_code=error_type,
+ custom_metadata={
+ "error_location": "client._invoke_service"
+ }
+ )
+
+ # Also log to the error table
+ self.inference_logger.log_error(
+ request_id=request_id,
+ error_type=error_type,
+ error_message=error_message,
+ provider=model_hint or "unknown",
+ model_name=provider_hint or "unknown"
+ )
+
  logger.error(f"Service invoke failed: {e}")
  raise
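
Taken together, the client.py changes above add structured-output controls (output_format, json_schema, repair_attempts), per-request inference logging that surfaces a request_id and execution_time_ms in the response metadata, and automatic rate-limit fallback. The following is a minimal usage sketch, not code from the package: the entry-point name invoke and the task value "chat" are assumptions (the hunks above show only the parameter lists of internal helpers), while result["result"] and metadata["request_id"] do appear in the diff.

import asyncio
from isa_model.client import ISAModelClient

async def main():
    # Constructed with defaults here; constructor arguments are an assumption.
    client = ISAModelClient()
    result = await client.invoke(          # entry-point name is an assumption
        input_data="Summarize this ticket as JSON",
        task="chat",                       # task value is an assumption
        service_type="text",
        output_format="json",              # new in 0.4.4 (see hunks above)
        json_schema={"type": "object", "properties": {"summary": {"type": "string"}}},
        repair_attempts=3,                 # new in 0.4.4
    )
    print(result["result"])                  # formatted payload
    print(result["metadata"]["request_id"])  # new per-request tracking id

asyncio.run(main())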