isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff shows the changes between publicly available package versions as they were released to a supported registry. It is provided for informational purposes only.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +40 -17
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/client.py
CHANGED
@@ -72,10 +72,14 @@ print(result["result"])
 
 import logging
 import asyncio
+import time
+import uuid
 from typing import Any, Dict, Optional, List, Union
 from pathlib import Path
+from datetime import datetime, timezone
 
 from isa_model.inference.ai_factory import AIFactory
+from isa_model.core.logging import get_inference_logger, generate_request_id
 
 try:
     from isa_model.core.services.intelligent_model_selector import IntelligentModelSelector, get_model_selector
@@ -213,6 +217,9 @@ class ISAModelClient:
         # Cache for frequently used services
         self._service_cache: Dict[str, Any] = {}
 
+        # Initialize inference logger
+        self.inference_logger = get_inference_logger()
+
         logger.info("ISA Model Client initialized")
 
     async def _get_http_session(self):
@@ -327,6 +334,9 @@ class ISAModelClient:
         provider: Optional[str] = None,
         stream: Optional[bool] = None,
         show_reasoning: Optional[bool] = False,
+        output_format: Optional[str] = None,
+        json_schema: Optional[Dict] = None,
+        repair_attempts: Optional[int] = 3,
         **kwargs
     ) -> Dict[str, Any]:
         """
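The three parameters added to `invoke()` above (`output_format`, `json_schema`, `repair_attempts`) surface the new structured-output support. A minimal usage sketch, assuming a default `ISAModelClient()` constructor; the `task` value and prompt are illustrative, not taken from the diff:

```python
import asyncio
from isa_model.client import ISAModelClient

async def main():
    client = ISAModelClient()  # constructor arguments omitted
    result = await client.invoke(
        input_data="List three changes in this release.",
        task="chat",               # illustrative task name
        service_type="text",
        output_format="json",      # added between 0.4.0 and 0.4.3
        json_schema={
            "type": "object",
            "properties": {"changes": {"type": "array", "items": {"type": "string"}}},
        },
        repair_attempts=3,         # retry JSON repair up to 3 times
    )
    print(result["result"])
    print(result["metadata"]["request_id"])  # request IDs are now returned in metadata

asyncio.run(main())
```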
@@ -409,6 +419,9 @@ class ISAModelClient:
                 model_hint=model,
                 provider_hint=provider,
                 show_reasoning=show_reasoning, # Explicitly pass show_reasoning
+                output_format=output_format,
+                json_schema=json_schema,
+                repair_attempts=repair_attempts,
                 **kwargs
             )
         else:
@@ -420,6 +433,9 @@ class ISAModelClient:
                 model_hint=model,
                 provider_hint=provider,
                 stream=False, # Force non-streaming
+                output_format=output_format,
+                json_schema=json_schema,
+                repair_attempts=repair_attempts,
                 **kwargs
             )
 
@@ -488,7 +504,7 @@ class ISAModelClient:
             )
 
             # Step 2: Get appropriate service
-            service = await self._get_service(
+            service, _ = await self._get_service(
                 service_type=service_type,
                 model_name=selected_model["model_id"],
                 provider=selected_model["provider"],
@@ -508,7 +524,9 @@ class ISAModelClient:
             content_chunks = []
             async for token in service.astream(input_data):
                 content_chunks.append(token)
-                yield token
+                # Only yield string tokens for streaming (filter out dict/objects)
+                if isinstance(token, str):
+                    yield token
 
             # Step 6: After streaming is complete, calculate billing info and optionally return metadata
             try:
@@ -533,7 +551,7 @@ class ISAModelClient:
                 "billing": billing_info,
                 "streaming": True,
                 "tokens_streamed": len(content_chunks),
-                "content_length": len("".join(content_chunks))
+                "content_length": len("".join(str(chunk) if isinstance(chunk, str) else "" for chunk in content_chunks))
             }
             yield ('metadata', metadata)
 
@@ -554,7 +572,7 @@ class ISAModelClient:
                 },
                 "streaming": True,
                 "tokens_streamed": len(content_chunks),
-                "content_length": len("".join(content_chunks))
+                "content_length": len("".join(str(chunk) if isinstance(chunk, str) else "" for chunk in content_chunks))
             }
             yield ('metadata', fallback_metadata)
 
@@ -562,6 +580,111 @@ class ISAModelClient:
             logger.error(f"Streaming invoke failed: {e}")
             raise
 
+    def _is_rate_limit_error(self, error: Exception) -> bool:
+        """Check if an error is due to rate limiting"""
+        error_str = str(error).lower()
+
+        # Check for common rate limit indicators
+        rate_limit_indicators = [
+            'rate limit',
+            'rate_limit',
+            'ratelimit',
+            'too many requests',
+            'quota exceeded',
+            'limit exceeded',
+            'throttled',
+            '429'
+        ]
+
+        return any(indicator in error_str for indicator in rate_limit_indicators)
+
+    async def _invoke_with_fallback(
+        self,
+        service_type: str,
+        task: str,
+        input_data: Any,
+        selected_model: Dict[str, Any],
+        **kwargs
+    ) -> Any:
+        """Invoke service with automatic fallback on rate limit"""
+        try:
+            # First attempt with selected model
+            return await self._invoke_service_direct(service_type, task, input_data, selected_model, **kwargs)
+        except Exception as e:
+            # Check if this is a rate limit error
+            if self._is_rate_limit_error(e):
+                logger.warning(f"Rate limit detected for {selected_model['provider']}: {e}")
+
+                # Try to get fallback model using intelligent model selector
+                if INTELLIGENT_SELECTOR_AVAILABLE and self.model_selector:
+                    try:
+                        fallback_selection = self.model_selector.get_rate_limit_fallback(
+                            service_type,
+                            selected_model['provider']
+                        )
+
+                        if fallback_selection.get('success') and fallback_selection.get('is_fallback'):
+                            fallback_model = fallback_selection['selected_model']
+                            logger.info(f"Switching to fallback: {fallback_model['provider']}/{fallback_model['model_id']}")
+
+                            # Retry with fallback model
+                            return await self._invoke_service_direct(service_type, task, input_data, fallback_model, **kwargs)
+                    except Exception as fallback_error:
+                        logger.error(f"Fallback also failed: {fallback_error}")
+                        raise e  # Raise original rate limit error
+
+            # Re-raise the original error if not rate limit or fallback failed
+            raise
+
+    async def _invoke_service_direct(
+        self,
+        service_type: str,
+        task: str,
+        input_data: Any,
+        model_config: Dict[str, Any],
+        **kwargs
+    ) -> Any:
+        """Direct service invocation without fallback logic"""
+        # Get appropriate service
+        factory = AIFactory.get_instance()
+
+        # Create service with the specified model
+        if service_type == "text":
+            service = factory.get_llm(model_config["model_id"], model_config["provider"])
+        elif service_type == "vision":
+            service = factory.get_vision(model_config["model_id"], model_config["provider"])
+        elif service_type == "audio":
+            service = factory.get_audio(model_config["model_id"], model_config["provider"])
+        elif service_type == "image":
+            service = factory.get_image(model_config["model_id"], model_config["provider"])
+        elif service_type == "embedding":
+            service = factory.get_embed(model_config["model_id"], model_config["provider"])
+        else:
+            raise ValueError(f"Unsupported service type: {service_type}")
+
+        # Invoke the service
+        if service_type == "text":
+            show_reasoning = kwargs.pop('show_reasoning', False)
+
+            # Check if service supports show_reasoning parameter (mainly OpenAI services)
+            if model_config["provider"] == "openai":
+                result = await service.invoke(
+                    input_data=input_data,
+                    task=task,
+                    show_reasoning=show_reasoning,
+                    **kwargs
+                )
+            else:
+                # For other providers like yyds, don't pass show_reasoning
+                result = await service.invoke(
+                    input_data=input_data,
+                    task=task,
+                    **kwargs
+                )
+            return result
+        else:
+            return await service.invoke(input_data=input_data, task=task, **kwargs)
+
     async def _select_model(
         self,
         input_data: Any,
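The `_is_rate_limit_error` helper added above is a plain substring scan over the stringified exception. A self-contained sketch of the same heuristic (indicator list copied from the hunk; the standalone function name is mine):

```python
# Standalone version of the rate-limit heuristic from the hunk above.
RATE_LIMIT_INDICATORS = [
    'rate limit', 'rate_limit', 'ratelimit', 'too many requests',
    'quota exceeded', 'limit exceeded', 'throttled', '429',
]

def is_rate_limit_error(error: Exception) -> bool:
    """Return True if the error message looks like a provider rate limit."""
    error_str = str(error).lower()
    return any(indicator in error_str for indicator in RATE_LIMIT_INDICATORS)

# Hypothetical error messages, for illustration only:
assert is_rate_limit_error(RuntimeError("HTTP 429: Too Many Requests"))
assert not is_rate_limit_error(ValueError("invalid model name"))
```

On a positive match, `_invoke_with_fallback` asks the intelligent model selector for a `get_rate_limit_fallback` model and retries once with it; any other exception is re-raised unchanged.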
@@ -661,6 +784,7 @@ class ISAModelClient:
             "audio": {
                 "tts": {"model_id": "tts-1", "provider": "openai"},
                 "stt": {"model_id": "whisper-1", "provider": "openai"},
+                "realtime": {"model_id": "gpt-4o-realtime-preview-2024-10-01", "provider": "openai"},
                 "default": {"model_id": "whisper-1", "provider": "openai"}
             },
             "text": {
@@ -680,9 +804,14 @@ class ISAModelClient:
 
         # Handle audio service type with task-specific models
         if service_type == "audio":
-
+            # Realtime audio tasks
+            if any(realtime_task in task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+                default = defaults["audio"]["realtime"]
+            # Traditional TTS tasks
+            elif "speech" in task or "tts" in task or task in ["synthesize", "text_to_speech", "generate_speech"]:
                 default = defaults["audio"]["tts"]
-
+            # Traditional STT tasks
+            elif "transcribe" in task or "stt" in task or task in ["speech_to_text", "transcription"]:
                 default = defaults["audio"]["stt"]
             else:
                 default = defaults["audio"]["default"]
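The audio branch above now routes on task keywords before falling back to STT. A small sketch of that routing in isolation, using the defaults table from this hunk (`route_audio_default` is a hypothetical helper, not part of the package):

```python
# Illustrative routing: task keywords decide realtime vs TTS vs STT defaults.
REALTIME_TASKS = ["realtime", "audio_chat", "text_chat", "create_session",
                  "connect", "send_audio", "send_text", "listen"]

AUDIO_DEFAULTS = {
    "tts": {"model_id": "tts-1", "provider": "openai"},
    "stt": {"model_id": "whisper-1", "provider": "openai"},
    "realtime": {"model_id": "gpt-4o-realtime-preview-2024-10-01", "provider": "openai"},
    "default": {"model_id": "whisper-1", "provider": "openai"},
}

def route_audio_default(task: str) -> dict:
    if any(t in task for t in REALTIME_TASKS):
        return AUDIO_DEFAULTS["realtime"]
    if "speech" in task or "tts" in task or task in ["synthesize", "text_to_speech", "generate_speech"]:
        return AUDIO_DEFAULTS["tts"]
    if "transcribe" in task or "stt" in task or task in ["speech_to_text", "transcription"]:
        return AUDIO_DEFAULTS["stt"]
    return AUDIO_DEFAULTS["default"]

assert route_audio_default("audio_chat")["model_id"].startswith("gpt-4o-realtime")
assert route_audio_default("transcribe")["model_id"] == "whisper-1"
```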
@@ -714,14 +843,16 @@ class ISAModelClient:
         provider: str,
         task: str,
         use_cache: bool = True
-    ) -> Any:
-        """Get appropriate service instance"""
+    ) -> tuple[Any, str]:
+        """Get appropriate service instance and return actual model used"""
 
-        cache_key = f"{service_type}_{provider}_{model_name}"
+        cache_key = f"{service_type}_{provider}_{model_name}_{task}"
+        actual_model_used = model_name # Track the actual model used
 
         # Check cache first (if caching is enabled)
         if use_cache and cache_key in self._service_cache:
-
+            cached_service, cached_model = self._service_cache[cache_key]
+            return cached_service, cached_model
 
         try:
             # Validate service type
@@ -730,24 +861,46 @@ class ISAModelClient:
             # Route to appropriate AIFactory method
             if service_type == "vision":
                 service = self.ai_factory.get_vision(model_name, provider)
+                actual_model_used = model_name
             elif service_type == "audio":
-
-
-
-
+                # Realtime audio tasks
+                if any(realtime_task in task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+                    # Use realtime model
+                    realtime_model = "gpt-4o-realtime-preview-2024-10-01" if model_name == "tts-1" or model_name == "whisper-1" else model_name
+                    service = self.ai_factory.get_realtime(realtime_model, provider)
+                    actual_model_used = realtime_model
+                # Traditional TTS tasks
+                elif "speech" in task or "tts" in task or task in ["synthesize", "text_to_speech", "generate_speech"]:
+                    # Use TTS model
+                    tts_model = "tts-1" if model_name == "whisper-1" else model_name
+                    service = self.ai_factory.get_tts(tts_model, provider)
+                    actual_model_used = tts_model
+                # Traditional STT tasks
+                elif "transcribe" in task or "stt" in task or task in ["speech_to_text", "transcription"]:
+                    # Use STT model
+                    stt_model = "whisper-1" if model_name == "tts-1" else model_name
+                    service = self.ai_factory.get_stt(stt_model, provider)
+                    actual_model_used = stt_model
+                # Default to STT for backward compatibility
                 else:
-
+                    # Use STT model by default
+                    stt_model = "whisper-1" if model_name == "tts-1" else model_name
+                    service = self.ai_factory.get_stt(stt_model, provider)
+                    actual_model_used = stt_model
             elif service_type == "text":
                 service = self.ai_factory.get_llm(model_name, provider)
+                actual_model_used = model_name
             elif service_type == "image":
                 service = self.ai_factory.get_img("t2i", model_name, provider)
+                actual_model_used = model_name
             elif service_type == "embedding":
                 service = self.ai_factory.get_embed(model_name, provider)
+                actual_model_used = model_name
 
-            # Cache the service (if caching is enabled)
+            # Cache the service and actual model (if caching is enabled)
             if use_cache:
-                self._service_cache[cache_key] = service
-            return service
+                self._service_cache[cache_key] = (service, actual_model_used)
+            return service, actual_model_used
 
         except Exception as e:
             logger.error(f"Failed to get service {service_type}/{provider}/{model_name}: {e}")
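`_get_service` now returns a `(service, actual_model_used)` tuple and keys its cache on the task as well, so a TTS and an STT request against the same nominal model no longer collide. A minimal sketch of the new cache contract, assuming nothing about the real service objects:

```python
from typing import Any, Dict, Optional, Tuple

# Cache entries now hold the service plus the model that was actually used,
# and the key includes the task (mirroring the hunk above).
_service_cache: Dict[str, Tuple[Any, str]] = {}

def _cache_key(service_type: str, provider: str, model_name: str, task: str) -> str:
    return f"{service_type}_{provider}_{model_name}_{task}"

def cache_put(service_type: str, provider: str, model_name: str, task: str,
              service: Any, actual_model_used: str) -> None:
    _service_cache[_cache_key(service_type, provider, model_name, task)] = (service, actual_model_used)

def cache_get(service_type: str, provider: str, model_name: str,
              task: str) -> Optional[Tuple[Any, str]]:
    # Returns (service, actual_model_used) or None on a cache miss.
    return _service_cache.get(_cache_key(service_type, provider, model_name, task))
```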
@@ -785,12 +938,26 @@ class ISAModelClient:
             )
 
         elif service_type == "audio":
-
+            # Realtime audio tasks
+            if any(realtime_task in unified_task for realtime_task in ["realtime", "audio_chat", "text_chat", "create_session", "connect", "send_audio", "send_text", "listen"]):
+                # For realtime text_chat and audio_chat, pass text parameter
+                if unified_task in ["text_chat", "audio_chat"]:
+                    if isinstance(input_data, str):
+                        kwargs['text'] = input_data
+                    elif isinstance(input_data, bytes):
+                        kwargs['audio_data'] = input_data
+                return await service.invoke(
+                    task=unified_task,
+                    **kwargs
+                )
+            # Traditional TTS tasks
+            elif unified_task in ["synthesize", "text_to_speech", "tts", "generate_speech"]:
                 return await service.invoke(
                     text=input_data,
                     task=unified_task,
                     **kwargs
                 )
+            # Traditional STT tasks
             else:
                 return await service.invoke(
                     audio_input=input_data,
@@ -801,22 +968,58 @@ class ISAModelClient:
         elif service_type == "text":
             # Extract show_reasoning from kwargs if present
             show_reasoning = kwargs.pop('show_reasoning', False)
-
-
-
-
-
-
+
+            # Check if service provider supports show_reasoning
+            # Only OpenAI services support this parameter
+            if hasattr(service, 'provider_name') and service.provider_name == 'openai':
+                result = await service.invoke(
+                    input_data=input_data,
+                    task=unified_task,
+                    show_reasoning=show_reasoning,
+                    **kwargs
+                )
+            else:
+                # For other providers like yyds, don't pass show_reasoning
+                result = await service.invoke(
+                    input_data=input_data,
+                    task=unified_task,
+                    **kwargs
+                )
 
             logger.debug(f"Service result type: {type(result)}")
             logger.debug(f"Service result: {result}")
 
-            if isinstance(result, dict) and 'message' in result:
+            # Check if this is a formatted result from invoke method
+            if isinstance(result, dict) and 'formatted' in result:
+                # This is a formatted result from the new invoke method
+                logger.debug(f"Returning formatted result: {result}")
+                return result
+            elif isinstance(result, dict) and 'message' in result:
+                # This is a traditional message result
                 message = result['message']
                 logger.debug(f"Extracted message type: {type(message)}")
-                logger.debug(f"Extracted message: {message}")
-
+                logger.debug(f"Extracted message length: {len(str(message)) if message else 0}")
+
+                # Handle AIMessage objects from LangChain
+                if hasattr(message, 'content'):
+                    # Check if there are tool_calls
+                    if hasattr(message, 'tool_calls') and message.tool_calls:
+                        logger.debug(f"AIMessage contains tool_calls: {len(message.tool_calls)}")
+                        # Return a dict with both content and tool_calls
+                        return {
+                            "content": message.content if message.content else "",
+                            "tool_calls": message.tool_calls
+                        }
+                    else:
+                        content = message.content
+                        logger.debug(f"Extracted content from AIMessage: {len(content) if content else 0} chars")
+                        return content
+                else:
+                    # Direct string message
+                    logger.debug(f"Returning direct message: {len(str(message)) if message else 0} chars")
+                    return message
             else:
+                logger.debug(f"Returning result directly: {result}")
                 return result
 
         elif service_type == "image":
@@ -886,7 +1089,7 @@ class ISAModelClient:
 
         for service_type, provider, model in test_services:
             try:
-                await self._get_service(service_type, model, provider, "test")
+                service, _ = await self._get_service(service_type, model, provider, "test")
                 health_status["services"][f"{service_type}_{provider}"] = "healthy"
             except Exception as e:
                 health_status["services"][f"{service_type}_{provider}"] = f"error: {str(e)}"
@@ -916,9 +1119,18 @@ class ISAModelClient:
         service_type: str,
         model_hint: Optional[str] = None,
         provider_hint: Optional[str] = None,
+        output_format: Optional[str] = None,
+        json_schema: Optional[Dict] = None,
+        repair_attempts: Optional[int] = 3,
         **kwargs
     ) -> Dict[str, Any]:
         """Service invoke that returns streaming response with async generator"""
+
+        # Generate unique request ID for logging
+        request_id = generate_request_id()
+        start_time = datetime.now(timezone.utc)
+        execution_start_time = time.time()
+
         try:
             # Step 1: Select best model for this task
             selected_model = await self._select_model(
@@ -930,18 +1142,20 @@ class ISAModelClient:
             )
 
             # Step 2: Get appropriate service
-            service = await self._get_service(
+            service, actual_model_used = await self._get_service(
                 service_type=service_type,
                 model_name=selected_model["model_id"],
                 provider=selected_model["provider"],
                 task=task,
                 use_cache=False # Don't cache for streaming to avoid state issues
             )
+            # Update selected model with actual model used
+            selected_model["model_id"] = actual_model_used
 
             # Step 3: Handle tools for LLM services (bind tools if provided)
             tools = kwargs.pop("tools", None)
             if service_type == "text" and tools:
-                service = await self._get_service(
+                service, _ = await self._get_service(
                     service_type=service_type,
                     model_name=selected_model["model_id"],
                     provider=selected_model["provider"],
@@ -964,7 +1178,8 @@ class ISAModelClient:
             if service_type == "text" and hasattr(service, 'astream'):
                 show_reasoning = kwargs.get('show_reasoning', False)
                 logger.debug(f"Stream generator: show_reasoning={show_reasoning}")
-
+                # Only pass show_reasoning to OpenAI providers
+                if 'show_reasoning' in kwargs and hasattr(service, 'provider_name') and service.provider_name == 'openai':
                     async for token in service.astream(input_data, show_reasoning=show_reasoning):
                         yield token
                 else:
@@ -999,9 +1214,18 @@ class ISAModelClient:
         model_hint: Optional[str] = None,
         provider_hint: Optional[str] = None,
         stream: Optional[bool] = None,
+        output_format: Optional[str] = None,
+        json_schema: Optional[Dict] = None,
+        repair_attempts: Optional[int] = 3,
         **kwargs
     ) -> Dict[str, Any]:
         """Direct service invoke - passes LangChain objects and tools directly to services"""
+
+        # Generate unique request ID for logging
+        request_id = generate_request_id()
+        start_time = datetime.now(timezone.utc)
+        execution_start_time = time.time()
+
         try:
             # Step 1: Select best model for this task
             selected_model = await self._select_model(
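Both invoke paths now stamp every call with a request ID and a start time so that `execution_time_ms` can be reported and logged. The pattern in isolation; `generate_request_id` below is a stand-in for `isa_model.core.logging.generate_request_id`, whose exact ID format is not shown in this diff:

```python
import time
import uuid
from datetime import datetime, timezone

def generate_request_id() -> str:
    # Assumption: the real helper may format IDs differently.
    return uuid.uuid4().hex

request_id = generate_request_id()
start_time = datetime.now(timezone.utc)   # wall-clock timestamp for the log record
execution_start_time = time.time()        # basis for execution_time_ms

# ... perform the model call here ...

execution_time_ms = int((time.time() - execution_start_time) * 1000)
print(request_id, start_time.isoformat(), execution_time_ms)
```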
@@ -1012,18 +1236,35 @@ class ISAModelClient:
                 provider_hint=provider_hint
             )
 
+            # Step 1.5: Log inference start
+            self.inference_logger.log_inference_start(
+                request_id=request_id,
+                service_type=service_type,
+                task=task,
+                provider=selected_model["provider"],
+                model_name=selected_model["model_id"],
+                input_data=input_data if self.inference_logger.log_detailed_requests else None,
+                is_streaming=stream or False,
+                custom_metadata={
+                    "selection_reason": selected_model.get("reason", "Default selection"),
+                    "has_tools": "tools" in kwargs
+                }
+            )
+
             # Step 2: Get appropriate service
-            service = await self._get_service(
+            service, actual_model_used = await self._get_service(
                 service_type=service_type,
                 model_name=selected_model["model_id"],
                 provider=selected_model["provider"],
                 task=task
             )
+            # Update selected model with actual model used
+            selected_model["model_id"] = actual_model_used
 
             # Step 3: Handle tools for LLM services (bind tools if provided)
             tools = kwargs.pop("tools", None)
             if service_type == "text" and tools:
-                service = await self._get_service(
+                service, _ = await self._get_service(
                     service_type=service_type,
                     model_name=selected_model["model_id"],
                     provider=selected_model["provider"],
@@ -1039,32 +1280,214 @@ class ISAModelClient:
                 service.streaming = stream
 
             # Step 5: Execute task with unified interface
-
-
-
-
-
-
-
+            # Pass JSON formatting parameters to the service
+            task_kwargs = kwargs.copy()
+            if service_type == "text":
+                if output_format:
+                    task_kwargs["output_format"] = output_format
+                if json_schema:
+                    task_kwargs["json_schema"] = json_schema
+                if repair_attempts is not None:
+                    task_kwargs["repair_attempts"] = repair_attempts
+
+            # Try to execute with rate limit detection
+            try:
+                result = await self._execute_task(
+                    service=service,
+                    input_data=input_data,
+                    task=task,
+                    service_type=service_type,
+                    **task_kwargs
+                )
+            except Exception as e:
+                # Check if this is a rate limit error and we can fallback
+                if self._is_rate_limit_error(e) and service_type == "text":
+                    # Ensure model selector is initialized
+                    if not self.model_selector:
+                        self.model_selector = await get_model_selector(self.config)
+
+                    # Get fallback model selection
+                    fallback_selection = self.model_selector.get_rate_limit_fallback(
+                        service_type=service_type,
+                        original_provider=selected_model["provider"]
+                    )
+
+                    if fallback_selection.get('success'):
+                        fallback_model = fallback_selection.get('selected_model', {})
+                        logger.info(f"Rate limit hit, switching to fallback: {fallback_model}")
+
+                        # Get fallback service
+                        fallback_service, fallback_model_used = await self._get_service(
+                            service_type=service_type,
+                            model_name=fallback_model["model_id"],
+                            provider=fallback_model["provider"],
+                            task=task
+                        )
+
+                        # Update selected model for metadata
+                        selected_model = fallback_model
+                        selected_model["model_id"] = fallback_model_used
+                        selected_model["reason"] = "Rate limit fallback"
+
+                        # Retry with fallback service
+                        result = await self._execute_task(
+                            service=fallback_service,
+                            input_data=input_data,
+                            task=task,
+                            service_type=service_type,
+                            **task_kwargs
+                        )
+                    else:
+                        # No fallback available, re-raise original error
+                        raise
+                else:
+                    # Not a rate limit error or no fallback, re-raise
+                    raise
 
             # Step 6: Wait for billing tracking to complete, then get billing information
             await asyncio.sleep(0.01) # Small delay to ensure billing tracking completes
             billing_info = self._get_billing_info(service, selected_model["model_id"])
 
+            # Step 6.5: Calculate execution time and log completion
+            execution_time_ms = int((time.time() - execution_start_time) * 1000)
+
+            # Log inference completion
+            self.inference_logger.log_inference_complete(
+                request_id=request_id,
+                status="completed",
+                execution_time_ms=execution_time_ms,
+                input_tokens=billing_info.get("input_tokens"),
+                output_tokens=billing_info.get("output_tokens"),
+                estimated_cost_usd=billing_info.get("cost_usd"),
+                output_data=result if self.inference_logger.log_detailed_requests else None,
+                custom_metadata={
+                    "billing_operation": billing_info.get("operation"),
+                    "timestamp": billing_info.get("timestamp")
+                }
+            )
+
+            # Log detailed token usage if available
+            if billing_info.get("input_tokens") and billing_info.get("output_tokens"):
+                self.inference_logger.log_token_usage(
+                    request_id=request_id,
+                    provider=selected_model["provider"],
+                    model_name=selected_model["model_id"],
+                    prompt_tokens=billing_info.get("input_tokens"),
+                    completion_tokens=billing_info.get("output_tokens"),
+                    prompt_cost_usd=billing_info.get("cost_usd", 0) * 0.6 if billing_info.get("cost_usd") else None, # Rough estimate
+                    completion_cost_usd=billing_info.get("cost_usd", 0) * 0.4 if billing_info.get("cost_usd") else None
+                )
+
+            # Handle formatting - check if result is already formatted
+            formatted_result = result
+            if service_type == "text" and output_format:
+                # Check if result is already formatted by the service
+                if isinstance(result, dict) and result.get("formatted"):
+                    # Result is already formatted by the service
+                    formatted_result = result.get("result", result)
+                    billing_info["formatting"] = {
+                        "output_format": output_format,
+                        "format_success": True,
+                        "format_method": "service_level",
+                        "format_errors": result.get("format_errors", []),
+                        "repaired": False,
+                        "pre_formatted": True
+                    }
+                else:
+                    # Apply formatting at client level (fallback)
+                    try:
+                        service, _ = await self._get_service(
+                            service_type=service_type,
+                            model_name=selected_model["model_id"],
+                            provider=selected_model["provider"],
+                            task=task
+                        )
+                        if hasattr(service, 'format_structured_output'):
+                            formatting_result = service.format_structured_output(
+                                response=result,
+                                output_format=output_format,
+                                schema=json_schema,
+                                repair_attempts=repair_attempts or 3
+                            )
+                            # Update result and add formatting metadata
+                            if formatting_result.get("success") and formatting_result.get("data") is not None:
+                                # Extract the actual formatted data
+                                formatted_data = formatting_result["data"]
+
+                                # For JSON output, ensure we return clean data
+                                if output_format == "json" and isinstance(formatted_data, dict):
+                                    formatted_result = formatted_data
+                                else:
+                                    formatted_result = formatted_data
+                            else:
+                                # Keep original result if formatting failed
+                                formatted_result = result
+
+                            # Add formatting info to metadata
+                            billing_info["formatting"] = {
+                                "output_format": output_format,
+                                "format_success": formatting_result.get("success", False),
+                                "format_method": formatting_result.get("method"),
+                                "format_errors": formatting_result.get("errors", []),
+                                "repaired": formatting_result.get("repaired", False),
+                                "pre_formatted": False
+                            }
+
+                    except Exception as format_error:
+                        logger.warning(f"Failed to apply output formatting: {format_error}")
+                        # Continue with unformatted result
+                        formatted_result = result
+                        billing_info["formatting"] = {
+                            "output_format": output_format,
+                            "format_success": False,
+                            "format_error": str(format_error)
+                        }
+
             # Return unified response
-
+            response = {
                 "success": True,
-                "result": result,
+                "result": formatted_result,
                 "metadata": {
+                    "request_id": request_id, # Include request ID for tracking
                     "model_used": selected_model["model_id"],
                     "provider": selected_model["provider"],
                     "task": task,
                     "service_type": service_type,
                     "selection_reason": selected_model.get("reason", "Default selection"),
+                    "execution_time_ms": execution_time_ms,
                     "billing": billing_info
                 }
             }
+
+            return response
         except Exception as e:
+            # Calculate execution time even for errors
+            execution_time_ms = int((time.time() - execution_start_time) * 1000)
+
+            # Log inference error
+            error_type = type(e).__name__
+            error_message = str(e)
+
+            self.inference_logger.log_inference_complete(
+                request_id=request_id,
+                status="failed",
+                execution_time_ms=execution_time_ms,
+                error_message=error_message,
+                error_code=error_type,
+                custom_metadata={
+                    "error_location": "client._invoke_service"
+                }
+            )
+
+            # Also log to the error table
+            self.inference_logger.log_error(
+                request_id=request_id,
+                error_type=error_type,
+                error_message=error_message,
+                provider=model_hint or "unknown",
+                model_name=provider_hint or "unknown"
+            )
+
             logger.error(f"Service invoke failed: {e}")
             raise
 
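Taken together, the hunk above means a successful `_invoke_service` call now returns an envelope carrying a request ID and timing alongside the billing data. A sketch of that shape as a `TypedDict` (field names come from the diff; the types are a reading of the code, not declared by the package):

```python
from typing import Any, Dict, TypedDict

class ResponseMetadata(TypedDict, total=False):
    request_id: str
    model_used: str
    provider: str
    task: str
    service_type: str
    selection_reason: str
    execution_time_ms: int
    billing: Dict[str, Any]  # may include a "formatting" sub-dict when output_format is set

class UnifiedResponse(TypedDict):
    success: bool
    result: Any              # formatted_result when structured output was requested
    metadata: ResponseMetadata
```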