tamar-model-client 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tamar_model_client/async_client.py +97 -25
- tamar_model_client/circuit_breaker.py +6 -3
- tamar_model_client/core/__init__.py +9 -1
- tamar_model_client/core/base_client.py +137 -37
- tamar_model_client/core/http_fallback.py +238 -17
- tamar_model_client/core/logging_setup.py +15 -1
- tamar_model_client/core/request_id_manager.py +112 -0
- tamar_model_client/core/utils.py +27 -1
- tamar_model_client/error_handler.py +106 -13
- tamar_model_client/sync_client.py +205 -43
- {tamar_model_client-0.1.26.dist-info → tamar_model_client-0.1.28.dist-info}/METADATA +96 -3
- {tamar_model_client-0.1.26.dist-info → tamar_model_client-0.1.28.dist-info}/RECORD +15 -14
- tests/test_google_azure_final.py +17 -17
- {tamar_model_client-0.1.26.dist-info → tamar_model_client-0.1.28.dist-info}/WHEEL +0 -0
- {tamar_model_client-0.1.26.dist-info → tamar_model_client-0.1.28.dist-info}/top_level.txt +0 -0
@@ -32,8 +32,11 @@ from grpc import RpcError
|
|
32
32
|
from .core import (
|
33
33
|
generate_request_id,
|
34
34
|
set_request_id,
|
35
|
+
set_origin_request_id,
|
35
36
|
get_protected_logger,
|
36
|
-
MAX_MESSAGE_LENGTH,
|
37
|
+
MAX_MESSAGE_LENGTH,
|
38
|
+
get_request_id,
|
39
|
+
RequestIdManager
|
37
40
|
)
|
38
41
|
from .core.base_client import BaseClient
|
39
42
|
from .core.request_builder import RequestBuilder
|
@@ -102,12 +105,18 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
102
105
|
self._last_channel_error_time = None
|
103
106
|
self._channel_lock = asyncio.Lock() # 异步锁
|
104
107
|
|
108
|
+
# === Request ID 管理 ===
|
109
|
+
self._request_id_manager = RequestIdManager()
|
110
|
+
|
105
111
|
# === 增强的重试处理器 ===
|
106
112
|
self.retry_handler = EnhancedRetryHandler(
|
107
113
|
max_retries=self.max_retries,
|
108
114
|
base_delay=self.retry_delay
|
109
115
|
)
|
110
116
|
|
117
|
+
# 设置client引用,用于快速降级
|
118
|
+
self.retry_handler.error_handler.client = self
|
119
|
+
|
111
120
|
# 注册退出时的清理函数
|
112
121
|
atexit.register(self._cleanup_atexit)
|
113
122
|
|
@@ -734,7 +743,12 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
734
743
|
if self.resilient_enabled and self.circuit_breaker and self.circuit_breaker.is_open:
|
735
744
|
if self.http_fallback_url:
|
736
745
|
logger.warning("🔻 Circuit breaker is OPEN, using HTTP fallback")
|
737
|
-
|
746
|
+
# 在这里还没有计算origin_request_id,所以先计算
|
747
|
+
temp_origin_request_id = None
|
748
|
+
temp_request_id = request_id
|
749
|
+
if request_id:
|
750
|
+
temp_request_id, temp_origin_request_id = self._request_id_manager.get_composite_id(request_id)
|
751
|
+
return await self._invoke_http_fallback(model_request, timeout, temp_request_id, temp_origin_request_id)
|
738
752
|
|
739
753
|
await self._ensure_initialized()
|
740
754
|
|
@@ -744,10 +758,24 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
744
758
|
"user_id": model_request.user_context.user_id or ""
|
745
759
|
}
|
746
760
|
|
747
|
-
|
761
|
+
# 处理 request_id
|
762
|
+
origin_request_id = None
|
763
|
+
if request_id:
|
764
|
+
# 用户提供了 request_id,生成组合 ID
|
765
|
+
request_id, origin_request_id = self._request_id_manager.get_composite_id(request_id)
|
766
|
+
else:
|
767
|
+
# 没有提供,生成新的
|
748
768
|
request_id = generate_request_id()
|
769
|
+
|
749
770
|
set_request_id(request_id)
|
750
|
-
|
771
|
+
if origin_request_id:
|
772
|
+
set_origin_request_id(origin_request_id)
|
773
|
+
metadata = self._build_auth_metadata(request_id, origin_request_id)
|
774
|
+
|
775
|
+
# 构建日志数据
|
776
|
+
log_data = ResponseHandler.build_log_data(model_request)
|
777
|
+
if origin_request_id:
|
778
|
+
log_data['origin_request_id'] = origin_request_id
|
751
779
|
|
752
780
|
# 记录开始日志
|
753
781
|
start_time = time.time()
|
@@ -756,7 +784,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
756
784
|
extra={
|
757
785
|
"log_type": "request",
|
758
786
|
"uri": f"/invoke/{model_request.provider.value}/{model_request.invoke_type.value}",
|
759
|
-
"data":
|
787
|
+
"data": log_data
|
760
788
|
})
|
761
789
|
|
762
790
|
try:
|
@@ -789,18 +817,34 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
789
817
|
# 对于流式响应,直接返回带日志记录的包装器
|
790
818
|
return self._stream_with_logging(request, metadata, invoke_timeout, start_time, model_request)
|
791
819
|
else:
|
792
|
-
|
820
|
+
# 存储model_request和origin_request_id供重试方法使用
|
821
|
+
self._current_model_request = model_request
|
822
|
+
self._current_origin_request_id = origin_request_id
|
823
|
+
try:
|
824
|
+
result = await self._retry_request(self._invoke_request, request, metadata, invoke_timeout, request_id=request_id)
|
825
|
+
finally:
|
826
|
+
# 清理临时存储
|
827
|
+
if hasattr(self, '_current_model_request'):
|
828
|
+
delattr(self, '_current_model_request')
|
829
|
+
if hasattr(self, '_current_origin_request_id'):
|
830
|
+
delattr(self, '_current_origin_request_id')
|
793
831
|
|
794
832
|
# 记录非流式响应的成功日志
|
795
833
|
duration = time.time() - start_time
|
796
834
|
content_length = len(result.content) if result.content else 0
|
835
|
+
|
836
|
+
# 构建响应日志数据
|
837
|
+
response_log_data = ResponseHandler.build_log_data(model_request, result)
|
838
|
+
if origin_request_id:
|
839
|
+
response_log_data['origin_request_id'] = origin_request_id
|
840
|
+
|
797
841
|
logger.info(
|
798
842
|
f"✅ Request completed | content_length: {content_length}",
|
799
843
|
extra={
|
800
844
|
"log_type": "response",
|
801
845
|
"uri": f"/invoke/{model_request.provider.value}/{model_request.invoke_type.value}",
|
802
846
|
"duration": duration,
|
803
|
-
"data":
|
847
|
+
"data": response_log_data
|
804
848
|
}
|
805
849
|
)
|
806
850
|
|
@@ -813,31 +857,29 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
813
857
|
except (ConnectionError, grpc.RpcError) as e:
|
814
858
|
duration = time.time() - start_time
|
815
859
|
error_message = f"❌ Invoke gRPC failed: {str(e)}"
|
860
|
+
|
861
|
+
# 构建错误日志数据
|
862
|
+
error_log_data = ResponseHandler.build_log_data(model_request, error=e)
|
863
|
+
if origin_request_id:
|
864
|
+
error_log_data['origin_request_id'] = origin_request_id
|
865
|
+
|
816
866
|
logger.error(error_message, exc_info=True,
|
817
867
|
extra={
|
818
868
|
"log_type": "response",
|
819
869
|
"uri": f"/invoke/{model_request.provider.value}/{model_request.invoke_type.value}",
|
820
870
|
"duration": duration,
|
821
|
-
"data":
|
822
|
-
model_request,
|
823
|
-
error=e
|
824
|
-
)
|
871
|
+
"data": error_log_data
|
825
872
|
})
|
826
873
|
|
827
874
|
# 记录 channel 错误
|
828
875
|
if isinstance(e, grpc.RpcError):
|
829
876
|
self._record_channel_error(e)
|
830
877
|
|
831
|
-
#
|
878
|
+
# 记录失败(如果启用了熔断)
|
832
879
|
if self.resilient_enabled and self.circuit_breaker:
|
833
880
|
# 将错误码传递给熔断器,用于智能失败统计
|
834
881
|
error_code = e.code() if hasattr(e, 'code') else None
|
835
882
|
self.circuit_breaker.record_failure(error_code)
|
836
|
-
|
837
|
-
# 如果可以降级,则降级
|
838
|
-
if self.http_fallback_url and self.circuit_breaker.should_fallback():
|
839
|
-
logger.warning(f"🔻 gRPC failed, falling back to HTTP: {str(e)}")
|
840
|
-
return await self._invoke_http_fallback(model_request, timeout, request_id)
|
841
883
|
|
842
884
|
raise e
|
843
885
|
except Exception as e:
|
@@ -867,6 +909,17 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
867
909
|
Returns:
|
868
910
|
BatchModelResponse: 批量请求的结果
|
869
911
|
"""
|
912
|
+
# 如果启用了熔断且熔断器打开,直接走 HTTP
|
913
|
+
if self.resilient_enabled and self.circuit_breaker and self.circuit_breaker.is_open:
|
914
|
+
if self.http_fallback_url:
|
915
|
+
logger.warning("🔻 Circuit breaker is OPEN, using HTTP fallback for batch request")
|
916
|
+
# 在这里还没有计算origin_request_id,所以先计算
|
917
|
+
temp_origin_request_id = None
|
918
|
+
temp_request_id = request_id
|
919
|
+
if request_id:
|
920
|
+
temp_request_id, temp_origin_request_id = self._request_id_manager.get_composite_id(request_id)
|
921
|
+
return await self._invoke_batch_http_fallback(batch_request_model, timeout, temp_request_id, temp_origin_request_id)
|
922
|
+
|
870
923
|
await self._ensure_initialized()
|
871
924
|
|
872
925
|
if not self.default_payload:
|
@@ -875,10 +928,29 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
875
928
|
"user_id": batch_request_model.user_context.user_id or ""
|
876
929
|
}
|
877
930
|
|
878
|
-
|
931
|
+
# 处理 request_id
|
932
|
+
origin_request_id = None
|
933
|
+
if request_id:
|
934
|
+
# 用户提供了 request_id,生成组合 ID
|
935
|
+
request_id, origin_request_id = self._request_id_manager.get_composite_id(request_id)
|
936
|
+
else:
|
937
|
+
# 没有提供,生成新的
|
879
938
|
request_id = generate_request_id()
|
939
|
+
|
880
940
|
set_request_id(request_id)
|
881
|
-
|
941
|
+
if origin_request_id:
|
942
|
+
set_origin_request_id(origin_request_id)
|
943
|
+
metadata = self._build_auth_metadata(request_id, origin_request_id)
|
944
|
+
|
945
|
+
# 构建日志数据
|
946
|
+
batch_log_data = {
|
947
|
+
"batch_size": len(batch_request_model.items),
|
948
|
+
"org_id": batch_request_model.user_context.org_id,
|
949
|
+
"user_id": batch_request_model.user_context.user_id,
|
950
|
+
"client_type": batch_request_model.user_context.client_type
|
951
|
+
}
|
952
|
+
if origin_request_id:
|
953
|
+
batch_log_data['origin_request_id'] = origin_request_id
|
882
954
|
|
883
955
|
# 记录开始日志
|
884
956
|
start_time = time.time()
|
@@ -887,12 +959,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
887
959
|
extra={
|
888
960
|
"log_type": "request",
|
889
961
|
"uri": "/batch_invoke",
|
890
|
-
"data":
|
891
|
-
"batch_size": len(batch_request_model.items),
|
892
|
-
"org_id": batch_request_model.user_context.org_id,
|
893
|
-
"user_id": batch_request_model.user_context.user_id,
|
894
|
-
"client_type": batch_request_model.user_context.client_type
|
895
|
-
}
|
962
|
+
"data": batch_log_data
|
896
963
|
})
|
897
964
|
|
898
965
|
try:
|
@@ -919,6 +986,11 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
919
986
|
|
920
987
|
try:
|
921
988
|
invoke_timeout = timeout or self.default_invoke_timeout
|
989
|
+
|
990
|
+
# 保存批量请求信息用于降级
|
991
|
+
self._current_batch_request = batch_request_model
|
992
|
+
self._current_origin_request_id = origin_request_id
|
993
|
+
|
922
994
|
batch_response = await self._retry_request(
|
923
995
|
self.stub.BatchInvoke,
|
924
996
|
batch_request,
|
@@ -101,9 +101,12 @@ class CircuitBreaker:
|
|
101
101
|
logger.warning(
|
102
102
|
f"🔻 Circuit breaker OPENED after {self.failure_count} failures",
|
103
103
|
extra={
|
104
|
-
"
|
105
|
-
"
|
106
|
-
|
104
|
+
"log_type": "info",
|
105
|
+
"data": {
|
106
|
+
"failure_count": self.failure_count,
|
107
|
+
"threshold": self.failure_threshold,
|
108
|
+
"trigger_error": error_code.name if error_code else "unknown"
|
109
|
+
}
|
107
110
|
}
|
108
111
|
)
|
109
112
|
|
@@ -10,7 +10,9 @@ from .utils import (
|
|
10
10
|
remove_none_from_dict,
|
11
11
|
generate_request_id,
|
12
12
|
set_request_id,
|
13
|
-
get_request_id
|
13
|
+
get_request_id,
|
14
|
+
set_origin_request_id,
|
15
|
+
get_origin_request_id
|
14
16
|
)
|
15
17
|
|
16
18
|
from .logging_setup import (
|
@@ -22,6 +24,8 @@ from .logging_setup import (
|
|
22
24
|
MAX_MESSAGE_LENGTH
|
23
25
|
)
|
24
26
|
|
27
|
+
from .request_id_manager import RequestIdManager
|
28
|
+
|
25
29
|
__all__ = [
|
26
30
|
# Utils
|
27
31
|
'is_effective_value',
|
@@ -30,6 +34,8 @@ __all__ = [
|
|
30
34
|
'generate_request_id',
|
31
35
|
'set_request_id',
|
32
36
|
'get_request_id',
|
37
|
+
'set_origin_request_id',
|
38
|
+
'get_origin_request_id',
|
33
39
|
# Logging
|
34
40
|
'setup_logger',
|
35
41
|
'RequestIdFilter',
|
@@ -37,4 +43,6 @@ __all__ = [
|
|
37
43
|
'get_protected_logger',
|
38
44
|
'reset_logger_config',
|
39
45
|
'MAX_MESSAGE_LENGTH',
|
46
|
+
# Request ID Management
|
47
|
+
'RequestIdManager',
|
40
48
|
]
|
@@ -6,8 +6,7 @@ and configuration management for both sync and async clients.
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
import os
|
9
|
-
import
|
10
|
-
from typing import Optional, Dict, Any
|
9
|
+
from typing import Optional
|
11
10
|
from abc import ABC, abstractmethod
|
12
11
|
|
13
12
|
from ..auth import JWTAuthHandler
|
@@ -25,7 +24,7 @@ class BaseClient(ABC):
|
|
25
24
|
- 连接选项构建
|
26
25
|
- 错误处理器初始化
|
27
26
|
"""
|
28
|
-
|
27
|
+
|
29
28
|
def __init__(
|
30
29
|
self,
|
31
30
|
server_address: Optional[str] = None,
|
@@ -57,40 +56,43 @@ class BaseClient(ABC):
|
|
57
56
|
self.server_address = server_address or os.getenv("MODEL_MANAGER_SERVER_ADDRESS")
|
58
57
|
if not self.server_address:
|
59
58
|
raise ValueError("Server address must be provided via argument or environment variable.")
|
60
|
-
|
59
|
+
|
61
60
|
# 默认调用超时时间
|
62
61
|
self.default_invoke_timeout = float(os.getenv("MODEL_MANAGER_SERVER_INVOKE_TIMEOUT", 30.0))
|
63
|
-
|
62
|
+
|
64
63
|
# === JWT 认证配置 ===
|
65
64
|
self.jwt_secret_key = jwt_secret_key or os.getenv("MODEL_MANAGER_SERVER_JWT_SECRET_KEY")
|
66
65
|
self.jwt_handler = JWTAuthHandler(self.jwt_secret_key) if self.jwt_secret_key else None
|
67
66
|
self.jwt_token = jwt_token # 用户传入的预生成 Token(可选)
|
68
67
|
self.default_payload = default_payload
|
69
68
|
self.token_expires_in = token_expires_in
|
70
|
-
|
69
|
+
|
71
70
|
# === TLS/Authority 配置 ===
|
72
71
|
self.use_tls = os.getenv("MODEL_MANAGER_SERVER_GRPC_USE_TLS", "true").lower() == "true"
|
73
72
|
self.default_authority = os.getenv("MODEL_MANAGER_SERVER_GRPC_DEFAULT_AUTHORITY")
|
74
|
-
|
73
|
+
|
75
74
|
# === 重试配置 ===
|
76
75
|
self.max_retries = max_retries if max_retries is not None else int(
|
77
76
|
os.getenv("MODEL_MANAGER_SERVER_GRPC_MAX_RETRIES", 6))
|
78
77
|
self.retry_delay = retry_delay if retry_delay is not None else float(
|
79
78
|
os.getenv("MODEL_MANAGER_SERVER_GRPC_RETRY_DELAY", 1.0))
|
80
|
-
|
79
|
+
|
81
80
|
# === 日志配置 ===
|
82
81
|
self.logger = get_protected_logger(logger_name or __name__)
|
83
|
-
|
82
|
+
|
84
83
|
# === 错误处理器 ===
|
85
84
|
self.error_handler = GrpcErrorHandler(self.logger)
|
86
85
|
self.recovery_strategy = ErrorRecoveryStrategy(self)
|
87
|
-
|
86
|
+
|
88
87
|
# === 连接状态 ===
|
89
88
|
self._closed = False
|
90
|
-
|
89
|
+
|
91
90
|
# === 熔断降级配置 ===
|
92
91
|
self._init_resilient_features()
|
93
|
-
|
92
|
+
|
93
|
+
# === 快速降级配置 ===
|
94
|
+
self._init_fast_fallback_config()
|
95
|
+
|
94
96
|
def build_channel_options(self) -> list:
|
95
97
|
"""
|
96
98
|
构建 gRPC 通道选项
|
@@ -108,30 +110,44 @@ class BaseClient(ABC):
|
|
108
110
|
# 消息大小限制
|
109
111
|
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
|
110
112
|
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
|
111
|
-
|
113
|
+
|
112
114
|
# Keepalive 核心配置
|
113
115
|
('grpc.keepalive_time_ms', 30000), # 30秒发送一次 keepalive ping
|
114
116
|
('grpc.keepalive_timeout_ms', 10000), # ping 响应超时时间 10秒
|
115
117
|
('grpc.keepalive_permit_without_calls', True), # 空闲时也发送 keepalive
|
116
118
|
('grpc.http2.max_pings_without_data', 2), # 无数据时最大 ping 次数
|
117
|
-
|
119
|
+
|
118
120
|
# 连接管理增强配置
|
119
121
|
('grpc.http2.min_time_between_pings_ms', 10000), # ping 最小间隔 10秒
|
120
122
|
('grpc.http2.max_connection_idle_ms', 300000), # 最大空闲时间 5分钟
|
121
123
|
('grpc.http2.max_connection_age_ms', 3600000), # 连接最大生存时间 1小时
|
122
124
|
('grpc.http2.max_connection_age_grace_ms', 5000), # 优雅关闭时间 5秒
|
123
|
-
|
125
|
+
|
124
126
|
# 性能相关配置
|
125
127
|
('grpc.http2.bdp_probe', 1), # 启用带宽延迟探测
|
126
128
|
('grpc.enable_retries', 1), # 启用内置重试
|
129
|
+
|
130
|
+
# 启用连接池配置(如果 gRPC 客户端支持)
|
131
|
+
('grpc.keepalive_time_ms', 30000), # 保持活跃的连接时间(30秒)
|
132
|
+
('grpc.keepalive_timeout_ms', 10000), # ping 响应超时时间(10秒)
|
133
|
+
('grpc.max_connection_idle_ms', 300000), # 连接最大空闲时间(5分钟)
|
134
|
+
|
135
|
+
# 设置资源配额
|
136
|
+
('grpc.resource_quota_size', 1048576000), # 设置资源配额为1GB
|
137
|
+
|
138
|
+
# 启用负载均衡配置
|
139
|
+
('grpc.lb_policy', 'round_robin'), # 设置负载均衡策略为 round_robin(轮询)
|
140
|
+
|
141
|
+
# 启用详细的日志记录
|
142
|
+
('grpc.debug', 1), # 启用 gRPC 的调试日志,记录更多的连接和请求信息
|
127
143
|
]
|
128
|
-
|
144
|
+
|
129
145
|
if self.default_authority:
|
130
146
|
options.append(("grpc.default_authority", self.default_authority))
|
131
|
-
|
147
|
+
|
132
148
|
return options
|
133
|
-
|
134
|
-
def _build_auth_metadata(self, request_id: str) -> list:
|
149
|
+
|
150
|
+
def _build_auth_metadata(self, request_id: str, origin_request_id: Optional[str] = None) -> list:
|
135
151
|
"""
|
136
152
|
构建认证元数据
|
137
153
|
|
@@ -140,82 +156,166 @@ class BaseClient(ABC):
|
|
140
156
|
|
141
157
|
Args:
|
142
158
|
request_id: 当前请求的唯一标识符
|
159
|
+
origin_request_id: 原始请求ID(可选)
|
143
160
|
|
144
161
|
Returns:
|
145
162
|
list: gRPC元数据列表,包含请求ID和认证令牌
|
146
163
|
"""
|
147
164
|
metadata = [("x-request-id", request_id)] # 将 request_id 添加到 headers
|
148
|
-
|
165
|
+
|
166
|
+
# 如果有原始请求ID,也添加到 headers
|
167
|
+
if origin_request_id:
|
168
|
+
metadata.append(("x-origin-request-id", origin_request_id))
|
169
|
+
|
149
170
|
if self.jwt_handler:
|
150
171
|
self.jwt_token = self.jwt_handler.encode_token(
|
151
|
-
self.default_payload,
|
172
|
+
self.default_payload,
|
152
173
|
expires_in=self.token_expires_in
|
153
174
|
)
|
154
175
|
metadata.append(("authorization", f"Bearer {self.jwt_token}"))
|
155
|
-
|
176
|
+
|
156
177
|
return metadata
|
157
|
-
|
178
|
+
|
158
179
|
@abstractmethod
|
159
180
|
def close(self):
|
160
181
|
"""关闭客户端连接(由子类实现)"""
|
161
182
|
pass
|
162
|
-
|
183
|
+
|
163
184
|
@abstractmethod
|
164
185
|
def __enter__(self):
|
165
186
|
"""进入上下文管理器(由子类实现)"""
|
166
187
|
pass
|
167
|
-
|
188
|
+
|
168
189
|
@abstractmethod
|
169
190
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
170
191
|
"""退出上下文管理器(由子类实现)"""
|
171
192
|
pass
|
172
|
-
|
193
|
+
|
173
194
|
def _init_resilient_features(self):
|
174
195
|
"""初始化熔断降级特性"""
|
175
196
|
# 是否启用熔断降级
|
176
197
|
self.resilient_enabled = os.getenv('MODEL_CLIENT_RESILIENT_ENABLED', 'false').lower() == 'true'
|
177
|
-
|
198
|
+
|
178
199
|
if self.resilient_enabled:
|
179
200
|
# HTTP 降级地址
|
180
201
|
self.http_fallback_url = os.getenv('MODEL_CLIENT_HTTP_FALLBACK_URL')
|
181
|
-
|
202
|
+
|
182
203
|
if not self.http_fallback_url:
|
183
204
|
self.logger.warning("🔶 Resilient mode enabled but MODEL_CLIENT_HTTP_FALLBACK_URL not set")
|
184
205
|
self.resilient_enabled = False
|
185
206
|
return
|
186
|
-
|
207
|
+
|
187
208
|
# 初始化熔断器
|
188
209
|
from ..circuit_breaker import CircuitBreaker
|
189
210
|
self.circuit_breaker = CircuitBreaker(
|
190
211
|
failure_threshold=int(os.getenv('MODEL_CLIENT_CIRCUIT_BREAKER_THRESHOLD', '5')),
|
191
212
|
recovery_timeout=int(os.getenv('MODEL_CLIENT_CIRCUIT_BREAKER_TIMEOUT', '60'))
|
192
213
|
)
|
193
|
-
|
214
|
+
|
194
215
|
# HTTP 客户端(延迟初始化)
|
195
216
|
self._http_client = None
|
196
217
|
self._http_session = None # 异步客户端使用
|
197
|
-
|
218
|
+
|
198
219
|
self.logger.info(
|
199
220
|
"🛡️ Resilient mode enabled",
|
200
221
|
extra={
|
201
|
-
"
|
202
|
-
"
|
203
|
-
|
222
|
+
"log_type": "info",
|
223
|
+
"data": {
|
224
|
+
"http_fallback_url": self.http_fallback_url,
|
225
|
+
"circuit_breaker_threshold": self.circuit_breaker.failure_threshold,
|
226
|
+
"circuit_breaker_timeout": self.circuit_breaker.recovery_timeout
|
227
|
+
}
|
204
228
|
}
|
205
229
|
)
|
206
230
|
else:
|
207
231
|
self.circuit_breaker = None
|
208
232
|
self.http_fallback_url = None
|
209
|
-
|
233
|
+
self._http_client = None
|
234
|
+
self._http_session = None
|
235
|
+
|
210
236
|
def get_resilient_metrics(self):
|
211
237
|
"""获取熔断降级指标"""
|
212
238
|
if not self.resilient_enabled or not self.circuit_breaker:
|
213
239
|
return None
|
214
|
-
|
240
|
+
|
215
241
|
return {
|
216
242
|
"enabled": self.resilient_enabled,
|
217
243
|
"circuit_state": self.circuit_breaker.get_state(),
|
218
244
|
"failure_count": self.circuit_breaker.failure_count,
|
219
245
|
"last_failure_time": self.circuit_breaker.last_failure_time,
|
220
246
|
"http_fallback_url": self.http_fallback_url
|
221
|
-
}
|
247
|
+
}
|
248
|
+
|
249
|
+
def _init_fast_fallback_config(self):
|
250
|
+
"""初始化快速降级配置"""
|
251
|
+
import grpc
|
252
|
+
|
253
|
+
# 是否启用快速降级
|
254
|
+
self.fast_fallback_enabled = os.getenv('MODEL_CLIENT_FAST_FALLBACK_ENABLED', 'false').lower() == 'true'
|
255
|
+
|
256
|
+
# 降级前的最大gRPC重试次数
|
257
|
+
self.fallback_after_retries = int(os.getenv('MODEL_CLIENT_FALLBACK_AFTER_RETRIES', '1'))
|
258
|
+
|
259
|
+
# 立即降级的错误码配置
|
260
|
+
immediate_fallback_errors = os.getenv('MODEL_CLIENT_IMMEDIATE_FALLBACK_ERRORS',
|
261
|
+
'UNAVAILABLE,DEADLINE_EXCEEDED,CANCELLED')
|
262
|
+
self.immediate_fallback_errors = set()
|
263
|
+
|
264
|
+
if immediate_fallback_errors:
|
265
|
+
for error_name in immediate_fallback_errors.split(','):
|
266
|
+
error_name = error_name.strip()
|
267
|
+
if hasattr(grpc.StatusCode, error_name):
|
268
|
+
self.immediate_fallback_errors.add(getattr(grpc.StatusCode, error_name))
|
269
|
+
|
270
|
+
# 永不降级的错误码
|
271
|
+
never_fallback_errors = os.getenv('MODEL_CLIENT_NEVER_FALLBACK_ERRORS',
|
272
|
+
'UNAUTHENTICATED,PERMISSION_DENIED,INVALID_ARGUMENT')
|
273
|
+
self.never_fallback_errors = set()
|
274
|
+
|
275
|
+
if never_fallback_errors:
|
276
|
+
for error_name in never_fallback_errors.split(','):
|
277
|
+
error_name = error_name.strip()
|
278
|
+
if hasattr(grpc.StatusCode, error_name):
|
279
|
+
self.never_fallback_errors.add(getattr(grpc.StatusCode, error_name))
|
280
|
+
|
281
|
+
if self.fast_fallback_enabled:
|
282
|
+
self.logger.info(
|
283
|
+
"🚀 Fast fallback enabled",
|
284
|
+
extra={
|
285
|
+
"data": {
|
286
|
+
"fallback_after_retries": self.fallback_after_retries,
|
287
|
+
"immediate_fallback_errors": [e.name for e in self.immediate_fallback_errors],
|
288
|
+
"never_fallback_errors": [e.name for e in self.never_fallback_errors]
|
289
|
+
}
|
290
|
+
}
|
291
|
+
)
|
292
|
+
|
293
|
+
def _should_try_fallback(self, error_code, attempt: int) -> bool:
|
294
|
+
"""
|
295
|
+
判断是否应该尝试降级
|
296
|
+
|
297
|
+
Args:
|
298
|
+
error_code: gRPC错误码
|
299
|
+
attempt: 当前重试次数
|
300
|
+
|
301
|
+
Returns:
|
302
|
+
bool: 是否应该尝试降级
|
303
|
+
"""
|
304
|
+
# 未启用快速降级
|
305
|
+
if not self.fast_fallback_enabled:
|
306
|
+
return False
|
307
|
+
|
308
|
+
# 未启用熔断降级功能
|
309
|
+
if not self.resilient_enabled or not self.http_fallback_url:
|
310
|
+
return False
|
311
|
+
|
312
|
+
# 永不降级的错误类型
|
313
|
+
if error_code in self.never_fallback_errors:
|
314
|
+
return False
|
315
|
+
|
316
|
+
# 立即降级的错误类型
|
317
|
+
if error_code in self.immediate_fallback_errors:
|
318
|
+
return True
|
319
|
+
|
320
|
+
# 其他错误在达到重试次数后降级
|
321
|
+
return attempt >= self.fallback_after_retries
|