tamar-model-client 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tamar_model_client/async_client.py +41 -32
- tamar_model_client/core/base_client.py +8 -7
- tamar_model_client/core/http_fallback.py +75 -14
- tamar_model_client/error_handler.py +8 -6
- tamar_model_client/json_formatter.py +9 -0
- tamar_model_client/sync_client.py +29 -14
- {tamar_model_client-0.1.28.dist-info → tamar_model_client-0.1.30.dist-info}/METADATA +496 -7
- {tamar_model_client-0.1.28.dist-info → tamar_model_client-0.1.30.dist-info}/RECORD +12 -11
- tests/test_circuit_breaker.py +269 -0
- tests/test_google_azure_final.py +589 -5
- {tamar_model_client-0.1.28.dist-info → tamar_model_client-0.1.30.dist-info}/WHEEL +0 -0
- {tamar_model_client-0.1.28.dist-info → tamar_model_client-0.1.30.dist-info}/top_level.txt +0 -0
@@ -103,7 +103,6 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
103
103
|
self.stub: Optional[model_service_pb2_grpc.ModelServiceStub] = None
|
104
104
|
self._channel_error_count = 0
|
105
105
|
self._last_channel_error_time = None
|
106
|
-
self._channel_lock = asyncio.Lock() # 异步锁
|
107
106
|
|
108
107
|
# === Request ID 管理 ===
|
109
108
|
self._request_id_manager = RequestIdManager()
|
@@ -194,7 +193,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
194
193
|
# 如果 channel 存在但不健康,记录日志
|
195
194
|
if self.channel and self.stub:
|
196
195
|
logger.warning(
|
197
|
-
"Channel exists but unhealthy, will recreate",
|
196
|
+
"⚠️ Channel exists but unhealthy, will recreate",
|
198
197
|
extra={
|
199
198
|
"log_type": "channel_recreate",
|
200
199
|
"data": {
|
@@ -222,7 +221,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
222
221
|
"data": {"tls_enabled": True, "server_address": self.server_address}})
|
223
222
|
else:
|
224
223
|
self.channel = grpc.aio.insecure_channel(
|
225
|
-
self.server_address,
|
224
|
+
f"dns:///{self.server_address}",
|
226
225
|
options=options
|
227
226
|
)
|
228
227
|
logger.info("🔓 Using insecure gRPC channel (TLS disabled)",
|
@@ -272,7 +271,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
272
271
|
# 如果处于关闭或失败状态,需要重建
|
273
272
|
if state in [grpc.ChannelConnectivity.SHUTDOWN,
|
274
273
|
grpc.ChannelConnectivity.TRANSIENT_FAILURE]:
|
275
|
-
logger.warning(f"Channel in unhealthy state: {state}",
|
274
|
+
logger.warning(f"⚠️ Channel in unhealthy state: {state}",
|
276
275
|
extra={"log_type": "info",
|
277
276
|
"data": {"channel_state": str(state)}})
|
278
277
|
return False
|
@@ -280,7 +279,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
280
279
|
# 如果最近有多次错误,也需要重建
|
281
280
|
if self._channel_error_count > 3 and self._last_channel_error_time:
|
282
281
|
if time.time() - self._last_channel_error_time < 60: # 60秒内
|
283
|
-
logger.warning("Too many channel errors recently, marking as unhealthy",
|
282
|
+
logger.warning("⚠️ Too many channel errors recently, marking as unhealthy",
|
284
283
|
extra={"log_type": "info",
|
285
284
|
"data": {"error_count": self._channel_error_count}})
|
286
285
|
return False
|
@@ -288,7 +287,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
288
287
|
return True
|
289
288
|
|
290
289
|
except Exception as e:
|
291
|
-
logger.error(f"Error checking channel health: {e}",
|
290
|
+
logger.error(f"❌ Error checking channel health: {e}",
|
292
291
|
extra={"log_type": "info",
|
293
292
|
"data": {"error": str(e)}})
|
294
293
|
return False
|
@@ -299,27 +298,26 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
299
298
|
|
300
299
|
关闭旧的 channel 并创建新的连接
|
301
300
|
"""
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
extra={"log_type": "info"})
|
301
|
+
# 关闭旧 channel
|
302
|
+
if self.channel:
|
303
|
+
try:
|
304
|
+
await self.channel.close()
|
305
|
+
logger.info("🔚 Closed unhealthy channel",
|
306
|
+
extra={"log_type": "info"})
|
307
|
+
except Exception as e:
|
308
|
+
logger.warning(f"⚠️ Error closing channel: {e}",
|
309
|
+
extra={"log_type": "info"})
|
310
|
+
|
311
|
+
# 清空引用
|
312
|
+
self.channel = None
|
313
|
+
self.stub = None
|
314
|
+
|
315
|
+
# 重置错误计数
|
316
|
+
self._channel_error_count = 0
|
317
|
+
self._last_channel_error_time = None
|
318
|
+
|
319
|
+
logger.info("🔄 Recreating gRPC channel...",
|
320
|
+
extra={"log_type": "info"})
|
323
321
|
|
324
322
|
def _record_channel_error(self, error: grpc.RpcError):
|
325
323
|
"""
|
@@ -346,7 +344,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
346
344
|
|
347
345
|
# 记录详细的错误信息
|
348
346
|
logger.warning(
|
349
|
-
f"Channel error recorded: {error.code().name}",
|
347
|
+
f"⚠️ Channel error recorded: {error.code().name}",
|
350
348
|
extra={
|
351
349
|
"log_type": "channel_error",
|
352
350
|
"data": {
|
@@ -457,7 +455,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
457
455
|
is_network_cancelled = error_context.is_network_cancelled()
|
458
456
|
|
459
457
|
logger.warning(
|
460
|
-
f"CANCELLED error in stream, channel state: {channel_state}",
|
458
|
+
f"⚠️ CANCELLED error in stream, channel state: {channel_state}",
|
461
459
|
extra={
|
462
460
|
"log_type": "cancelled_debug",
|
463
461
|
"request_id": context.get('request_id'),
|
@@ -485,14 +483,16 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
485
483
|
"request_id": context.get('request_id'),
|
486
484
|
"data": {
|
487
485
|
"error_code": e.code().name if e.code() else 'UNKNOWN',
|
486
|
+
"error_details": e.details() if hasattr(e, 'details') else '',
|
488
487
|
"retry_count": attempt,
|
489
488
|
"max_retries": self.max_retries,
|
490
489
|
"method": "stream"
|
491
490
|
},
|
492
491
|
"duration": current_duration
|
493
492
|
}
|
493
|
+
error_detail = f" - {e.details()}" if e.details() else ""
|
494
494
|
logger.warning(
|
495
|
-
f"
|
495
|
+
f"🔄 Attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()}{error_detail} (will retry)",
|
496
496
|
extra=log_data
|
497
497
|
)
|
498
498
|
|
@@ -507,6 +507,7 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
507
507
|
"request_id": context.get('request_id'),
|
508
508
|
"data": {
|
509
509
|
"error_code": e.code().name if e.code() else 'UNKNOWN',
|
510
|
+
"error_details": e.details() if hasattr(e, 'details') else '',
|
510
511
|
"retry_count": attempt,
|
511
512
|
"max_retries": self.max_retries,
|
512
513
|
"method": "stream",
|
@@ -514,8 +515,9 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
514
515
|
},
|
515
516
|
"duration": current_duration
|
516
517
|
}
|
517
|
-
|
518
|
-
|
518
|
+
error_detail = f" - {e.details()}" if e.details() else ""
|
519
|
+
logger.warning(
|
520
|
+
f"⚠️ Attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()}{error_detail} (no more retries)",
|
519
521
|
extra=log_data
|
520
522
|
)
|
521
523
|
context['duration'] = current_duration
|
@@ -1033,6 +1035,13 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
|
|
1033
1035
|
"batch_size": len(batch_request_model.items)
|
1034
1036
|
}
|
1035
1037
|
})
|
1038
|
+
|
1039
|
+
# 记录失败(如果启用了熔断)
|
1040
|
+
if self.resilient_enabled and self.circuit_breaker:
|
1041
|
+
# 将错误码传递给熔断器,用于智能失败统计
|
1042
|
+
error_code = e.code() if hasattr(e, 'code') else None
|
1043
|
+
self.circuit_breaker.record_failure(error_code)
|
1044
|
+
|
1036
1045
|
raise e
|
1037
1046
|
except Exception as e:
|
1038
1047
|
duration = time.time() - start_time
|
@@ -136,10 +136,7 @@ class BaseClient(ABC):
|
|
136
136
|
('grpc.resource_quota_size', 1048576000), # 设置资源配额为1GB
|
137
137
|
|
138
138
|
# 启用负载均衡配置
|
139
|
-
('grpc.
|
140
|
-
|
141
|
-
# 启用详细的日志记录
|
142
|
-
('grpc.debug', 1), # 启用 gRPC 的调试日志,记录更多的连接和请求信息
|
139
|
+
('grpc.lb_policy_name', 'round_robin'), # 设置负载均衡策略为 round_robin(轮询)
|
143
140
|
]
|
144
141
|
|
145
142
|
if self.default_authority:
|
@@ -240,9 +237,13 @@ class BaseClient(ABC):
|
|
240
237
|
|
241
238
|
return {
|
242
239
|
"enabled": self.resilient_enabled,
|
243
|
-
"
|
244
|
-
|
245
|
-
|
240
|
+
"circuit_breaker": {
|
241
|
+
"state": self.circuit_breaker.get_state(),
|
242
|
+
"failure_count": self.circuit_breaker.failure_count,
|
243
|
+
"last_failure_time": self.circuit_breaker.last_failure_time,
|
244
|
+
"failure_threshold": self.circuit_breaker.failure_threshold,
|
245
|
+
"recovery_timeout": self.circuit_breaker.recovery_timeout
|
246
|
+
},
|
246
247
|
"http_fallback_url": self.http_fallback_url
|
247
248
|
}
|
248
249
|
|
@@ -15,6 +15,59 @@ from ..schemas import ModelRequest, ModelResponse
|
|
15
15
|
logger = get_protected_logger(__name__)
|
16
16
|
|
17
17
|
|
18
|
+
def safe_serialize(obj: Any) -> Any:
    """
    Recursively convert ``obj`` into plain data that ``json`` can encode.

    Written to avoid Pydantic ``ValidatorIterator`` serialization problems
    when a request model is turned into an HTTP fallback payload.

    Conversion rules, applied in this order:

    * ``None``, ``str``, ``int``, ``float`` and ``bool`` are returned as-is.
    * ``list`` / ``tuple`` / ``set`` / ``frozenset`` become a ``list`` whose
      items are serialized recursively.
    * ``dict`` values are serialized recursively (keys are kept untouched).
    * Objects exposing ``model_dump`` (Pydantic v2 style) or ``dict``
      (Pydantic v1 style) are dumped with ``exclude_unset=True``; a dict dump
      is then serialized recursively so nested enums, tuples or lazy
      iterators do not leak into the payload.
    * Objects exposing ``value`` (enum style) are replaced by that value.
    * Iterators and generators are consumed into a ``list``.
    * Anything else is converted with ``str``; ``None`` if even that fails.

    Args:
        obj: Any value that has to end up inside a JSON payload.

    Returns:
        A structure made of dicts, lists and primitive values (or a string
        representation as the last resort).
    """
    # Primitives (and None) are already JSON-friendly.
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj

    # Sequences and sets: json has no set/tuple type, so normalise to a list.
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [safe_serialize(item) for item in obj]

    if isinstance(obj, dict):
        return {key: safe_serialize(value) for key, value in obj.items()}

    # Pydantic v2 style models.
    if hasattr(obj, 'model_dump'):
        try:
            dumped = obj.model_dump(exclude_unset=True)
        except Exception:
            # model_dump failed: best-effort manual extraction of the public,
            # non-callable attributes. Any failure here falls through to the
            # remaining strategies below.
            try:
                if hasattr(obj, '__dict__'):
                    return {k: safe_serialize(v) for k, v in obj.__dict__.items()
                            if not k.startswith('_') and not callable(v)}
                elif hasattr(obj, '__slots__'):
                    return {slot: safe_serialize(getattr(obj, slot, None))
                            for slot in obj.__slots__ if hasattr(obj, slot)}
            except Exception:
                pass
        else:
            # Only recurse into real dict dumps; anything else is handed back
            # unchanged so duck-typed objects cannot trigger endless recursion.
            return safe_serialize(dumped) if isinstance(dumped, dict) else dumped

    # Pydantic v1 style models.
    if hasattr(obj, 'dict'):
        try:
            dumped = obj.dict(exclude_unset=True)
        except Exception:
            pass  # deliberate best-effort: try the next strategy
        else:
            return safe_serialize(dumped) if isinstance(dumped, dict) else dumped

    # Enum-like objects.
    if hasattr(obj, 'value'):
        return obj.value

    # Lazy iterators / generators: materialise them instead of sending
    # something like "<generator object ...>" to the server.
    if hasattr(obj, '__next__'):
        try:
            return [safe_serialize(item) for item in obj]
        except Exception:
            pass  # fall back to the string representation

    # Last resort: string representation.
    try:
        return str(obj)
    except Exception:
        return None
|
69
|
+
|
70
|
+
|
18
71
|
class HttpFallbackMixin:
|
19
72
|
"""HTTP fallback functionality for synchronous clients
|
20
73
|
|
@@ -43,30 +96,37 @@ class HttpFallbackMixin:
|
|
43
96
|
|
44
97
|
def _convert_to_http_format(self, model_request: ModelRequest) -> Dict[str, Any]:
    """Build the HTTP fallback payload for ``model_request``.

    Every value goes through ``safe_serialize`` so that the payload avoids
    Pydantic ValidatorIterator serialization issues.

    Args:
        model_request: The request to translate.

    Returns:
        A dict holding ``provider``, ``model``, ``user_context`` and
        ``stream``, plus ``messages`` / ``contents`` / ``channel`` /
        ``invoke_type`` / ``config`` when they are truthy, plus any
        ``model_extra`` entries whose key is not already present.
    """
    # Core fields are always sent.
    body: Dict[str, Any] = {
        field: safe_serialize(getattr(model_request, field))
        for field in ("provider", "model", "user_context", "stream")
    }

    # Provider-specific content; the attribute may not exist on the request.
    for field in ("messages", "contents"):
        content = getattr(model_request, field, None)
        if content:
            body[field] = safe_serialize(content)

    # Optional routing fields; read directly, so a missing attribute raises.
    for field in ("channel", "invoke_type"):
        hint = getattr(model_request, field)
        if hint:
            body[field] = safe_serialize(hint)

    # Config parameters, when the request carries any.
    config = getattr(model_request, "config", None)
    if config:
        body["config"] = safe_serialize(config)

    # Extra parameters are flattened into the payload without overriding
    # keys that were set above.
    extra = getattr(model_request, "model_extra", None)
    if extra:
        flattened = safe_serialize(extra)
        if isinstance(flattened, dict):
            for key, value in flattened.items():
                body.setdefault(key, value)

    return body
|
72
132
|
|
@@ -96,7 +156,7 @@ class HttpFallbackMixin:
|
|
96
156
|
data = json.loads(data_str)
|
97
157
|
yield ModelResponse(**data)
|
98
158
|
except json.JSONDecodeError:
|
99
|
-
logger.warning(f"Failed to parse streaming response: {data_str}")
|
159
|
+
logger.warning(f"⚠️ Failed to parse streaming response: {data_str}")
|
100
160
|
|
101
161
|
def _invoke_http_fallback(self, model_request: ModelRequest,
|
102
162
|
timeout: Optional[float] = None,
|
@@ -305,7 +365,7 @@ class AsyncHttpFallbackMixin:
|
|
305
365
|
data = json.loads(data_str)
|
306
366
|
yield ModelResponse(**data)
|
307
367
|
except json.JSONDecodeError:
|
308
|
-
logger.warning(f"Failed to parse streaming response: {data_str}")
|
368
|
+
logger.warning(f"⚠️ Failed to parse streaming response: {data_str}")
|
309
369
|
|
310
370
|
async def _invoke_http_fallback(self, model_request: ModelRequest,
|
311
371
|
timeout: Optional[float] = None,
|
@@ -339,6 +399,7 @@ class AsyncHttpFallbackMixin:
|
|
339
399
|
|
340
400
|
# Convert to HTTP format
|
341
401
|
http_payload = self._convert_to_http_format(model_request)
|
402
|
+
print(http_payload)
|
342
403
|
|
343
404
|
# Construct URL
|
344
405
|
url = f"{self.http_fallback_url}/v1/invoke"
|
@@ -67,7 +67,7 @@ class GrpcErrorHandler:
|
|
67
67
|
log_data['duration'] = context['duration']
|
68
68
|
|
69
69
|
self.logger.error(
|
70
|
-
f"gRPC Error occurred: {error_context.error_code.name if error_context.error_code else 'UNKNOWN'}",
|
70
|
+
f"❌ gRPC Error occurred: {error_context.error_code.name if error_context.error_code else 'UNKNOWN'}",
|
71
71
|
extra=log_data
|
72
72
|
)
|
73
73
|
|
@@ -151,14 +151,14 @@ class ErrorRecoveryStrategy:
|
|
151
151
|
|
152
152
|
async def handle_token_refresh(self, error_context: ErrorContext):
    """Handle a token refresh.

    Logs the attempt and awaits the client's ``_refresh_jwt_token`` hook.
    Clients that do not provide the hook are left untouched.
    """
    client = self.client
    client.logger.info("🔄 Attempting to refresh JWT token")

    # The client is expected to implement _refresh_jwt_token itself.
    if not hasattr(client, '_refresh_jwt_token'):
        return
    await client._refresh_jwt_token()
|
158
158
|
|
159
159
|
async def handle_reconnect(self, error_context: ErrorContext):
|
160
160
|
"""处理重连"""
|
161
|
-
self.client.logger.info("Attempting to reconnect channel")
|
161
|
+
self.client.logger.info("🔄 Attempting to reconnect channel")
|
162
162
|
# 这里需要客户端实现 _reconnect_channel 方法
|
163
163
|
if hasattr(self.client, '_reconnect_channel'):
|
164
164
|
await self.client._reconnect_channel()
|
@@ -170,7 +170,7 @@ class ErrorRecoveryStrategy:
|
|
170
170
|
|
171
171
|
async def handle_circuit_break(self, error_context: ErrorContext):
    """Handle a circuit break.

    Currently this only emits a warning through the client's logger.
    """
    # Placeholder hook: circuit-breaking logic could be implemented here.
    warn = self.client.logger.warning
    warn("⚠️ Circuit breaker activated")
|
176
176
|
|
@@ -322,8 +322,9 @@ class EnhancedRetryHandler:
|
|
322
322
|
},
|
323
323
|
"duration": current_duration
|
324
324
|
}
|
325
|
+
error_detail = f" - {error_context.error_message}" if error_context.error_message else ""
|
325
326
|
logger.warning(
|
326
|
-
f"
|
327
|
+
f"⚠️ Attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()}{error_detail} (no more retries)",
|
327
328
|
extra=log_data
|
328
329
|
)
|
329
330
|
last_exception = self.error_handler.handle_error(e, context)
|
@@ -346,8 +347,9 @@ class EnhancedRetryHandler:
|
|
346
347
|
},
|
347
348
|
"duration": current_duration
|
348
349
|
}
|
350
|
+
error_detail = f" - {error_context.error_message}" if error_context.error_message else ""
|
349
351
|
logger.warning(
|
350
|
-
f"Attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()} (will retry)",
|
352
|
+
f"🔄 Attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()}{error_detail} (will retry)",
|
351
353
|
extra=log_data
|
352
354
|
)
|
353
355
|
|
@@ -57,5 +57,14 @@ class JSONFormatter(logging.Formatter):
|
|
57
57
|
if hasattr(record, "trace"):
|
58
58
|
log_data["trace"] = getattr(record, "trace")
|
59
59
|
|
60
|
+
# 添加异常信息(如果有的话)
|
61
|
+
if record.exc_info:
|
62
|
+
import traceback
|
63
|
+
log_data["exception"] = {
|
64
|
+
"type": record.exc_info[0].__name__ if record.exc_info[0] else None,
|
65
|
+
"message": str(record.exc_info[1]) if record.exc_info[1] else None,
|
66
|
+
"traceback": traceback.format_exception(*record.exc_info)
|
67
|
+
}
|
68
|
+
|
60
69
|
# 使用安全的 JSON 编码器
|
61
70
|
return json.dumps(log_data, ensure_ascii=False, cls=SafeJSONEncoder)
|
@@ -159,7 +159,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
159
159
|
# 如果 channel 存在但不健康,记录日志
|
160
160
|
if self.channel and self.stub:
|
161
161
|
logger.warning(
|
162
|
-
"Channel exists but unhealthy, will recreate",
|
162
|
+
"⚠️ Channel exists but unhealthy, will recreate",
|
163
163
|
extra={
|
164
164
|
"log_type": "channel_recreate",
|
165
165
|
"data": {
|
@@ -187,7 +187,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
187
187
|
"data": {"tls_enabled": True, "server_address": self.server_address}})
|
188
188
|
else:
|
189
189
|
self.channel = grpc.insecure_channel(
|
190
|
-
self.server_address,
|
190
|
+
f"dns:///{self.server_address}",
|
191
191
|
options=options
|
192
192
|
)
|
193
193
|
logger.info("🔓 Using insecure gRPC channel (TLS disabled)",
|
@@ -238,7 +238,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
238
238
|
# 如果处于关闭或失败状态,需要重建
|
239
239
|
if state in [grpc.ChannelConnectivity.SHUTDOWN,
|
240
240
|
grpc.ChannelConnectivity.TRANSIENT_FAILURE]:
|
241
|
-
logger.warning(f"Channel in unhealthy state: {state}",
|
241
|
+
logger.warning(f"⚠️ Channel in unhealthy state: {state}",
|
242
242
|
extra={"log_type": "info",
|
243
243
|
"data": {"channel_state": str(state)}})
|
244
244
|
return False
|
@@ -246,7 +246,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
246
246
|
# 如果最近有多次错误,也需要重建
|
247
247
|
if self._channel_error_count > 3 and self._last_channel_error_time:
|
248
248
|
if time.time() - self._last_channel_error_time < 60: # 60秒内
|
249
|
-
logger.warning("Too many channel errors recently, marking as unhealthy",
|
249
|
+
logger.warning("⚠️ Too many channel errors recently, marking as unhealthy",
|
250
250
|
extra={"log_type": "info",
|
251
251
|
"data": {"error_count": self._channel_error_count}})
|
252
252
|
return False
|
@@ -254,7 +254,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
254
254
|
return True
|
255
255
|
|
256
256
|
except Exception as e:
|
257
|
-
logger.error(f"Error checking channel health: {e}",
|
257
|
+
logger.error(f"❌ Error checking channel health: {e}",
|
258
258
|
extra={"log_type": "info",
|
259
259
|
"data": {"error": str(e)}})
|
260
260
|
return False
|
@@ -270,10 +270,10 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
270
270
|
if self.channel:
|
271
271
|
try:
|
272
272
|
self.channel.close()
|
273
|
-
logger.info("Closed unhealthy channel",
|
273
|
+
logger.info("🔚 Closed unhealthy channel",
|
274
274
|
extra={"log_type": "info"})
|
275
275
|
except Exception as e:
|
276
|
-
logger.warning(f"Error closing channel: {e}",
|
276
|
+
logger.warning(f"⚠️ Error closing channel: {e}",
|
277
277
|
extra={"log_type": "info"})
|
278
278
|
|
279
279
|
# 清空引用
|
@@ -284,7 +284,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
284
284
|
self._channel_error_count = 0
|
285
285
|
self._last_channel_error_time = None
|
286
286
|
|
287
|
-
logger.info("Recreating gRPC channel...",
|
287
|
+
logger.info("🔄 Recreating gRPC channel...",
|
288
288
|
extra={"log_type": "info"})
|
289
289
|
|
290
290
|
def _record_channel_error(self, error: grpc.RpcError):
|
@@ -312,7 +312,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
312
312
|
|
313
313
|
# 记录详细的错误信息
|
314
314
|
logger.warning(
|
315
|
-
f"Channel error recorded: {error.code().name}",
|
315
|
+
f"⚠️ Channel error recorded: {error.code().name}",
|
316
316
|
extra={
|
317
317
|
"log_type": "channel_error",
|
318
318
|
"data": {
|
@@ -371,7 +371,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
371
371
|
channel_state = "UNKNOWN"
|
372
372
|
|
373
373
|
logger.warning(
|
374
|
-
f"CANCELLED error detected, channel state: {channel_state}",
|
374
|
+
f"⚠️ CANCELLED error detected, channel state: {channel_state}",
|
375
375
|
extra={
|
376
376
|
"log_type": "cancelled_debug",
|
377
377
|
"request_id": context.get('request_id'),
|
@@ -469,6 +469,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
469
469
|
"request_id": context.get('request_id'),
|
470
470
|
"data": {
|
471
471
|
"error_code": e.code().name if e.code() else 'UNKNOWN',
|
472
|
+
"error_details": e.details() if hasattr(e, 'details') else '',
|
472
473
|
"retry_count": attempt,
|
473
474
|
"max_retries": self.max_retries,
|
474
475
|
"method": context.get('method', 'unknown'),
|
@@ -476,8 +477,9 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
476
477
|
},
|
477
478
|
"duration": current_duration
|
478
479
|
}
|
480
|
+
error_detail = f" - {e.details()}" if e.details() else ""
|
479
481
|
logger.warning(
|
480
|
-
f"Final attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()} (no more retries)",
|
482
|
+
f"⚠️ Final attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()}{error_detail} (no more retries)",
|
481
483
|
extra=log_data
|
482
484
|
)
|
483
485
|
|
@@ -490,6 +492,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
490
492
|
"request_id": context.get('request_id'),
|
491
493
|
"data": {
|
492
494
|
"error_code": e.code().name if e.code() else 'UNKNOWN',
|
495
|
+
"error_details": e.details() if hasattr(e, 'details') else '',
|
493
496
|
"retry_count": attempt,
|
494
497
|
"max_retries": self.max_retries,
|
495
498
|
"method": context.get('method', 'unknown'),
|
@@ -498,8 +501,9 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
498
501
|
},
|
499
502
|
"duration": current_duration
|
500
503
|
}
|
504
|
+
error_detail = f" - {e.details()}" if e.details() else ""
|
501
505
|
logger.warning(
|
502
|
-
f"Attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()} (will retry)",
|
506
|
+
f"🔄 Attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()}{error_detail} (will retry)",
|
503
507
|
extra=log_data
|
504
508
|
)
|
505
509
|
|
@@ -683,6 +687,7 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
683
687
|
"request_id": context.get('request_id'),
|
684
688
|
"data": {
|
685
689
|
"error_code": e.code().name if e.code() else 'UNKNOWN',
|
690
|
+
"error_details": e.details() if hasattr(e, 'details') else '',
|
686
691
|
"retry_count": attempt,
|
687
692
|
"max_retries": self.max_retries,
|
688
693
|
"method": "stream",
|
@@ -690,8 +695,9 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
690
695
|
},
|
691
696
|
"duration": current_duration
|
692
697
|
}
|
698
|
+
error_detail = f" - {e.details()}" if e.details() else ""
|
693
699
|
logger.error(
|
694
|
-
f"Stream failed: {e.code()} (no retry)",
|
700
|
+
f"❌ Stream failed: {e.code()}{error_detail} (no retry)",
|
695
701
|
extra=log_data
|
696
702
|
)
|
697
703
|
context['duration'] = current_duration
|
@@ -704,14 +710,16 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
704
710
|
"request_id": context.get('request_id'),
|
705
711
|
"data": {
|
706
712
|
"error_code": e.code().name if e.code() else 'UNKNOWN',
|
713
|
+
"error_details": e.details() if hasattr(e, 'details') else '',
|
707
714
|
"retry_count": attempt,
|
708
715
|
"max_retries": self.max_retries,
|
709
716
|
"method": "stream"
|
710
717
|
},
|
711
718
|
"duration": current_duration
|
712
719
|
}
|
720
|
+
error_detail = f" - {e.details()}" if e.details() else ""
|
713
721
|
logger.warning(
|
714
|
-
f"Stream attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()} (will retry)",
|
722
|
+
f"🔄 Stream attempt {attempt + 1}/{self.max_retries + 1} failed: {e.code()}{error_detail} (will retry)",
|
715
723
|
extra=log_data
|
716
724
|
)
|
717
725
|
|
@@ -1191,6 +1199,13 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
|
|
1191
1199
|
"batch_size": len(batch_request_model.items)
|
1192
1200
|
}
|
1193
1201
|
})
|
1202
|
+
|
1203
|
+
# 记录失败(如果启用了熔断)
|
1204
|
+
if self.resilient_enabled and self.circuit_breaker:
|
1205
|
+
# 将错误码传递给熔断器,用于智能失败统计
|
1206
|
+
error_code = e.code() if hasattr(e, 'code') else None
|
1207
|
+
self.circuit_breaker.record_failure(error_code)
|
1208
|
+
|
1194
1209
|
raise e
|
1195
1210
|
except Exception as e:
|
1196
1211
|
duration = time.time() - start_time
|