tamar-model-client 0.1.24__tar.gz → 0.1.25__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/PKG-INFO +1 -1
  2. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/setup.py +1 -1
  3. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/async_client.py +156 -2
  4. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/sync_client.py +155 -1
  5. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client.egg-info/PKG-INFO +1 -1
  6. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tests/test_google_azure_final.py +3 -3
  7. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/README.md +0 -0
  8. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/setup.cfg +0 -0
  9. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/__init__.py +0 -0
  10. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/auth.py +0 -0
  11. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/circuit_breaker.py +0 -0
  12. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/core/__init__.py +0 -0
  13. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/core/base_client.py +0 -0
  14. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/core/http_fallback.py +0 -0
  15. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/core/logging_setup.py +0 -0
  16. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/core/request_builder.py +0 -0
  17. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/core/response_handler.py +0 -0
  18. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/core/utils.py +0 -0
  19. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/enums/__init__.py +0 -0
  20. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/enums/channel.py +0 -0
  21. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/enums/invoke.py +0 -0
  22. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/enums/providers.py +0 -0
  23. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/error_handler.py +0 -0
  24. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/exceptions.py +0 -0
  25. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/generated/__init__.py +0 -0
  26. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/generated/model_service_pb2.py +0 -0
  27. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/generated/model_service_pb2_grpc.py +0 -0
  28. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/json_formatter.py +0 -0
  29. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/logging_icons.py +0 -0
  30. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/schemas/__init__.py +0 -0
  31. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/schemas/inputs.py +0 -0
  32. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/schemas/outputs.py +0 -0
  33. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client/utils.py +0 -0
  34. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client.egg-info/SOURCES.txt +0 -0
  35. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client.egg-info/dependency_links.txt +0 -0
  36. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client.egg-info/requires.txt +0 -0
  37. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tamar_model_client.egg-info/top_level.txt +0 -0
  38. {tamar_model_client-0.1.24 → tamar_model_client-0.1.25}/tests/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tamar-model-client
3
- Version: 0.1.24
3
+ Version: 0.1.25
4
4
  Summary: A Python SDK for interacting with the Model Manager gRPC service
5
5
  Home-page: http://gitlab.tamaredge.top/project-tap/AgentOS/model-manager-client
6
6
  Author: Oscar Ou
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="tamar-model-client",
5
- version="0.1.24",
5
+ version="0.1.25",
6
6
  description="A Python SDK for interacting with the Model Manager gRPC service",
7
7
  author="Oscar Ou",
8
8
  author_email="oscar.ou@tamaredge.ai",
@@ -98,6 +98,9 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
98
98
  # === gRPC 通道和连接管理 ===
99
99
  self.channel: Optional[grpc.aio.Channel] = None
100
100
  self.stub: Optional[model_service_pb2_grpc.ModelServiceStub] = None
101
+ self._channel_error_count = 0
102
+ self._last_channel_error_time = None
103
+ self._channel_lock = asyncio.Lock() # 异步锁
101
104
 
102
105
  # === 增强的重试处理器 ===
103
106
  self.retry_handler = EnhancedRetryHandler(
@@ -176,9 +179,23 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
176
179
  Raises:
177
180
  ConnectionError: 当达到最大重试次数仍无法连接时
178
181
  """
179
- if self.channel and self.stub:
182
+ if self.channel and self.stub and await self._is_channel_healthy():
180
183
  return
181
184
 
185
+ # 如果 channel 存在但不健康,记录日志
186
+ if self.channel and self.stub:
187
+ logger.warning(
188
+ "Channel exists but unhealthy, will recreate",
189
+ extra={
190
+ "log_type": "channel_recreate",
191
+ "data": {
192
+ "channel_error_count": self._channel_error_count,
193
+ "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else None
194
+ }
195
+ }
196
+ )
197
+ await self._recreate_channel()
198
+
182
199
  retry_count = 0
183
200
  options = self.build_channel_options()
184
201
 
@@ -228,6 +245,111 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
228
245
  await asyncio.sleep(self.retry_delay * retry_count)
229
246
 
230
247
  raise ConnectionError(f"Failed to connect to {self.server_address} after {self.max_retries} retries")
248
+
249
+ async def _is_channel_healthy(self) -> bool:
250
+ """
251
+ 检查 channel 是否健康
252
+
253
+ Returns:
254
+ bool: True 如果 channel 健康,False 如果需要重建
255
+ """
256
+ if not self.channel:
257
+ return False
258
+
259
+ try:
260
+ # 检查 channel 状态
261
+ state = self.channel.get_state()
262
+
263
+ # 如果处于关闭或失败状态,需要重建
264
+ if state in [grpc.ChannelConnectivity.SHUTDOWN,
265
+ grpc.ChannelConnectivity.TRANSIENT_FAILURE]:
266
+ logger.warning(f"Channel in unhealthy state: {state}",
267
+ extra={"log_type": "info",
268
+ "data": {"channel_state": str(state)}})
269
+ return False
270
+
271
+ # 如果最近有多次错误,也需要重建
272
+ if self._channel_error_count > 3 and self._last_channel_error_time:
273
+ if time.time() - self._last_channel_error_time < 60: # 60秒内
274
+ logger.warning("Too many channel errors recently, marking as unhealthy",
275
+ extra={"log_type": "info",
276
+ "data": {"error_count": self._channel_error_count}})
277
+ return False
278
+
279
+ return True
280
+
281
+ except Exception as e:
282
+ logger.error(f"Error checking channel health: {e}",
283
+ extra={"log_type": "info",
284
+ "data": {"error": str(e)}})
285
+ return False
286
+
287
+ async def _recreate_channel(self):
288
+ """
289
+ 重建 gRPC channel
290
+
291
+ 关闭旧的 channel 并创建新的连接
292
+ """
293
+ async with self._channel_lock:
294
+ # 关闭旧 channel
295
+ if self.channel:
296
+ try:
297
+ await self.channel.close()
298
+ logger.info("Closed unhealthy channel",
299
+ extra={"log_type": "info"})
300
+ except Exception as e:
301
+ logger.warning(f"Error closing channel: {e}",
302
+ extra={"log_type": "info"})
303
+
304
+ # 清空引用
305
+ self.channel = None
306
+ self.stub = None
307
+
308
+ # 重置错误计数
309
+ self._channel_error_count = 0
310
+ self._last_channel_error_time = None
311
+
312
+ logger.info("Recreating gRPC channel...",
313
+ extra={"log_type": "info"})
314
+
315
+ def _record_channel_error(self, error: grpc.RpcError):
316
+ """
317
+ 记录 channel 错误,用于健康检查
318
+
319
+ Args:
320
+ error: gRPC 错误
321
+ """
322
+ self._channel_error_count += 1
323
+ self._last_channel_error_time = time.time()
324
+
325
+ # 获取当前 channel 状态
326
+ channel_state = None
327
+ if self.channel:
328
+ try:
329
+ channel_state = self.channel.get_state()
330
+ except:
331
+ channel_state = "UNKNOWN"
332
+
333
+ # 对于严重错误,增加错误权重
334
+ if error.code() in [grpc.StatusCode.INTERNAL,
335
+ grpc.StatusCode.UNAVAILABLE]:
336
+ self._channel_error_count += 2
337
+
338
+ # 记录详细的错误信息
339
+ logger.warning(
340
+ f"Channel error recorded: {error.code().name}",
341
+ extra={
342
+ "log_type": "channel_error",
343
+ "data": {
344
+ "error_code": error.code().name,
345
+ "error_count": self._channel_error_count,
346
+ "channel_state": str(channel_state) if channel_state else "NO_CHANNEL",
347
+ "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else 0,
348
+ "error_details": error.details() if hasattr(error, 'details') else "",
349
+ "debug_string": error.debug_error_string() if hasattr(error, 'debug_error_string') else ""
350
+ }
351
+ }
352
+ )
231
353
 
232
354
  async def _retry_request(self, func, *args, **kwargs):
233
355
  """
@@ -315,7 +437,33 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
315
437
  elif retryable == 'conditional':
316
438
  # 条件重试,特殊处理 CANCELLED
317
439
  if error_code == grpc.StatusCode.CANCELLED:
318
- should_retry = error_context.is_network_cancelled()
440
+ # 获取 channel 状态信息
441
+ channel_state = None
442
+ if self.channel:
443
+ try:
444
+ channel_state = self.channel.get_state()
445
+ except:
446
+ channel_state = "UNKNOWN"
447
+
448
+ is_network_cancelled = error_context.is_network_cancelled()
449
+
450
+ logger.warning(
451
+ f"CANCELLED error in stream, channel state: {channel_state}",
452
+ extra={
453
+ "log_type": "cancelled_debug",
454
+ "request_id": context.get('request_id'),
455
+ "data": {
456
+ "channel_state": str(channel_state) if channel_state else "NO_CHANNEL",
457
+ "channel_error_count": self._channel_error_count,
458
+ "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else None,
459
+ "channel_healthy": await self._is_channel_healthy(),
460
+ "is_network_cancelled": is_network_cancelled,
461
+ "debug_string": e.debug_error_string() if hasattr(e, 'debug_error_string') else ""
462
+ }
463
+ }
464
+ )
465
+
466
+ should_retry = is_network_cancelled
319
467
  else:
320
468
  should_retry = self._check_error_details_for_retry(e)
321
469
  else:
@@ -363,6 +511,8 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
363
511
  )
364
512
  context['duration'] = current_duration
365
513
  last_exception = self.error_handler.handle_error(e, context)
514
+ # 记录 channel 错误
515
+ self._record_channel_error(e)
366
516
  break
367
517
 
368
518
  last_exception = e
@@ -674,6 +824,10 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
674
824
  )
675
825
  })
676
826
 
827
+ # 记录 channel 错误
828
+ if isinstance(e, grpc.RpcError):
829
+ self._record_channel_error(e)
830
+
677
831
  # 记录失败并尝试降级(如果启用了熔断)
678
832
  if self.resilient_enabled and self.circuit_breaker:
679
833
  # 将错误码传递给熔断器,用于智能失败统计
@@ -22,6 +22,7 @@ Tamar Model Client 同步客户端实现
22
22
  import json
23
23
  import logging
24
24
  import random
25
+ import threading
25
26
  import time
26
27
  from typing import Optional, Union, Iterator
27
28
 
@@ -95,6 +96,9 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
95
96
  # === gRPC 通道和连接管理 ===
96
97
  self.channel: Optional[grpc.Channel] = None
97
98
  self.stub: Optional[model_service_pb2_grpc.ModelServiceStub] = None
99
+ self._channel_error_count = 0
100
+ self._last_channel_error_time = None
101
+ self._channel_lock = threading.Lock() # 线程安全的channel操作
98
102
 
99
103
  def close(self):
100
104
  """
@@ -143,8 +147,22 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
143
147
  Raises:
144
148
  ConnectionError: 当达到最大重试次数仍无法连接时
145
149
  """
146
- if self.channel and self.stub:
150
+ if self.channel and self.stub and self._is_channel_healthy():
147
151
  return
152
+
153
+ # 如果 channel 存在但不健康,记录日志
154
+ if self.channel and self.stub:
155
+ logger.warning(
156
+ "Channel exists but unhealthy, will recreate",
157
+ extra={
158
+ "log_type": "channel_recreate",
159
+ "data": {
160
+ "channel_error_count": self._channel_error_count,
161
+ "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else None
162
+ }
163
+ }
164
+ )
165
+ self._recreate_channel()
148
166
 
149
167
  retry_count = 0
150
168
  options = self.build_channel_options()
@@ -196,6 +214,111 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
196
214
  time.sleep(self.retry_delay * retry_count)
197
215
 
198
216
  raise ConnectionError(f"Failed to connect to {self.server_address} after {self.max_retries} retries")
217
+
218
+ def _is_channel_healthy(self) -> bool:
219
+ """
220
+ 检查 channel 是否健康
221
+
222
+ Returns:
223
+ bool: True 如果 channel 健康,False 如果需要重建
224
+ """
225
+ if not self.channel:
226
+ return False
227
+
228
+ try:
229
+ # 检查 channel 状态
230
+ state = self.channel._channel.check_connectivity_state(False)
231
+
232
+ # 如果处于关闭或失败状态,需要重建
233
+ if state in [grpc.ChannelConnectivity.SHUTDOWN,
234
+ grpc.ChannelConnectivity.TRANSIENT_FAILURE]:
235
+ logger.warning(f"Channel in unhealthy state: {state}",
236
+ extra={"log_type": "info",
237
+ "data": {"channel_state": str(state)}})
238
+ return False
239
+
240
+ # 如果最近有多次错误,也需要重建
241
+ if self._channel_error_count > 3 and self._last_channel_error_time:
242
+ if time.time() - self._last_channel_error_time < 60: # 60秒内
243
+ logger.warning("Too many channel errors recently, marking as unhealthy",
244
+ extra={"log_type": "info",
245
+ "data": {"error_count": self._channel_error_count}})
246
+ return False
247
+
248
+ return True
249
+
250
+ except Exception as e:
251
+ logger.error(f"Error checking channel health: {e}",
252
+ extra={"log_type": "info",
253
+ "data": {"error": str(e)}})
254
+ return False
255
+
256
+ def _recreate_channel(self):
257
+ """
258
+ 重建 gRPC channel
259
+
260
+ 关闭旧的 channel 并创建新的连接
261
+ """
262
+ with self._channel_lock:
263
+ # 关闭旧 channel
264
+ if self.channel:
265
+ try:
266
+ self.channel.close()
267
+ logger.info("Closed unhealthy channel",
268
+ extra={"log_type": "info"})
269
+ except Exception as e:
270
+ logger.warning(f"Error closing channel: {e}",
271
+ extra={"log_type": "info"})
272
+
273
+ # 清空引用
274
+ self.channel = None
275
+ self.stub = None
276
+
277
+ # 重置错误计数
278
+ self._channel_error_count = 0
279
+ self._last_channel_error_time = None
280
+
281
+ logger.info("Recreating gRPC channel...",
282
+ extra={"log_type": "info"})
283
+
284
+ def _record_channel_error(self, error: grpc.RpcError):
285
+ """
286
+ 记录 channel 错误,用于健康检查
287
+
288
+ Args:
289
+ error: gRPC 错误
290
+ """
291
+ self._channel_error_count += 1
292
+ self._last_channel_error_time = time.time()
293
+
294
+ # 获取当前 channel 状态
295
+ channel_state = None
296
+ if self.channel:
297
+ try:
298
+ channel_state = self.channel._channel.check_connectivity_state(False)
299
+ except:
300
+ channel_state = "UNKNOWN"
301
+
302
+ # 对于严重错误,增加错误权重
303
+ if error.code() in [grpc.StatusCode.INTERNAL,
304
+ grpc.StatusCode.UNAVAILABLE]:
305
+ self._channel_error_count += 2
306
+
307
+ # 记录详细的错误信息
308
+ logger.warning(
309
+ f"Channel error recorded: {error.code().name}",
310
+ extra={
311
+ "log_type": "channel_error",
312
+ "data": {
313
+ "error_code": error.code().name,
314
+ "error_count": self._channel_error_count,
315
+ "channel_state": str(channel_state) if channel_state else "NO_CHANNEL",
316
+ "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else 0,
317
+ "error_details": error.details() if hasattr(error, 'details') else "",
318
+ "debug_string": error.debug_error_string() if hasattr(error, 'debug_error_string') else ""
319
+ }
320
+ }
321
+ )
199
322
 
200
323
  def _retry_request(self, func, *args, **kwargs):
201
324
  """
@@ -237,6 +360,30 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
237
360
  # 计算当前的耗时
238
361
  current_duration = time.time() - method_start_time
239
362
 
363
+ # 特殊处理 CANCELLED 错误
364
+ if e.code() == grpc.StatusCode.CANCELLED:
365
+ channel_state = None
366
+ if self.channel:
367
+ try:
368
+ channel_state = self.channel._channel.check_connectivity_state(False)
369
+ except:
370
+ channel_state = "UNKNOWN"
371
+
372
+ logger.warning(
373
+ f"CANCELLED error detected, channel state: {channel_state}",
374
+ extra={
375
+ "log_type": "cancelled_debug",
376
+ "request_id": context.get('request_id'),
377
+ "data": {
378
+ "channel_state": str(channel_state) if channel_state else "NO_CHANNEL",
379
+ "channel_error_count": self._channel_error_count,
380
+ "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else None,
381
+ "channel_healthy": self._is_channel_healthy(),
382
+ "debug_string": e.debug_error_string() if hasattr(e, 'debug_error_string') else ""
383
+ }
384
+ }
385
+ )
386
+
240
387
  # 记录重试日志
241
388
  log_data = {
242
389
  "log_type": "info",
@@ -261,6 +408,9 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
261
408
 
262
409
  context['duration'] = current_duration
263
410
  last_exception = self.error_handler.handle_error(e, context)
411
+
412
+ # 记录 channel 错误
413
+ self._record_channel_error(e)
264
414
 
265
415
  except Exception as e:
266
416
  # 非 gRPC 错误,直接包装抛出
@@ -742,6 +892,10 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
742
892
  )
743
893
  })
744
894
 
895
+ # 记录 channel 错误
896
+ if isinstance(e, grpc.RpcError):
897
+ self._record_channel_error(e)
898
+
745
899
  # 记录失败并尝试降级(如果启用了熔断)
746
900
  if self.resilient_enabled and self.circuit_breaker:
747
901
  # 将错误码传递给熔断器,用于智能失败统计
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tamar-model-client
3
- Version: 0.1.24
3
+ Version: 0.1.25
4
4
  Summary: A Python SDK for interacting with the Model Manager gRPC service
5
5
  Home-page: http://gitlab.tamaredge.top/project-tap/AgentOS/model-manager-client
6
6
  Author: Oscar Ou
@@ -27,7 +27,7 @@ test_logger.addHandler(test_handler)
27
27
  logger = test_logger
28
28
 
29
29
  os.environ['MODEL_MANAGER_SERVER_GRPC_USE_TLS'] = "true"
30
- os.environ['MODEL_MANAGER_SERVER_ADDRESS'] = "localhost:50051"
30
+ os.environ['MODEL_MANAGER_SERVER_ADDRESS'] = "model-manager-server-grpc-131786869360.asia-northeast1.run.app"
31
31
  os.environ['MODEL_MANAGER_SERVER_JWT_SECRET_KEY'] = "model-manager-server-jwt-key"
32
32
 
33
33
  # 导入客户端模块
@@ -645,10 +645,10 @@ async def main():
645
645
  # await asyncio.wait_for(test_batch_requests(), timeout=120.0)
646
646
 
647
647
  # 同步并发测试
648
- #test_concurrent_requests(150) # 测试150个并发请求
648
+ test_concurrent_requests(150) # 测试150个并发请求
649
649
 
650
650
  # 异步并发测试
651
- await test_async_concurrent_requests(50) # 测试150个异步并发请求
651
+ await test_async_concurrent_requests(150) # 测试150个异步并发请求
652
652
 
653
653
  print("\n✅ 测试完成")
654
654