PyPI - tamar-model-client - Versions diffs - 0.1.24__tar.gz → 0.1.26__tar.gz - Mend

tamar-model-client 0.1.24.tar.gz → 0.1.26.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

{tamar_model_client-0.1.24 → tamar_model_client-0.1.26}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tamar-model-client
-Version: 0.1.24
+Version: 0.1.26
 Summary: A Python SDK for interacting with the Model Manager gRPC service
 Home-page: http://gitlab.tamaredge.top/project-tap/AgentOS/model-manager-client
 Author: Oscar Ou

{tamar_model_client-0.1.24 → tamar_model_client-0.1.26}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name="tamar-model-client",
-    version="0.1.24",
+    version="0.1.26",
     description="A Python SDK for interacting with the Model Manager gRPC service",
     author="Oscar Ou",
     author_email="oscar.ou@tamaredge.ai",

{tamar_model_client-0.1.24 → tamar_model_client-0.1.26}/tamar_model_client/async_client.py RENAMED Viewed

@@ -98,6 +98,9 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
         # === gRPC 通道和连接管理 ===
         self.channel: Optional[grpc.aio.Channel] = None
         self.stub: Optional[model_service_pb2_grpc.ModelServiceStub] = None
+        self._channel_error_count = 0
+        self._last_channel_error_time = None
+        self._channel_lock = asyncio.Lock()  # 异步锁
         # === 增强的重试处理器 ===
         self.retry_handler = EnhancedRetryHandler(
@@ -176,9 +179,23 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
         Raises:
             ConnectionError: 当达到最大重试次数仍无法连接时
         """
-        if self.channel and self.stub:
+        if self.channel and self.stub and await self._is_channel_healthy():
             return
+        # 如果 channel 存在但不健康，记录日志
+        if self.channel and self.stub:
+            logger.warning(
+                "Channel exists but unhealthy, will recreate",
+                extra={
+                    "log_type": "channel_recreate",
+                    "data": {
+                        "channel_error_count": self._channel_error_count,
+                        "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else None
+                    }
+                }
+            )
+            await self._recreate_channel()
         retry_count = 0
         options = self.build_channel_options()
@@ -228,6 +245,111 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
                 await asyncio.sleep(self.retry_delay * retry_count)
         raise ConnectionError(f"Failed to connect to {self.server_address} after {self.max_retries} retries")
+    async def _is_channel_healthy(self) -> bool:
+        """
+        Decide whether the current gRPC channel can still be used.
+        The channel is reported unhealthy when it is missing, when its
+        connectivity state is SHUTDOWN or TRANSIENT_FAILURE, when the error
+        counter exceeds 3 and the latest recorded error is less than 60
+        seconds old, or when reading the state raises an exception.
+        Returns:
+            bool: True if the channel is usable, False if it should be rebuilt
+        """
+        if not self.channel:
+            return False
+        try:
+            connectivity = self.channel.get_state()
+            # A shut-down or failing channel has to be rebuilt
+            if connectivity in (grpc.ChannelConnectivity.SHUTDOWN,
+                                grpc.ChannelConnectivity.TRANSIENT_FAILURE):
+                state_extra = {"log_type": "info",
+                               "data": {"channel_state": str(connectivity)}}
+                logger.warning(f"Channel in unhealthy state: {connectivity}",
+                               extra=state_extra)
+                return False
+            # Several errors within the last 60 seconds also force a rebuild
+            many_errors = self._channel_error_count > 3 and self._last_channel_error_time
+            if many_errors and time.time() - self._last_channel_error_time < 60:
+                burst_extra = {"log_type": "info",
+                               "data": {"error_count": self._channel_error_count}}
+                logger.warning("Too many channel errors recently, marking as unhealthy",
+                               extra=burst_extra)
+                return False
+        except Exception as exc:
+            # Any failure while inspecting the channel is treated as unhealthy
+            logger.error(f"Error checking channel health: {exc}",
+                         extra={"log_type": "info",
+                                "data": {"error": str(exc)}})
+            return False
+        return True
+    async def _recreate_channel(self):
+        """
+        Tear down the current gRPC channel and reset its error bookkeeping.
+        While holding the channel lock this closes the existing channel (a
+        failure to close is logged and otherwise ignored), clears the channel
+        and stub references, and resets the error counter and the last-error
+        timestamp. No new channel is opened inside this method.
+        """
+        async with self._channel_lock:
+            stale_channel = self.channel
+            if stale_channel:
+                # Best-effort close: a failure here must not block the rebuild
+                try:
+                    await stale_channel.close()
+                    logger.info("Closed unhealthy channel",
+                                extra={"log_type": "info"})
+                except Exception as exc:
+                    logger.warning(f"Error closing channel: {exc}",
+                                   extra={"log_type": "info"})
+            # Drop the references
+            self.channel = self.stub = None
+            # Start the error statistics from scratch
+            self._channel_error_count = 0
+            self._last_channel_error_time = None
+            logger.info("Recreating gRPC channel...",
+                        extra={"log_type": "info"})
+    def _record_channel_error(self, error: grpc.RpcError):
+        """
+        Record a gRPC error for the channel health check and log it.
+        Every error adds 1 to the error counter; INTERNAL and UNAVAILABLE
+        add 2 more (3 in total). The last-error timestamp is set to the
+        current time.
+        Args:
+            error: the gRPC error to record
+        """
+        now = time.time()
+        # Remember the previous timestamp before overwriting it; otherwise the
+        # logged "time_since_last_error" is always ~0.
+        previous_error_time = self._last_channel_error_time
+        # code() is guarded the same way details()/debug_error_string() are
+        # below, so an error object without it cannot raise from this path.
+        error_code = error.code() if hasattr(error, 'code') else None
+        error_code_name = error_code.name if error_code is not None else "UNKNOWN"
+        # Severe errors weigh more in the health check
+        is_severe = error_code in (grpc.StatusCode.INTERNAL,
+                                   grpc.StatusCode.UNAVAILABLE)
+        self._channel_error_count += 3 if is_severe else 1
+        self._last_channel_error_time = now
+        # Read the current channel state for the log entry only
+        channel_state = None
+        if self.channel:
+            try:
+                channel_state = self.channel.get_state()
+            except Exception:
+                # Was a bare except, which also swallowed KeyboardInterrupt/SystemExit
+                channel_state = "UNKNOWN"
+        # Log the details of the error
+        logger.warning(
+            f"Channel error recorded: {error_code_name}",
+            extra={
+                "log_type": "channel_error",
+                "data": {
+                    "error_code": error_code_name,
+                    "error_count": self._channel_error_count,
+                    "channel_state": str(channel_state) if channel_state is not None else "NO_CHANNEL",
+                    # None when this is the first recorded error
+                    "time_since_last_error": now - previous_error_time if previous_error_time is not None else None,
+                    "error_details": error.details() if hasattr(error, 'details') else "",
+                    "debug_string": error.debug_error_string() if hasattr(error, 'debug_error_string') else ""
+                }
+            }
+        )
     async def _retry_request(self, func, *args, **kwargs):
         """
@@ -315,7 +437,33 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
                     elif retryable == 'conditional':
                         # 条件重试，特殊处理 CANCELLED
                         if error_code == grpc.StatusCode.CANCELLED:
-                            should_retry = error_context.is_network_cancelled()
+                            # 获取 channel 状态信息
+                            channel_state = None
+                            if self.channel:
+                                try:
+                                    channel_state = self.channel.get_state()
+                                except:
+                                    channel_state = "UNKNOWN"
+                            is_network_cancelled = error_context.is_network_cancelled()
+                            logger.warning(
+                                f"CANCELLED error in stream, channel state: {channel_state}",
+                                extra={
+                                    "log_type": "cancelled_debug",
+                                    "request_id": context.get('request_id'),
+                                    "data": {
+                                        "channel_state": str(channel_state) if channel_state else "NO_CHANNEL",
+                                        "channel_error_count": self._channel_error_count,
+                                        "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else None,
+                                        "channel_healthy": await self._is_channel_healthy(),
+                                        "is_network_cancelled": is_network_cancelled,
+                                        "debug_string": e.debug_error_string() if hasattr(e, 'debug_error_string') else ""
+                                    }
+                                }
+                            )
+                            should_retry = is_network_cancelled
                         else:
                             should_retry = self._check_error_details_for_retry(e)
                     else:
@@ -363,6 +511,8 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
                     )
                     context['duration'] = current_duration
                     last_exception = self.error_handler.handle_error(e, context)
+                    # 记录 channel 错误
+                    self._record_channel_error(e)
                     break
                 last_exception = e
@@ -674,6 +824,10 @@ class AsyncTamarModelClient(BaseClient, AsyncHttpFallbackMixin):
                              )
                          })
+            # 记录 channel 错误
+            if isinstance(e, grpc.RpcError):
+                self._record_channel_error(e)
             # 记录失败并尝试降级（如果启用了熔断）
             if self.resilient_enabled and self.circuit_breaker:
                 # 将错误码传递给熔断器，用于智能失败统计

{tamar_model_client-0.1.24 → tamar_model_client-0.1.26}/tamar_model_client/core/base_client.py RENAMED Viewed

@@ -74,7 +74,7 @@ class BaseClient(ABC):
         # === 重试配置 ===
         self.max_retries = max_retries if max_retries is not None else int(
-            os.getenv("MODEL_MANAGER_SERVER_GRPC_MAX_RETRIES", 3))
+            os.getenv("MODEL_MANAGER_SERVER_GRPC_MAX_RETRIES", 6))
         self.retry_delay = retry_delay if retry_delay is not None else float(
             os.getenv("MODEL_MANAGER_SERVER_GRPC_RETRY_DELAY", 1.0))

{tamar_model_client-0.1.24 → tamar_model_client-0.1.26}/tamar_model_client/exceptions.py RENAMED Viewed

@@ -77,7 +77,7 @@ RETRY_POLICY = {
     grpc.StatusCode.CANCELLED: {
         'retryable': True,
         'backoff': 'linear',        # 线性退避，网络问题通常不需要指数退避
-        'max_attempts': 2,          # 最大重试次数（不包括初始请求），总共会尝试3次
+        'max_attempts': 5,          # 最大重试次数（不包括初始请求），总共会尝试6次
         'check_details': False      # 不检查详细信息，统一重试
     },
     grpc.StatusCode.ABORTED: {

{tamar_model_client-0.1.24 → tamar_model_client-0.1.26}/tamar_model_client/sync_client.py RENAMED Viewed

@@ -22,6 +22,7 @@ Tamar Model Client 同步客户端实现
 import json
 import logging
 import random
+import threading
 import time
 from typing import Optional, Union, Iterator
@@ -95,6 +96,9 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
         # === gRPC 通道和连接管理 ===
         self.channel: Optional[grpc.Channel] = None
         self.stub: Optional[model_service_pb2_grpc.ModelServiceStub] = None
+        self._channel_error_count = 0
+        self._last_channel_error_time = None
+        self._channel_lock = threading.Lock()  # 线程安全的channel操作
     def close(self):
         """
@@ -143,8 +147,22 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
         Raises:
             ConnectionError: 当达到最大重试次数仍无法连接时
         """
-        if self.channel and self.stub:
+        if self.channel and self.stub and self._is_channel_healthy():
             return
+        # 如果 channel 存在但不健康，记录日志
+        if self.channel and self.stub:
+            logger.warning(
+                "Channel exists but unhealthy, will recreate",
+                extra={
+                    "log_type": "channel_recreate",
+                    "data": {
+                        "channel_error_count": self._channel_error_count,
+                        "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else None
+                    }
+                }
+            )
+            self._recreate_channel()
         retry_count = 0
         options = self.build_channel_options()
@@ -196,6 +214,111 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
                 time.sleep(self.retry_delay * retry_count)
         raise ConnectionError(f"Failed to connect to {self.server_address} after {self.max_retries} retries")
+    def _is_channel_healthy(self) -> bool:
+        """
+        Decide whether the current gRPC channel can still be used.
+        The channel is reported unhealthy when it is missing, when its
+        connectivity state is SHUTDOWN or TRANSIENT_FAILURE, when the error
+        counter exceeds 3 and the latest recorded error is less than 60
+        seconds old, or when reading the state raises an exception.
+        Returns:
+            bool: True if the channel is usable, False if it should be rebuilt
+        """
+        if not self.channel:
+            return False
+        try:
+            # NOTE(review): the state is read through the channel's
+            # underscore-prefixed `_channel` attribute - confirm this keeps
+            # working across grpc versions.
+            connectivity = self.channel._channel.check_connectivity_state(False)
+            # A shut-down or failing channel has to be rebuilt
+            if connectivity in (grpc.ChannelConnectivity.SHUTDOWN,
+                                grpc.ChannelConnectivity.TRANSIENT_FAILURE):
+                state_extra = {"log_type": "info",
+                               "data": {"channel_state": str(connectivity)}}
+                logger.warning(f"Channel in unhealthy state: {connectivity}",
+                               extra=state_extra)
+                return False
+            # Several errors within the last 60 seconds also force a rebuild
+            many_errors = self._channel_error_count > 3 and self._last_channel_error_time
+            if many_errors and time.time() - self._last_channel_error_time < 60:
+                burst_extra = {"log_type": "info",
+                               "data": {"error_count": self._channel_error_count}}
+                logger.warning("Too many channel errors recently, marking as unhealthy",
+                               extra=burst_extra)
+                return False
+        except Exception as exc:
+            # Any failure while inspecting the channel is treated as unhealthy
+            logger.error(f"Error checking channel health: {exc}",
+                         extra={"log_type": "info",
+                                "data": {"error": str(exc)}})
+            return False
+        return True
+    def _recreate_channel(self):
+        """
+        Tear down the current gRPC channel and reset its error bookkeeping.
+        While holding the channel lock this closes the existing channel (a
+        failure to close is logged and otherwise ignored), clears the channel
+        and stub references, and resets the error counter and the last-error
+        timestamp. No new channel is opened inside this method.
+        """
+        with self._channel_lock:
+            stale_channel = self.channel
+            if stale_channel:
+                # Best-effort close: a failure here must not block the rebuild
+                try:
+                    stale_channel.close()
+                    logger.info("Closed unhealthy channel",
+                                extra={"log_type": "info"})
+                except Exception as exc:
+                    logger.warning(f"Error closing channel: {exc}",
+                                   extra={"log_type": "info"})
+            # Drop the references
+            self.channel = self.stub = None
+            # Start the error statistics from scratch
+            self._channel_error_count = 0
+            self._last_channel_error_time = None
+            logger.info("Recreating gRPC channel...",
+                        extra={"log_type": "info"})
+    def _record_channel_error(self, error: grpc.RpcError):
+        """
+        Record a gRPC error for the channel health check and log it.
+        Every error adds 1 to the error counter; INTERNAL and UNAVAILABLE
+        add 2 more (3 in total). The last-error timestamp is set to the
+        current time.
+        Args:
+            error: the gRPC error to record
+        """
+        now = time.time()
+        # Remember the previous timestamp before overwriting it; otherwise the
+        # logged "time_since_last_error" is always ~0.
+        previous_error_time = self._last_channel_error_time
+        # code() is guarded the same way details()/debug_error_string() are
+        # below, so an error object without it cannot raise from this path.
+        error_code = error.code() if hasattr(error, 'code') else None
+        error_code_name = error_code.name if error_code is not None else "UNKNOWN"
+        # Severe errors weigh more in the health check
+        is_severe = error_code in (grpc.StatusCode.INTERNAL,
+                                   grpc.StatusCode.UNAVAILABLE)
+        self._channel_error_count += 3 if is_severe else 1
+        self._last_channel_error_time = now
+        # Read the current channel state for the log entry only
+        channel_state = None
+        if self.channel:
+            try:
+                channel_state = self.channel._channel.check_connectivity_state(False)
+            except Exception:
+                # Was a bare except, which also swallowed KeyboardInterrupt/SystemExit
+                channel_state = "UNKNOWN"
+        # Log the details of the error
+        logger.warning(
+            f"Channel error recorded: {error_code_name}",
+            extra={
+                "log_type": "channel_error",
+                "data": {
+                    "error_code": error_code_name,
+                    "error_count": self._channel_error_count,
+                    "channel_state": str(channel_state) if channel_state is not None else "NO_CHANNEL",
+                    # None when this is the first recorded error
+                    "time_since_last_error": now - previous_error_time if previous_error_time is not None else None,
+                    "error_details": error.details() if hasattr(error, 'details') else "",
+                    "debug_string": error.debug_error_string() if hasattr(error, 'debug_error_string') else ""
+                }
+            }
+        )
     def _retry_request(self, func, *args, **kwargs):
         """
@@ -237,6 +360,30 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
                 # 计算当前的耗时
                 current_duration = time.time() - method_start_time
+                # 特殊处理 CANCELLED 错误
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    channel_state = None
+                    if self.channel:
+                        try:
+                            channel_state = self.channel._channel.check_connectivity_state(False)
+                        except:
+                            channel_state = "UNKNOWN"
+                    logger.warning(
+                        f"CANCELLED error detected, channel state: {channel_state}",
+                        extra={
+                            "log_type": "cancelled_debug",
+                            "request_id": context.get('request_id'),
+                            "data": {
+                                "channel_state": str(channel_state) if channel_state else "NO_CHANNEL",
+                                "channel_error_count": self._channel_error_count,
+                                "time_since_last_error": time.time() - self._last_channel_error_time if self._last_channel_error_time else None,
+                                "channel_healthy": self._is_channel_healthy(),
+                                "debug_string": e.debug_error_string() if hasattr(e, 'debug_error_string') else ""
+                            }
+                        }
+                    )
                 # 记录重试日志
                 log_data = {
                     "log_type": "info",
@@ -261,6 +408,9 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
                 context['duration'] = current_duration
                 last_exception = self.error_handler.handle_error(e, context)
+                # 记录 channel 错误
+                self._record_channel_error(e)
             except Exception as e:
                 # 非 gRPC 错误，直接包装抛出
@@ -742,6 +892,10 @@ class TamarModelClient(BaseClient, HttpFallbackMixin):
                              )
                          })
+            # 记录 channel 错误
+            if isinstance(e, grpc.RpcError):
+                self._record_channel_error(e)
             # 记录失败并尝试降级（如果启用了熔断）
             if self.resilient_enabled and self.circuit_breaker:
                 # 将错误码传递给熔断器，用于智能失败统计

{tamar_model_client-0.1.24 → tamar_model_client-0.1.26}/tamar_model_client.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tamar-model-client
-Version: 0.1.24
+Version: 0.1.26
 Summary: A Python SDK for interacting with the Model Manager gRPC service
 Home-page: http://gitlab.tamaredge.top/project-tap/AgentOS/model-manager-client
 Author: Oscar Ou

{tamar_model_client-0.1.24 → tamar_model_client-0.1.26}/tests/test_google_azure_final.py RENAMED Viewed

@@ -27,7 +27,7 @@ test_logger.addHandler(test_handler)
 logger = test_logger
 os.environ['MODEL_MANAGER_SERVER_GRPC_USE_TLS'] = "true"
-os.environ['MODEL_MANAGER_SERVER_ADDRESS'] = "localhost:50051"
+os.environ['MODEL_MANAGER_SERVER_ADDRESS'] = "model-manager-server-grpc-131786869360.asia-northeast1.run.app"
 os.environ['MODEL_MANAGER_SERVER_JWT_SECRET_KEY'] = "model-manager-server-jwt-key"
 # 导入客户端模块
@@ -645,10 +645,10 @@ async def main():
         # await asyncio.wait_for(test_batch_requests(), timeout=120.0)
         # 同步并发测试
-        #test_concurrent_requests(150)  # 测试150个并发请求
+        test_concurrent_requests(150)  # 测试150个并发请求
         # 异步并发测试
-        await test_async_concurrent_requests(50)  # 测试150个异步并发请求
+        await test_async_concurrent_requests(150)  # 测试150个异步并发请求
         print("\n✅ 测试完成")