PyPI - xparse-client - Versions diffs - 0.2.17__tar.gz → 0.2.19__tar.gz - Mend

xparse-client 0.2.17.tar.gz → 0.2.19.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

{xparse_client-0.2.17 → xparse_client-0.2.19}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.2.17
+Version: 0.2.19
 Summary: 面向Agent和RAG的新一代文档处理 AI Infra
 License-Expression: MIT
 Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline

{xparse_client-0.2.17 → xparse_client-0.2.19}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "xparse-client"
-version = "0.2.17"
+version = "0.2.19"
 description = "面向Agent和RAG的新一代文档处理 AI Infra"
 readme = "README.md"
 license = "MIT"

{xparse_client-0.2.17 → xparse_client-0.2.19}/xparse_client/pipeline/pipeline.py RENAMED Viewed

@@ -3,6 +3,7 @@
 import json
 import logging
+import re
 import time
 from datetime import datetime, timezone
 from pathlib import Path
@@ -220,6 +221,67 @@ class Pipeline:
         return config
+    def _extract_error_message(self, response: requests.Response) -> Tuple[str, str]:
+        """
+        从响应中提取规范化的错误信息
+        Returns:
+            Tuple[str, str]: (error_msg, x_request_id)
+        """
+        # 首先尝试从响应头中提取 x-request-id（requests的headers大小写不敏感）
+        x_request_id = response.headers.get('x-request-id', '')
+        error_msg = ''
+        # 获取Content-Type
+        content_type = response.headers.get('Content-Type', '').lower()
+        # 尝试解析JSON响应
+        if 'application/json' in content_type:
+            try:
+                result = response.json()
+                # 如果响应头中没有x-request-id，尝试从响应体中获取
+                if not x_request_id:
+                    x_request_id = result.get('x_request_id', '')
+                error_msg = result.get('message', result.get('msg', f'HTTP {response.status_code}'))
+                return error_msg, x_request_id
+            except:
+                pass
+        # 处理HTML响应
+        if 'text/html' in content_type or response.text.strip().startswith('<'):
+            try:
+                # 从HTML中提取标题（通常包含状态码和状态文本）
+                title_match = re.search(r'<title>(.*?)</title>', response.text, re.IGNORECASE)
+                if title_match:
+                    error_msg = title_match.group(1).strip()
+                else:
+                    # 如果没有title，尝试提取h1标签
+                    h1_match = re.search(r'<h1>(.*?)</h1>', response.text, re.IGNORECASE)
+                    if h1_match:
+                        error_msg = h1_match.group(1).strip()
+                    else:
+                        error_msg = f'HTTP {response.status_code}'
+            except:
+                error_msg = f'HTTP {response.status_code}'
+        # 处理纯文本响应
+        elif 'text/plain' in content_type:
+            error_msg = response.text[:200].strip() if response.text else f'HTTP {response.status_code}'
+        # 其他情况
+        else:
+            if response.text:
+                # 尝试截取前200字符，但去除换行和多余空格
+                text = response.text[:200].strip()
+                # 如果包含多行，只取第一行
+                if '\n' in text:
+                    text = text.split('\n')[0].strip()
+                error_msg = text if text else f'HTTP {response.status_code}'
+            else:
+                error_msg = f'HTTP {response.status_code}'
+        return error_msg, x_request_id
     def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         url = f"{self.api_base_url}/pipeline"
         max_retries = 3
@@ -263,15 +325,8 @@ class Pipeline:
                     logger.error(f"Pipeline 接口返回错误: code={result.get('code')}, message={error_msg}, x_request_id={x_request_id}")
                     return None
                 else:
-                    # 尝试解析响应获取 x_request_id 和错误信息
-                    x_request_id = ''
-                    error_msg = ''
-                    try:
-                        result = response.json()
-                        x_request_id = result.get('x_request_id', '')
-                        error_msg = result.get('message', result.get('msg', response.text[:200]))
-                    except:
-                        error_msg = response.text[:200] if response.text else f'HTTP {response.status_code}'
+                    # 使用规范化函数提取错误信息
+                    error_msg, x_request_id = self._extract_error_message(response)
                     print(f"  ✗ API 错误 {response.status_code}: {error_msg}, x_request_id={x_request_id}, 重试 {try_count + 1}/{max_retries}")
                     logger.warning(f"API 错误 {response.status_code}: {error_msg}, x_request_id={x_request_id}, 重试 {try_count + 1}/{max_retries}")
@@ -374,6 +429,14 @@ class Pipeline:
             print(f"  → 读取文件...")
             file_bytes, data_source = self.source.read_file(file_path)
             data_source = data_source or {}
+            # 检查文件大小，超过 100MB 则报错
+            MAX_FILE_SIZE = 100 * 1024 * 1024  # 100MB
+            file_size = len(file_bytes)
+            if file_size > MAX_FILE_SIZE:
+                file_size_mb = file_size / (1024 * 1024)
+                raise ValueError(f"文件大小过大: {file_size_mb:.2f}MB，超过100MB限制")
             # 转换为毫秒时间戳字符串
             timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
             data_source['date_processed'] = str(timestamp_ms)