xparse-client 0.2.17__tar.gz → 0.2.19__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.17
3
+ Version: 0.2.19
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "xparse-client"
7
- version = "0.2.17"
7
+ version = "0.2.19"
8
8
  description = "面向Agent和RAG的新一代文档处理 AI Infra"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -3,6 +3,7 @@
3
3
 
4
4
  import json
5
5
  import logging
6
+ import re
6
7
  import time
7
8
  from datetime import datetime, timezone
8
9
  from pathlib import Path
@@ -220,6 +221,67 @@ class Pipeline:
220
221
 
221
222
  return config
222
223
 
224
+ def _extract_error_message(self, response: requests.Response) -> Tuple[str, str]:
225
+ """
226
+ 从响应中提取规范化的错误信息
227
+
228
+ Returns:
229
+ Tuple[str, str]: (error_msg, x_request_id)
230
+ """
231
+ # 首先尝试从响应头中提取 x-request-id(requests的headers大小写不敏感)
232
+ x_request_id = response.headers.get('x-request-id', '')
233
+ error_msg = ''
234
+
235
+ # 获取Content-Type
236
+ content_type = response.headers.get('Content-Type', '').lower()
237
+
238
+ # 尝试解析JSON响应
239
+ if 'application/json' in content_type:
240
+ try:
241
+ result = response.json()
242
+ # 如果响应头中没有x-request-id,尝试从响应体中获取
243
+ if not x_request_id:
244
+ x_request_id = result.get('x_request_id', '')
245
+ error_msg = result.get('message', result.get('msg', f'HTTP {response.status_code}'))
246
+ return error_msg, x_request_id
247
+ except:
248
+ pass
249
+
250
+ # 处理HTML响应
251
+ if 'text/html' in content_type or response.text.strip().startswith('<'):
252
+ try:
253
+ # 从HTML中提取标题(通常包含状态码和状态文本)
254
+ title_match = re.search(r'<title>(.*?)</title>', response.text, re.IGNORECASE)
255
+ if title_match:
256
+ error_msg = title_match.group(1).strip()
257
+ else:
258
+ # 如果没有title,尝试提取h1标签
259
+ h1_match = re.search(r'<h1>(.*?)</h1>', response.text, re.IGNORECASE)
260
+ if h1_match:
261
+ error_msg = h1_match.group(1).strip()
262
+ else:
263
+ error_msg = f'HTTP {response.status_code}'
264
+ except:
265
+ error_msg = f'HTTP {response.status_code}'
266
+
267
+ # 处理纯文本响应
268
+ elif 'text/plain' in content_type:
269
+ error_msg = response.text[:200].strip() if response.text else f'HTTP {response.status_code}'
270
+
271
+ # 其他情况
272
+ else:
273
+ if response.text:
274
+ # 尝试截取前200字符,但去除换行和多余空格
275
+ text = response.text[:200].strip()
276
+ # 如果包含多行,只取第一行
277
+ if '\n' in text:
278
+ text = text.split('\n')[0].strip()
279
+ error_msg = text if text else f'HTTP {response.status_code}'
280
+ else:
281
+ error_msg = f'HTTP {response.status_code}'
282
+
283
+ return error_msg, x_request_id
284
+
223
285
  def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
224
286
  url = f"{self.api_base_url}/pipeline"
225
287
  max_retries = 3
@@ -263,15 +325,8 @@ class Pipeline:
263
325
  logger.error(f"Pipeline 接口返回错误: code={result.get('code')}, message={error_msg}, x_request_id={x_request_id}")
264
326
  return None
265
327
  else:
266
- # 尝试解析响应获取 x_request_id 和错误信息
267
- x_request_id = ''
268
- error_msg = ''
269
- try:
270
- result = response.json()
271
- x_request_id = result.get('x_request_id', '')
272
- error_msg = result.get('message', result.get('msg', response.text[:200]))
273
- except:
274
- error_msg = response.text[:200] if response.text else f'HTTP {response.status_code}'
328
+ # 使用规范化函数提取错误信息
329
+ error_msg, x_request_id = self._extract_error_message(response)
275
330
 
276
331
  print(f" ✗ API 错误 {response.status_code}: {error_msg}, x_request_id={x_request_id}, 重试 {try_count + 1}/{max_retries}")
277
332
  logger.warning(f"API 错误 {response.status_code}: {error_msg}, x_request_id={x_request_id}, 重试 {try_count + 1}/{max_retries}")
@@ -374,6 +429,14 @@ class Pipeline:
374
429
  print(f" → 读取文件...")
375
430
  file_bytes, data_source = self.source.read_file(file_path)
376
431
  data_source = data_source or {}
432
+
433
+ # 检查文件大小,超过 100MB 则报错
434
+ MAX_FILE_SIZE = 100 * 1024 * 1024 # 100MB
435
+ file_size = len(file_bytes)
436
+ if file_size > MAX_FILE_SIZE:
437
+ file_size_mb = file_size / (1024 * 1024)
438
+ raise ValueError(f"文件大小过大: {file_size_mb:.2f}MB,超过100MB限制")
439
+
377
440
  # 转换为毫秒时间戳字符串
378
441
  timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
379
442
  data_source['date_processed'] = str(timestamp_ms)
File without changes
File without changes
File without changes