xparse-client 0.2.19__py3-none-any.whl → 0.3.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +111 -20
- xparse_client/_base.py +188 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +351 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +225 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +134 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +132 -0
- xparse_client/models/pipeline.py +134 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b8.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b8.dist-info/RECORD +68 -0
- {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/top_level.txt +2 -0
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -129
- xparse_client/pipeline/destinations.py +0 -489
- xparse_client/pipeline/pipeline.py +0 -690
- xparse_client/pipeline/sources.py +0 -583
- xparse_client-0.2.19.dist-info/METADATA +0 -1050
- xparse_client-0.2.19.dist-info/RECORD +0 -11
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""SMB/CIFS 数据源(懒加载 pysmb)"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from ...exceptions import SourceError
|
|
11
|
+
from .._utils import match_file_pattern, normalize_wildcard_patterns, to_millis_timestamp
|
|
12
|
+
from .base import Source
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_smb_connection():
|
|
18
|
+
"""懒加载 pysmb
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
SMBConnection 类
|
|
22
|
+
|
|
23
|
+
Raises:
|
|
24
|
+
ImportError: pysmb 未安装
|
|
25
|
+
"""
|
|
26
|
+
try:
|
|
27
|
+
from smb.SMBConnection import SMBConnection
|
|
28
|
+
|
|
29
|
+
return SMBConnection
|
|
30
|
+
except ImportError as e:
|
|
31
|
+
raise ImportError(
|
|
32
|
+
"使用 SmbSource 需要安装 pysmb: pip install xparse-client[smb]"
|
|
33
|
+
) from e
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class SmbSource(Source):
    """SMB/CIFS data source.

    Reads files from an SMB/CIFS share. pysmb is imported lazily, when the
    first instance is constructed.

    Attributes:
        host: SMB server address.
        share_name: Name of the share.
        username: Account used for authentication.
        domain: Authentication domain ("" when unused).
        port: TCP port of the SMB service.
        path: Directory inside the share that acts as the listing root,
            stored without leading/trailing separators.
        pattern: File patterns as returned by ``normalize_wildcard_patterns``.
        recursive: Whether sub-directories are traversed.
        conn: The pysmb connection object.

    Example:
        >>> source = SmbSource(
        ...     host="192.168.1.100",
        ...     share_name="documents",
        ...     username="user",
        ...     password="pass",
        ...     path="reports/",
        ...     pattern=["*.pdf"],
        ... )
        >>> files = source.list_files()
    """

    def __init__(
        self,
        host: str,
        share_name: str,
        username: str,
        password: str,
        domain: str = "",
        port: int = 445,
        path: str = "",
        pattern: list[str] | None = None,
        recursive: bool = False,
    ) -> None:
        """Create the source and open the SMB connection.

        Args:
            host: SMB server address.
            share_name: Name of the share.
            username: User name.
            password: Password (not stored on the instance).
            domain: Domain (optional).
            port: SMB port, 445 by default.
            path: Path inside the share (optional).
            pattern: List of file patterns.
            recursive: Whether to recurse into sub-directories, default False.

        Raises:
            ImportError: pysmb is not installed.
            SourceError: The connection could not be established or the
                server rejected the credentials.
        """
        SMBConnection = _get_smb_connection()

        self.host = host
        self.share_name = share_name
        self.username = username
        self.domain = domain
        self.port = port
        # Strip both separator styles in one pass so mixed input such as
        # "\\/reports/" is normalised too.
        self.path = path.strip("/\\") if path else ""
        self.pattern = normalize_wildcard_patterns(pattern)
        self.recursive = recursive

        try:
            self.conn = SMBConnection(
                username,
                password,
                "",  # my_name
                host,
                domain=domain,
                use_ntlm_v2=True,
                # Port 445 is SMB over direct TCP; without this flag pysmb
                # uses NetBIOS session framing (meant for port 139).
                is_direct_tcp=(port == 445),
            )
            # pysmb reports an authentication failure through the return
            # value rather than by raising.
            authenticated = self.conn.connect(host, port)
        except Exception as e:
            self.close()  # do not leave a half-open socket behind
            raise SourceError(
                f"SMB 连接失败: {e}",
                connector_type="smb",
                operation="connect",
                details={"host": host, "share_name": share_name},
            ) from e

        if not authenticated:
            self.close()
            raise SourceError(
                "SMB 连接失败: 认证未通过",
                connector_type="smb",
                operation="connect",
                details={"host": host, "share_name": share_name},
            )

        logger.info(f"SMB 连接成功: {host}/{share_name}")

    def _root_path(self) -> str:
        """Return the absolute share path that listings and reads start from."""
        return f"/{self.path}" if self.path else "/"

    def list_files(self) -> list[str]:
        """List the files available on the share.

        Directories that cannot be listed are logged as warnings and
        skipped (see ``_list_recursive``), so an unreadable root yields an
        empty list rather than an exception.

        Returns:
            File paths relative to the configured ``path``.

        Raises:
            SourceError: An unexpected error escaped the traversal.
        """
        try:
            files: list[str] = []
            base_path = self._root_path()

            self._list_recursive(base_path, base_path, files)

            logger.info(f"SMB 找到 {len(files)} 个文件")
            return files

        except SourceError:
            raise
        except Exception as e:
            raise SourceError(
                f"列出 SMB 文件失败: {e}",
                connector_type="smb",
                operation="list_files",
            ) from e

    def _list_recursive(
        self, current_path: str, base_path: str, files: list[str]
    ) -> None:
        """Collect matching files below ``current_path`` into ``files``.

        Args:
            current_path: Absolute share path of the directory to list.
            base_path: Listing root; results are made relative to it.
            files: Output list, extended in place.
        """
        try:
            items = self.conn.listPath(self.share_name, current_path)

            for item in items:
                name = item.filename
                # One check covers ".", ".." and hidden entries.
                if name.startswith("."):
                    continue

                # rstrip turns the root "/" into "" so no "//" is produced.
                item_path = f"{current_path.rstrip('/')}/{name}"

                if item.isDirectory:
                    # Sub-directories are ignored in non-recursive mode.
                    if self.recursive:
                        self._list_recursive(item_path, base_path, files)
                    continue

                relative_path = item_path[len(base_path):].lstrip("/")
                if match_file_pattern(relative_path, self.pattern):
                    files.append(relative_path)

        except Exception as e:
            # Best effort: one unreadable directory must not abort the scan.
            logger.warning(f"SMB 列出路径失败 {current_path}: {e}")

    def read_file(self, file_path: str) -> tuple[bytes, dict[str, Any]]:
        """Read one file from the share.

        Args:
            file_path: File path relative to the configured ``path``.

        Returns:
            A ``(content, metadata)`` tuple. The metadata holds the SMB URL,
            version/creation/modification values produced by
            ``to_millis_timestamp`` and a record locator.

        Raises:
            SourceError: The file could not be read.
        """
        full_path = f"{self._root_path().rstrip('/')}/{file_path.lstrip('/')}"

        try:
            file_obj = BytesIO()
            self.conn.retrieveFile(self.share_name, full_path, file_obj)

            # Attributes are optional metadata: a failure here is logged
            # and the timestamps stay None.
            date_created = None
            date_modified = None
            try:
                attrs = self.conn.getAttributes(self.share_name, full_path)
                date_created = self._to_timestamp(getattr(attrs, "create_time", None))
                date_modified = self._to_timestamp(
                    getattr(attrs, "last_write_time", None)
                )
            except Exception as e:
                logger.debug(f"SMB 获取文件属性失败 {full_path}: {e}")

            modified_ms = to_millis_timestamp(date_modified)
            data_source = {
                "url": f"smb://{self.host}/{self.share_name}{full_path}",
                "version": modified_ms,
                "date_created": to_millis_timestamp(date_created),
                "date_modified": modified_ms,
                "record_locator": {
                    "server": self.host,
                    "share": self.share_name,
                    "protocol": "smb",
                    "remote_file_path": full_path,
                },
            }

            return file_obj.getvalue(), data_source

        except Exception as e:
            raise SourceError(
                f"读取 SMB 文件失败: {file_path}, {e}",
                connector_type="smb",
                operation="read_file",
            ) from e

    @staticmethod
    def _to_timestamp(value: Any) -> float | None:
        """Convert a time value to a Unix timestamp in seconds.

        ``datetime`` values are converted to UTC first, numbers are returned
        as ``float``; ``None`` and any other type yield ``None``.
        """
        if value is None:
            return None
        if isinstance(value, datetime):
            return value.astimezone(timezone.utc).timestamp()
        if isinstance(value, (int, float)):
            return float(value)
        return None

    def close(self) -> None:
        """Close the SMB connection; errors are logged, never raised."""
        conn = getattr(self, "conn", None)
        if not conn:
            return
        try:
            conn.close()
        except Exception as e:
            logger.debug(f"SMB 关闭连接失败: {e}")

    def __repr__(self) -> str:
        return f"<SmbSource host={self.host} share={self.share_name}>"
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
__all__ = ["SmbSource"]
|
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
"""统一异常类层次结构
|
|
2
|
+
|
|
3
|
+
提供结构化的异常类型,便于错误处理和排查。
|
|
4
|
+
|
|
5
|
+
Example:
|
|
6
|
+
>>> from xparse_client.exceptions import APIError, AuthenticationError
|
|
7
|
+
>>>
|
|
8
|
+
>>> try:
|
|
9
|
+
... result = client.parse.partition(file=file_bytes, filename="doc.pdf")
|
|
10
|
+
... except AuthenticationError as e:
|
|
11
|
+
... print(f"认证失败: {e.message}, request_id: {e.request_id}")
|
|
12
|
+
... except APIError as e:
|
|
13
|
+
... print(f"API 错误: {e}")
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class XParseClientError(Exception):
    """Base class for every exception raised by the xParse SDK.

    Gives all SDK errors the same shape: a message plus an optional
    dictionary of structured details.

    Attributes:
        message: Human-readable error message.
        details: Extra error details (empty dict when none were given).
    """

    def __init__(
        self,
        message: str,
        *,
        details: dict[str, Any] | None = None,
    ) -> None:
        """Store the message and a private copy of ``details``.

        Args:
            message: Human-readable error message.
            details: Optional structured details.
        """
        self.message = message
        # Copy so that later changes to the caller's dict cannot alter the
        # contents of an error that has already been raised.
        self.details = dict(details) if details else {}
        super().__init__(message)

    def __str__(self) -> str:
        """Return the message, followed by ``(k=v, ...)`` when details exist."""
        if not self.details:
            return self.message
        details_str = ", ".join(f"{k}={v}" for k, v in self.details.items())
        return f"{self.message} ({details_str})"

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(message={self.message!r}, details={self.details!r})"

    def to_dict(self) -> dict[str, Any]:
        """Convert the error to a dictionary for serialization."""
        return {
            "error_type": self.__class__.__name__,
            "message": self.message,
            "details": self.details,
        }
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ============================================================
|
|
60
|
+
# 配置和验证错误
|
|
61
|
+
# ============================================================
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class ConfigurationError(XParseClientError):
    """Raised when the SDK is configured incorrectly.

    Example:
        >>> raise ConfigurationError(
        ...     "missing required configuration parameters",
        ...     details={"missing_params": ["app_id", "secret_code"]}
        ... )
    """
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class ValidationError(XParseClientError):
    """Raised when request parameters fail validation.

    Attributes:
        field: Name of the field that failed validation.
        value: The value that was passed in.
    """

    def __init__(
        self,
        message: str,
        *,
        field: str | None = None,
        value: Any = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        """Record the offending field/value and add them to the details.

        Args:
            message: Human-readable error message.
            field: Name of the invalid field, if any.
            value: The rejected value, if any.
            details: Optional extra details; the dict passed in is not
                modified.
        """
        self.field = field
        self.value = value

        # Build on a copy so the caller's dict does not gain extra keys.
        merged = dict(details) if details else {}
        if field:
            merged["field"] = field
        if value is not None:
            merged["value"] = str(value)[:100]  # truncate overly long values

        super().__init__(message, details=merged)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# ============================================================
|
|
110
|
+
# API 错误
|
|
111
|
+
# ============================================================
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class APIError(XParseClientError):
    """Base class for all HTTP API related errors.

    Attributes:
        message: Human-readable error message.
        status_code: HTTP status code.
        request_id: Request ID (useful for troubleshooting).
        response_body: Raw response body.
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: int | None = None,
        request_id: str | None = None,
        response_body: str | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        """Record the HTTP context and add it to the details.

        Args:
            message: Human-readable error message.
            status_code: HTTP status code, if known.
            request_id: Request ID, if known.
            response_body: Raw response body, if available.
            details: Optional extra details; the dict passed in is not
                modified.
        """
        self.status_code = status_code
        self.request_id = request_id
        self.response_body = response_body

        # Build on a copy so the caller's dict does not gain extra keys.
        merged = dict(details) if details else {}
        if status_code:
            merged["status_code"] = status_code
        if request_id:
            merged["request_id"] = request_id

        super().__init__(message, details=merged)

    def __str__(self) -> str:
        """Return the message plus ``status=`` / ``request_id=`` when set."""
        parts = [self.message]
        if self.status_code:
            parts.append(f"status={self.status_code}")
        if self.request_id:
            parts.append(f"request_id={self.request_id}")
        return " ".join(parts)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class AuthenticationError(APIError):
    """Authentication error (401).

    Raised when the API key is invalid or has expired.

    Example:
        >>> raise AuthenticationError(
        ...     "invalid API key",
        ...     status_code=401,
        ...     request_id="req-123"
        ... )
    """
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class PermissionDeniedError(APIError):
    """Permission error (403).

    Raised when access to the requested resource is not allowed.
    """
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class NotFoundError(APIError):
    """Resource-not-found error (404).

    Raised when the requested resource does not exist.

    Attributes:
        resource_type: Resource type (e.g. job, workflow).
        resource_id: Resource ID.
    """

    def __init__(
        self,
        message: str,
        *,
        resource_type: str | None = None,
        resource_id: str | None = None,
        **kwargs: Any,
    ) -> None:
        """Record the missing resource and add it to the details.

        Args:
            message: Human-readable error message.
            resource_type: Type of the missing resource, if known.
            resource_id: ID of the missing resource, if known.
            **kwargs: Remaining ``APIError`` keyword arguments; a
                ``details`` dict passed here is not modified.
        """
        self.resource_type = resource_type
        self.resource_id = resource_id

        # Build on a copy so the caller's dict does not gain extra keys.
        details = dict(kwargs.pop("details", None) or {})
        if resource_type:
            details["resource_type"] = resource_type
        if resource_id:
            details["resource_id"] = resource_id

        super().__init__(message, details=details, **kwargs)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class RateLimitError(APIError):
    """Rate-limit error (429).

    Raised when requests are sent too frequently.

    Attributes:
        retry_after: Suggested wait before retrying, in seconds.
    """

    def __init__(
        self,
        message: str,
        *,
        retry_after: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Record the retry hint and add it to the details.

        Args:
            message: Human-readable error message.
            retry_after: Suggested wait in seconds, if known.
            **kwargs: Remaining ``APIError`` keyword arguments; a
                ``details`` dict passed here is not modified.
        """
        self.retry_after = retry_after

        # Build on a copy so the caller's dict does not gain extra keys.
        details = dict(kwargs.pop("details", None) or {})
        if retry_after:
            details["retry_after"] = retry_after

        super().__init__(message, details=details, **kwargs)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class ServerError(APIError):
    """Server error (5xx).

    Raised on an internal server error.
    """
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class RequestTimeoutError(APIError):
    """Request-timeout error.

    Raised when a request times out.

    Attributes:
        timeout_seconds: The timeout setting that was in effect.
    """

    def __init__(
        self,
        message: str,
        *,
        timeout_seconds: float | None = None,
        **kwargs: Any,
    ) -> None:
        """Record the timeout and add it to the details.

        Args:
            message: Human-readable error message.
            timeout_seconds: Timeout setting in seconds, if known.
            **kwargs: Remaining ``APIError`` keyword arguments; a
                ``details`` dict passed here is not modified.
        """
        self.timeout_seconds = timeout_seconds

        # Build on a copy so the caller's dict does not gain extra keys.
        details = dict(kwargs.pop("details", None) or {})
        if timeout_seconds:
            details["timeout_seconds"] = timeout_seconds

        super().__init__(message, details=details, **kwargs)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# ============================================================
|
|
271
|
+
# 连接器错误
|
|
272
|
+
# ============================================================
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
class ConnectorError(XParseClientError):
    """Base class for Source and Destination operation errors.

    Attributes:
        connector_type: Connector type (e.g. s3, local, milvus).
        operation: Operation type (e.g. list, read, write).
    """

    def __init__(
        self,
        message: str,
        *,
        connector_type: str | None = None,
        operation: str | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        """Record the connector context and add it to the details.

        Args:
            message: Human-readable error message.
            connector_type: Connector type, if known.
            operation: Operation that failed, if known.
            details: Optional extra details; the dict passed in is not
                modified.
        """
        self.connector_type = connector_type
        self.operation = operation

        # Build on a copy so the caller's dict does not gain extra keys.
        merged = dict(details) if details else {}
        if connector_type:
            merged["connector_type"] = connector_type
        if operation:
            merged["operation"] = operation

        super().__init__(message, details=merged)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
class SourceError(ConnectorError):
    """Raised when reading from a data source fails.

    Example:
        >>> raise SourceError(
        ...     "cannot connect to the S3 bucket",
        ...     connector_type="s3",
        ...     operation="connect",
        ...     details={"bucket": "my-bucket"}
        ... )
    """
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
class DestinationError(ConnectorError):
    """Raised when writing to a destination fails.

    Example:
        >>> raise DestinationError(
        ...     "failed to write to Milvus",
        ...     connector_type="milvus",
        ...     operation="write",
        ...     details={"collection": "my_collection"}
        ... )
    """
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# ============================================================
|
|
340
|
+
# Pipeline 错误
|
|
341
|
+
# ============================================================
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
class PipelineError(XParseClientError):
    """Raised when an error occurs while a pipeline is executing.

    Attributes:
        stage: Stage in which the error occurred (e.g. parse, chunk, embed).
        filename: Name of the file being processed.
    """

    def __init__(
        self,
        message: str,
        *,
        stage: str | None = None,
        filename: str | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        """Record the pipeline context and add it to the details.

        Args:
            message: Human-readable error message.
            stage: Failing stage, if known.
            filename: File being processed, if known.
            details: Optional extra details; the dict passed in is not
                modified.
        """
        self.stage = stage
        self.filename = filename

        # Build on a copy so the caller's dict does not gain extra keys.
        merged = dict(details) if details else {}
        if stage:
            merged["stage"] = stage
        if filename:
            merged["filename"] = filename

        super().__init__(message, details=merged)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
# ============================================================
|
|
375
|
+
# 导出
|
|
376
|
+
# ============================================================
|
|
377
|
+
|
|
378
|
+
# Public names of this module, grouped by category.
__all__ = [
    # Base class
    "XParseClientError",
    # Configuration and validation
    "ConfigurationError", "ValidationError",
    # API errors
    "APIError", "AuthenticationError", "PermissionDeniedError",
    "NotFoundError", "RateLimitError", "ServerError", "RequestTimeoutError",
    # Connector errors
    "ConnectorError", "SourceError", "DestinationError",
    # Pipeline errors
    "PipelineError",
]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""数据模型模块
|
|
2
|
+
|
|
3
|
+
提供 API 请求和响应的 Pydantic 数据模型。
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from .chunk import ChunkConfig
|
|
7
|
+
from .embed import EmbedConfig
|
|
8
|
+
from .extract import ExtractConfig
|
|
9
|
+
from .local import FailedFile, WorkflowResult
|
|
10
|
+
from .parse import (
|
|
11
|
+
AsyncJobResponse,
|
|
12
|
+
Element,
|
|
13
|
+
ElementMetadata,
|
|
14
|
+
JobStatusResponse,
|
|
15
|
+
ParseConfig,
|
|
16
|
+
ParseResponse,
|
|
17
|
+
)
|
|
18
|
+
from .pipeline import (
|
|
19
|
+
ChunkStage,
|
|
20
|
+
EmbedStage,
|
|
21
|
+
ExtractStage,
|
|
22
|
+
ParseStage,
|
|
23
|
+
PipelineConfig,
|
|
24
|
+
PipelineResponse,
|
|
25
|
+
PipelineStage,
|
|
26
|
+
PipelineStats,
|
|
27
|
+
)
|
|
28
|
+
from .workflows import Schedule, WorkflowInformation, WorkflowState
|
|
29
|
+
|
|
30
|
+
# Public names re-exported by the models package, grouped by source module.
__all__ = [
    # Parse models
    "ParseConfig", "Element", "ElementMetadata",
    "ParseResponse", "AsyncJobResponse", "JobStatusResponse",
    # Extract models
    "ExtractConfig",
    # Chunk models
    "ChunkConfig",
    # Embed models
    "EmbedConfig",
    # Pipeline models
    "ParseStage", "ChunkStage", "EmbedStage", "ExtractStage",
    "PipelineStage", "PipelineStats", "PipelineConfig", "PipelineResponse",
    # Local models
    "FailedFile", "WorkflowResult",
    # Workflows models
    "WorkflowInformation", "WorkflowState", "Schedule",
]
|