xparse-client 0.2.20__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +110 -20
- xparse_client/_base.py +179 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +350 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +185 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +132 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +136 -0
- xparse_client/models/pipeline.py +132 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b1.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b1.dist-info/RECORD +68 -0
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/top_level.txt +2 -0
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -163
- xparse_client/pipeline/destinations.py +0 -489
- xparse_client/pipeline/pipeline.py +0 -860
- xparse_client/pipeline/sources.py +0 -583
- xparse_client-0.2.20.dist-info/METADATA +0 -1050
- xparse_client-0.2.20.dist-info/RECORD +0 -11
xparse_client/_http.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
"""HTTP 客户端抽象层
|
|
2
|
+
|
|
3
|
+
基于 httpx 的 HTTP 客户端封装。
|
|
4
|
+
|
|
5
|
+
Features:
|
|
6
|
+
- 自动重试(指数退避)
|
|
7
|
+
- 统一的错误处理
|
|
8
|
+
- 请求日志
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import time
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import httpx
|
|
18
|
+
|
|
19
|
+
from ._config import RetryConfiguration, SDKConfiguration
|
|
20
|
+
from .exceptions import (
|
|
21
|
+
APIError,
|
|
22
|
+
AuthenticationError,
|
|
23
|
+
NotFoundError,
|
|
24
|
+
PermissionDeniedError,
|
|
25
|
+
RateLimitError,
|
|
26
|
+
RequestTimeoutError,
|
|
27
|
+
ServerError,
|
|
28
|
+
ValidationError,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _extract_error_message(response: httpx.Response) -> str:
|
|
35
|
+
"""从响应中提取错误信息
|
|
36
|
+
|
|
37
|
+
支持多种响应格式:
|
|
38
|
+
- JSON: {"message": "..."} 或 {"error": "..."} 或 {"detail": "..."}
|
|
39
|
+
- 纯文本
|
|
40
|
+
|
|
41
|
+
Args:
|
|
42
|
+
response: HTTP 响应
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
错误信息字符串
|
|
46
|
+
"""
|
|
47
|
+
try:
|
|
48
|
+
data = response.json()
|
|
49
|
+
if isinstance(data, dict):
|
|
50
|
+
# 尝试多种常见的错误字段
|
|
51
|
+
for key in ("message", "error", "detail", "msg", "error_message"):
|
|
52
|
+
if key in data:
|
|
53
|
+
return str(data[key])
|
|
54
|
+
# 如果有嵌套的 error 对象
|
|
55
|
+
if "error" in data and isinstance(data["error"], dict):
|
|
56
|
+
return data["error"].get("message", str(data["error"]))
|
|
57
|
+
return str(data)
|
|
58
|
+
except Exception:
|
|
59
|
+
# 非 JSON 响应,返回文本内容
|
|
60
|
+
text = response.text.strip()
|
|
61
|
+
if text:
|
|
62
|
+
return text[:500] # 截断过长的响应
|
|
63
|
+
return f"HTTP {response.status_code}"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _extract_request_id(response: httpx.Response) -> str | None:
|
|
67
|
+
"""从响应头中提取 request_id
|
|
68
|
+
|
|
69
|
+
Args:
|
|
70
|
+
response: HTTP 响应
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
request_id 或 None
|
|
74
|
+
"""
|
|
75
|
+
# 尝试多种常见的 request id 头
|
|
76
|
+
for header in ("x-request-id", "x-req-id", "request-id", "X-Request-Id"):
|
|
77
|
+
if header in response.headers:
|
|
78
|
+
return response.headers[header]
|
|
79
|
+
return None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def raise_for_status(response: httpx.Response) -> None:
    """Raise the SDK exception matching the HTTP status or business code.

    Two layers are checked:

    1. The HTTP status code (anything that is not a success status).
    2. The business ``code`` field of a JSON object body (HTTP success but
       ``code`` is present and not 200).

    A successful response whose body is not JSON, is not a JSON object, or
    has no ``code`` field is treated as success.

    Args:
        response: HTTP response to check.

    Raises:
        ValidationError: HTTP status 400 or business code 400.
        AuthenticationError: 401.
        PermissionDeniedError: 403.
        NotFoundError: 404.
        RateLimitError: HTTP status 429.
        ServerError: 5xx.
        APIError: any other failure, including a business code that is not
            numeric.
    """
    # Layer 1: HTTP status code.
    if not response.is_success:
        status_code = response.status_code
        message = _extract_error_message(response)
        common_kwargs = {
            "status_code": status_code,
            "request_id": _extract_request_id(response),
            # Keep only the head of the body for diagnostics.
            "response_body": response.text[:1000] if response.text else None,
        }

        if status_code == 400:
            raise ValidationError(message, details=common_kwargs)
        if status_code == 401:
            raise AuthenticationError(message, **common_kwargs)
        if status_code == 403:
            raise PermissionDeniedError(message, **common_kwargs)
        if status_code == 404:
            raise NotFoundError(message, **common_kwargs)
        if status_code == 429:
            retry_after = response.headers.get("Retry-After")
            # Only the delta-seconds form is understood; an HTTP-date value
            # is reported as None.
            retry_after_int = int(retry_after) if retry_after and retry_after.isdigit() else None
            raise RateLimitError(message, retry_after=retry_after_int, **common_kwargs)
        if status_code >= 500:
            raise ServerError(message, **common_kwargs)
        raise APIError(message, **common_kwargs)

    # Layer 2: business code. Only the JSON parse is guarded, so that an
    # SDK exception raised further down can never be swallowed by this
    # handler (which would happen if one of them subclassed ValueError).
    try:
        data = response.json()
    except ValueError:
        # Body is not JSON: nothing to inspect, treat as success.
        return

    if not isinstance(data, dict):
        return

    code = data.get("code")
    if code is None:
        return

    # Normalise the code so the numeric comparisons below cannot raise
    # TypeError when the server sends it as a string.
    if isinstance(code, str):
        try:
            numeric_code = int(code.strip())
        except ValueError:
            numeric_code = None
    elif isinstance(code, (int, float)):
        numeric_code = code
    else:
        numeric_code = None

    if numeric_code == 200:
        return

    request_id = data.get("x_request_id") or _extract_request_id(response)
    # A null/empty "message" falls back to the generic text as well.
    message = data.get("message") or f"业务错误 code: {code}"
    common_kwargs = {
        "request_id": request_id,
        "response_body": response.text[:1000] if response.text else None,
    }

    # Map the business code onto the matching exception type.
    if numeric_code == 400:
        raise ValidationError(message, details=common_kwargs)
    if numeric_code == 401:
        raise AuthenticationError(message, **common_kwargs)
    if numeric_code == 403:
        raise PermissionDeniedError(message, **common_kwargs)
    if numeric_code == 404:
        raise NotFoundError(message, **common_kwargs)
    if numeric_code is not None and numeric_code >= 500:
        raise ServerError(message, **common_kwargs)
    raise APIError(message, **common_kwargs)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class HTTPClient:
    """HTTP client built on httpx with automatic retries and error mapping.

    Attributes:
        config: SDK configuration.
        retry_config: Retry configuration.
        _client: Underlying ``httpx.Client`` instance.
        _owns_client: True when ``_client`` was created by this object and
            therefore has to be closed by ``close()``.

    Example:
        >>> config = SDKConfiguration(app_id="xxx", secret_code="xxx")
        >>> http = HTTPClient(config)
        >>> response = http.request("GET", "/api/xparse/parse")
    """

    def __init__(
        self,
        config: SDKConfiguration,
        retry_config: RetryConfiguration | None = None,
        client: httpx.Client | None = None,
    ) -> None:
        """Initialise the HTTP client.

        Args:
            config: SDK configuration.
            retry_config: Retry configuration (optional). When omitted it is
                built from ``config.max_retries``, ``config.backoff_base``
                and ``config.backoff_max``.
            client: Custom ``httpx.Client`` (optional, intended for tests).
                A client passed in here is never closed by ``close()``.
        """
        self.config = config

        if retry_config is None:
            retry_config = RetryConfiguration(
                max_retries=config.max_retries,
                backoff_base=config.backoff_base,
                backoff_max=config.backoff_max,
            )
        self.retry_config = retry_config

        # Only a client created here is owned (and later closed) by us.
        self._owns_client = client is None
        if client is None:
            client = httpx.Client(
                base_url=config.get_base_url(),
                headers=config.get_auth_headers(),
                timeout=httpx.Timeout(config.timeout),
            )
        self._client = client

    def request(
        self,
        method: str,
        path: str,
        *,
        json: dict[str, Any] | None = None,
        data: dict[str, Any] | None = None,
        files: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
        timeout: float | None = None,
    ) -> httpx.Response:
        """Send an HTTP request with automatic retries and error mapping.

        Args:
            method: HTTP method.
            path: Request path.
            json: JSON request body.
            data: Form data.
            files: File data.
            params: Query parameters.
            headers: Extra request headers.
            timeout: Request timeout (overrides the client default).

        Returns:
            The ``httpx.Response`` of the first attempt that is neither
            retried nor rejected by ``raise_for_status``.

        Raises:
            APIError: The call failed (transport error after the last
                attempt, or any error raised by ``raise_for_status``).
            RequestTimeoutError: The request timed out on the last attempt.
        """
        request_kwargs: dict[str, Any] = {"method": method, "url": path}
        optional_kwargs = {
            "json": json,
            "data": data,
            "files": files,
            "params": params,
            "headers": headers,
            "timeout": timeout,
        }
        # Forward only what the caller actually supplied so that httpx
        # defaults apply to everything else.
        request_kwargs.update(
            {key: value for key, value in optional_kwargs.items() if value is not None}
        )

        max_retries = self.retry_config.max_retries
        last_exception: Exception | None = None

        for attempt in range(max_retries + 1):
            can_retry = attempt < max_retries
            try:
                logger.debug(
                    "HTTP %s %s (attempt %s/%s)",
                    method,
                    path,
                    attempt + 1,
                    max_retries + 1,
                )

                response = self._client.request(**request_kwargs)

                # Retry on retryable status codes, but never after the last
                # attempt: in that case fall through so that
                # raise_for_status reports the real status instead of the
                # generic "unknown error" below.
                if can_retry and self.retry_config.should_retry(response.status_code, attempt):
                    backoff = self.retry_config.calculate_backoff(attempt)
                    logger.warning(
                        "请求失败 (status=%s),%.1fs 后重试",
                        response.status_code,
                        backoff,
                    )
                    time.sleep(backoff)
                    continue

                # Raise the mapped SDK error or hand back the response.
                # SDK errors are not caught below and propagate unchanged.
                raise_for_status(response)
                return response

            except httpx.TimeoutException as e:
                last_exception = e
                if can_retry:
                    backoff = self.retry_config.calculate_backoff(attempt)
                    logger.warning("请求超时,%.1fs 后重试", backoff)
                    time.sleep(backoff)
                    continue
                raise RequestTimeoutError(
                    f"请求超时: {path}",
                    # "is not None" so an explicit timeout of 0 is reported
                    # as given rather than replaced by the default.
                    timeout_seconds=timeout if timeout is not None else self.config.timeout,
                ) from e

            except httpx.RequestError as e:
                last_exception = e
                if can_retry:
                    backoff = self.retry_config.calculate_backoff(attempt)
                    logger.warning("请求错误: %s,%.1fs 后重试", e, backoff)
                    time.sleep(backoff)
                    continue
                raise APIError(f"请求失败: {e}") from e

        # Only reachable when the loop body never ran (max_retries < 0);
        # kept as a safety net.
        if last_exception:
            raise APIError(f"请求失败: {last_exception}") from last_exception
        raise APIError("请求失败: 未知错误")

    def get(self, path: str, **kwargs: Any) -> httpx.Response:
        """Send a GET request; keyword arguments are passed to ``request``."""
        return self.request("GET", path, **kwargs)

    def post(self, path: str, **kwargs: Any) -> httpx.Response:
        """Send a POST request; keyword arguments are passed to ``request``."""
        return self.request("POST", path, **kwargs)

    def put(self, path: str, **kwargs: Any) -> httpx.Response:
        """Send a PUT request; keyword arguments are passed to ``request``."""
        return self.request("PUT", path, **kwargs)

    def delete(self, path: str, **kwargs: Any) -> httpx.Response:
        """Send a DELETE request; keyword arguments are passed to ``request``."""
        return self.request("DELETE", path, **kwargs)

    def close(self) -> None:
        """Close the underlying client if it was created by this object."""
        if self._owns_client:
            self._client.close()

    def __enter__(self) -> HTTPClient:
        """Return ``self`` so the client can be used in a ``with`` block."""
        return self

    def __exit__(self, *args: Any) -> None:
        """Close the client when the ``with`` block exits."""
        self.close()
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
__all__ = [
|
|
348
|
+
"HTTPClient",
|
|
349
|
+
"raise_for_status",
|
|
350
|
+
]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Extract API - 信息抽取
|
|
2
|
+
|
|
3
|
+
执行 parse + extract Pipeline,用于结构化信息抽取。
|
|
4
|
+
|
|
5
|
+
Example:
|
|
6
|
+
>>> result = client.extract.extract(
|
|
7
|
+
... file=file_bytes,
|
|
8
|
+
... filename="invoice.pdf",
|
|
9
|
+
... extract_config=ExtractConfig(schema={"type": "object", ...})
|
|
10
|
+
... )
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from typing import TYPE_CHECKING
|
|
17
|
+
|
|
18
|
+
from .._base import BaseAPI
|
|
19
|
+
from ..models.extract import ExtractConfig
|
|
20
|
+
from ..models.parse import ParseConfig, ParseResponse
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
pass
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Extract(BaseAPI):
    """Extract API - structured information extraction.

    Runs the parse + extract pipeline to pull structured information out of
    a document.

    Example:
        >>> schema = {
        ...     "type": "object",
        ...     "properties": {
        ...         "invoice_number": {"type": "string"},
        ...         "total_amount": {"type": "number"}
        ...     }
        ... }
        >>> result = client.extract.extract(
        ...     file=file_bytes,
        ...     filename="invoice.pdf",
        ...     extract_config=ExtractConfig(schema=schema)
        ... )
    """

    _base_path = "/api/xparse"

    def extract(
        self,
        *,
        file: bytes,
        filename: str,
        parse_config: ParseConfig | None = None,
        extract_config: ExtractConfig | None = None,
    ) -> ParseResponse:
        """Run a synchronous extraction on one document.

        Args:
            file: File content.
            filename: File name.
            parse_config: Parse configuration (optional).
            extract_config: Extraction configuration (required, carries the
                schema). It is typed as optional only so it can be passed by
                keyword; omitting it raises ``ValueError``.

        Returns:
            ParseResponse: The extraction result.

        Raises:
            ValueError: If ``extract_config`` is not provided.

        Example:
            >>> result = client.extract.extract(
            ...     file=file_bytes,
            ...     filename="invoice.pdf",
            ...     extract_config=ExtractConfig(
            ...         schema={
            ...             "type": "object",
            ...             "properties": {
            ...                 "invoice_number": {"type": "string"},
            ...                 "total_amount": {"type": "number"}
            ...             }
            ...         },
            ...         generate_citations=True
            ...     )
            ... )
        """
        # Fail fast: reject the call before doing any serialisation work.
        if extract_config is None:
            raise ValueError("extract_config is required")

        files = {"file": (filename, file)}
        data: dict[str, str] = {}

        # Configs travel as JSON strings inside the multipart form; the
        # field order (parse_config, then extract_config) is kept stable.
        if parse_config is not None:
            data["parse_config"] = json.dumps(
                parse_config.model_dump(), ensure_ascii=False
            )
        data["extract_config"] = json.dumps(
            extract_config.model_dump(), ensure_ascii=False
        )

        response = self._post("/extract/sync", files=files, data=data)
        return self._parse_response(response, ParseResponse)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
__all__ = ["Extract"]
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Local API - 本地批处理
|
|
2
|
+
|
|
3
|
+
提供本地同步批处理工作流执行功能。
|
|
4
|
+
|
|
5
|
+
Example:
|
|
6
|
+
>>> from xparse_client import XParseClient
|
|
7
|
+
>>> from xparse_client.connectors import LocalSource, MilvusDestination
|
|
8
|
+
>>> from xparse_client.models import PipelineStage, ParseConfig
|
|
9
|
+
>>>
|
|
10
|
+
>>> client = XParseClient(app_id="...", secret_code="...")
|
|
11
|
+
>>>
|
|
12
|
+
>>> result = client.local.run_workflow(
|
|
13
|
+
... source=LocalSource(directory="./docs", pattern=["*.pdf"]),
|
|
14
|
+
... destination=MilvusDestination(
|
|
15
|
+
... db_path="./vectors.db",
|
|
16
|
+
... collection_name="documents",
|
|
17
|
+
... dimension=1024
|
|
18
|
+
... ),
|
|
19
|
+
... stages=[
|
|
20
|
+
... PipelineStage(type="parse", config=ParseConfig(provider="textin")),
|
|
21
|
+
... ]
|
|
22
|
+
... )
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from typing import TYPE_CHECKING, Callable, Literal
|
|
28
|
+
|
|
29
|
+
from .._base import BaseAPI
|
|
30
|
+
from ..models.local import FailedFile, WorkflowResult
|
|
31
|
+
from ..models.pipeline import PipelineStage
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
pass
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class Local(BaseAPI):
    """Local API - local batch processing.

    Runs a synchronous batch workflow locally: files are enumerated in the
    caller's process and the API is invoked once per file.

    Difference from ``workflows``:
    - ``local.run_workflow``: synchronous and blocking, needs a local Python
      process.
    - ``workflows``: asynchronous and non-blocking, executed on the server,
      supports cron scheduling.

    Attributes:
        _base_path: API path prefix.
    """

    _base_path = "/api/xparse"

    def run_workflow(
        self,
        source,  # Union[LocalSource, S3Source, FtpSource, SmbSource]
        destination,  # Union[LocalDestination, S3Destination, MilvusDestination, QdrantDestination]
        stages: list[PipelineStage],
        *,
        progress_callback: Callable[[int, int, str], None] | None = None,
        on_error: Literal["stop", "continue", "retry"] = "stop",
        max_retries: int = 3,
    ) -> WorkflowResult:
        """Run a synchronous local batch workflow.

        Iterates over the files of ``source``, calls the pipeline API for
        each one and writes the resulting elements to ``destination``. The
        call blocks until every file has been processed.

        For asynchronous execution, scheduled (cron) runs, or runs that do
        not need a local process, configure a server-side workflow through
        ``client.workflows`` instead.

        Args:
            source: Data source (local connector object).
            destination: Output destination (local connector object).
            stages: Processing stages.
            progress_callback: Progress callback
                ``(current, total, message) -> None``, called once per file
                before it is processed.
            on_error: Error strategy. ``"stop"`` re-raises the first
                failure. Any other value, including ``"retry"``, records
                the failure and continues with the next file; no file-level
                retry is performed here.
            max_retries: Accepted for API compatibility; this method does
                not read it. Retrying is left to the HTTP client.

        Returns:
            WorkflowResult: Totals, the failed files and the elapsed time.

        Example:
            >>> result = client.local.run_workflow(
            ...     source=LocalSource(directory="./docs"),
            ...     destination=MilvusDestination(...),
            ...     stages=[PipelineStage(type="parse", config=ParseConfig())]
            ... )
            >>> print(f"成功: {result.success}/{result.total}")

        See also:
            client.workflows.create() - create a remote workflow
            client.workflows.run() - run a remote workflow
        """
        import time
        from pathlib import Path

        # Kept as a function-level import like the original
        # (presumably to avoid an import cycle - TODO confirm).
        from .pipeline import PipelineAPI

        start_time = time.time()
        success = 0
        failed = 0
        failed_files = []

        files = source.list_files()
        total = len(files)

        # The pipeline API wrapper does not depend on the file, so build it
        # once instead of once per iteration.
        pipeline_api = PipelineAPI(self._config, self._http)

        for idx, file_path in enumerate(files, start=1):
            if progress_callback:
                progress_callback(idx, total, f"处理文件 {file_path}")

            try:
                file_bytes, data_source = source.read_file(file_path)

                # Only the last path component is sent as the file name.
                filename = Path(file_path).name

                result = pipeline_api.execute(
                    file=file_bytes,
                    filename=filename,
                    stages=stages,
                    data_source=data_source,
                )

                # Elements may be model objects or plain values; convert
                # models to dicts and pass anything else through unchanged.
                elements = getattr(result, "elements", None) or []
                elements_data = [
                    elem.model_dump() if hasattr(elem, "model_dump") else elem
                    for elem in elements
                ]

                metadata = {
                    "filename": filename,
                    "file_path": file_path,
                }
                if hasattr(result, "record_id"):
                    metadata["record_id"] = result.record_id

                destination.write(elements_data, metadata)
                success += 1

            except Exception as e:
                if on_error == "stop":
                    # Abort the whole run on the first failure.
                    raise

                # "continue" / "retry": record the failure and move on.
                failed += 1
                failed_files.append(FailedFile(
                    file_path=file_path,
                    error=str(e),
                    retry_count=0,  # retries are handled inside the HTTP client
                ))

        duration = time.time() - start_time

        return WorkflowResult(
            total=total,
            success=success,
            failed=failed,
            failed_files=failed_files,
            duration=duration,
        )
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
__all__ = ["Local"]
|