xparse-client 0.2.19__py3-none-any.whl → 0.3.0b8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +188 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +351 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +225 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +132 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b8.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b8.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/top_level.txt +2 -0
  69. xparse_client/pipeline/__init__.py +0 -3
  70. xparse_client/pipeline/config.py +0 -129
  71. xparse_client/pipeline/destinations.py +0 -489
  72. xparse_client/pipeline/pipeline.py +0 -690
  73. xparse_client/pipeline/sources.py +0 -583
  74. xparse_client-0.2.19.dist-info/METADATA +0 -1050
  75. xparse_client-0.2.19.dist-info/RECORD +0 -11
xparse_client/_http.py ADDED
@@ -0,0 +1,351 @@
1
+ """HTTP 客户端抽象层
2
+
3
+ 基于 httpx 的 HTTP 客户端封装。
4
+
5
+ Features:
6
+ - 自动重试(指数退避)
7
+ - 统一的错误处理
8
+ - 请求日志
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import time
15
+ from typing import Any
16
+
17
+ import httpx
18
+
19
+ from ._config import RetryConfiguration, SDKConfiguration
20
+ from .exceptions import (
21
+ APIError,
22
+ AuthenticationError,
23
+ NotFoundError,
24
+ PermissionDeniedError,
25
+ RateLimitError,
26
+ RequestTimeoutError,
27
+ ServerError,
28
+ ValidationError,
29
+ )
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ def _extract_error_message(response: httpx.Response) -> str:
35
+ """从响应中提取错误信息
36
+
37
+ 支持多种响应格式:
38
+ - JSON: {"message": "..."} 或 {"error": "..."} 或 {"detail": "..."}
39
+ - 纯文本
40
+
41
+ Args:
42
+ response: HTTP 响应
43
+
44
+ Returns:
45
+ 错误信息字符串
46
+ """
47
+ try:
48
+ data = response.json()
49
+ if isinstance(data, dict):
50
+ # 尝试多种常见的错误字段
51
+ for key in ("message", "error", "detail", "msg", "error_message"):
52
+ if key in data:
53
+ return str(data[key])
54
+ # 如果有嵌套的 error 对象
55
+ if "error" in data and isinstance(data["error"], dict):
56
+ error_dict = data["error"]
57
+ return str(error_dict.get("message", error_dict))
58
+ return str(data)
59
+ except Exception:
60
+ # 非 JSON 响应,返回文本内容
61
+ text = response.text.strip()
62
+ if text:
63
+ return text[:500] # 截断过长的响应
64
+ return f"HTTP {response.status_code}"
65
+
66
+
67
+ def _extract_request_id(response: httpx.Response) -> str | None:
68
+ """从响应头中提取 request_id
69
+
70
+ Args:
71
+ response: HTTP 响应
72
+
73
+ Returns:
74
+ request_id 或 None
75
+ """
76
+ # 尝试多种常见的 request id 头
77
+ for header in ("x-request-id", "x-req-id", "request-id", "X-Request-Id"):
78
+ if header in response.headers:
79
+ return response.headers[header]
80
+ return None
81
+
82
+
83
def raise_for_status(response: httpx.Response) -> None:
    """Raise the SDK exception that matches a failed response.

    Two layers are checked:
        1. The HTTP status code (anything for which ``is_success`` is false).
        2. The business ``code`` field of a JSON object body, for responses
           that succeeded at the HTTP level but carry a ``code`` other
           than 200.

    A body that is not JSON, is not a JSON object, or has no ``code`` field
    is treated as success.

    Args:
        response: HTTP response to validate.

    Raises:
        ValidationError: HTTP status 400 or business code 400.
        AuthenticationError: HTTP status 401 or business code 401.
        PermissionDeniedError: HTTP status 403 or business code 403.
        NotFoundError: HTTP status 404 or business code 404.
        RateLimitError: HTTP status 429.
        ServerError: HTTP status or numeric business code >= 500.
        APIError: Any other failure, including non-numeric business codes.
    """
    # Layer 1: HTTP status code.
    if not response.is_success:
        status_code = response.status_code
        request_id = _extract_request_id(response)
        message = _extract_error_message(response)
        response_body = response.text[:1000] if response.text else None

        common_kwargs: dict[str, Any] = {
            "status_code": status_code,
            "request_id": request_id,
            "response_body": response_body,
        }

        if status_code == 400:
            raise ValidationError(message, details=common_kwargs)
        if status_code == 401:
            raise AuthenticationError(message, **common_kwargs)
        if status_code == 403:
            raise PermissionDeniedError(message, **common_kwargs)
        if status_code == 404:
            raise NotFoundError(message, **common_kwargs)
        if status_code == 429:
            # Only the delay-in-seconds form of Retry-After is parsed; any
            # other value (for example an HTTP date) yields None.
            retry_after = response.headers.get("Retry-After")
            retry_after_int = (
                int(retry_after) if retry_after and retry_after.isdigit() else None
            )
            raise RateLimitError(message, retry_after=retry_after_int, **common_kwargs)
        if status_code >= 500:
            raise ServerError(message, **common_kwargs)
        raise APIError(message, **common_kwargs)

    # Layer 2: business code inside a successful HTTP response.
    # Only the JSON decoding is guarded, so an SDK exception raised further
    # down can never be swallowed by this handler, whatever its base class.
    try:
        data = response.json()
    except ValueError:
        return  # body is not JSON, so there is no business code to inspect

    if not isinstance(data, dict):
        return

    code = data.get("code")
    if code is None or code == 200:
        return

    request_id = data.get("x_request_id") or _extract_request_id(response)
    message = data.get("message", f"业务错误 code: {code}")
    response_body = response.text[:1000] if response.text else None

    error_kwargs: dict[str, Any] = {
        "request_id": request_id,
        "response_body": response_body,
    }

    # Map the business code onto the matching exception type.
    if code == 400:
        raise ValidationError(message, details=error_kwargs)
    if code == 401:
        raise AuthenticationError(message, **error_kwargs)
    if code == 403:
        raise PermissionDeniedError(message, **error_kwargs)
    if code == 404:
        raise NotFoundError(message, **error_kwargs)
    # A non-numeric code (for example a string) cannot be ordered against
    # 500; it is reported as a generic APIError instead of a TypeError.
    if isinstance(code, (int, float)) and code >= 500:
        raise ServerError(message, **error_kwargs)
    raise APIError(message, **error_kwargs)
165
+
166
+
167
class HTTPClient:
    """HTTP client built on httpx with automatic retries and error mapping.

    Attributes:
        config: SDK configuration.
        retry_config: Retry configuration.
        _client: Underlying ``httpx.Client`` instance.
        _owns_client: ``True`` when ``_client`` was created by this object
            (no client was passed in); only then does ``close`` close it.

    Example:
        >>> config = SDKConfiguration(app_id="xxx", secret_code="xxx")
        >>> http = HTTPClient(config)
        >>> response = http.request("GET", "/api/xparse/parse")
    """

    def __init__(
        self,
        config: SDKConfiguration,
        retry_config: RetryConfiguration | None = None,
        client: httpx.Client | None = None,
    ) -> None:
        """Initialise the HTTP client.

        Args:
            config: SDK configuration.
            retry_config: Retry configuration (optional). When omitted, one
                is built from ``config.max_retries``, ``config.backoff_base``
                and ``config.backoff_max``.
            client: Custom ``httpx.Client`` (optional, intended for tests).
                When omitted, a client is created from the base URL, auth
                headers and timeout supplied by ``config``.
        """
        self.config = config

        if not retry_config:
            retry_config = RetryConfiguration(
                max_retries=config.max_retries,
                backoff_base=config.backoff_base,
                backoff_max=config.backoff_max,
            )
        self.retry_config = retry_config

        # Reuse the injected client, or build a fresh one from the config.
        if client:
            self._client = client
        else:
            self._client = httpx.Client(
                base_url=config.get_base_url(),
                headers=config.get_auth_headers(),
                timeout=httpx.Timeout(config.timeout),
            )
        self._owns_client = client is None

    def request(
        self,
        method: str,
        path: str,
        *,
        json: dict[str, Any] | None = None,
        data: dict[str, Any] | None = None,
        files: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
        timeout: float | None = None,
    ) -> httpx.Response:
        """Send an HTTP request with automatic retries and error handling.

        Optional arguments left as ``None`` are not forwarded to the
        underlying client at all.

        Args:
            method: HTTP method.
            path: Request path.
            json: JSON request body.
            data: Form data.
            files: File payload.
            params: Query parameters.
            headers: Additional request headers.
            timeout: Per-request timeout (overrides the default).

        Returns:
            The ``httpx.Response`` that passed ``raise_for_status``.

        Raises:
            APIError: The API call failed.
            RequestTimeoutError: The request timed out on every attempt.
        """
        optional_args = {
            "json": json,
            "data": data,
            "files": files,
            "params": params,
            "headers": headers,
            "timeout": timeout,
        }
        call_kwargs: dict[str, Any] = {"method": method, "url": path}
        call_kwargs.update(
            (name, value) for name, value in optional_args.items() if value is not None
        )

        retry = self.retry_config
        last_error: Exception | None = None

        for attempt in range(retry.max_retries + 1):
            try:
                logger.debug(
                    f"HTTP {method} {path} (attempt {attempt + 1}/{self.retry_config.max_retries + 1})"
                )

                response = self._client.request(**call_kwargs)

                if not retry.should_retry(response.status_code, attempt):
                    # Final answer for this call: raise on error, else return.
                    raise_for_status(response)
                    return response

                # The retry policy asked for another attempt.
                delay = retry.calculate_backoff(attempt)
                logger.warning(
                    f"请求失败 (status={response.status_code}),{delay:.1f}s 后重试"
                )
                time.sleep(delay)

            except httpx.TimeoutException as exc:
                last_error = exc
                if attempt >= retry.max_retries:
                    raise RequestTimeoutError(
                        f"请求超时: {path}",
                        timeout_seconds=timeout or self.config.timeout,
                    ) from exc
                delay = retry.calculate_backoff(attempt)
                logger.warning(f"请求超时,{delay:.1f}s 后重试")
                time.sleep(delay)

            except httpx.RequestError as exc:
                last_error = exc
                if attempt >= retry.max_retries:
                    raise APIError(f"请求失败: {exc}") from exc
                delay = retry.calculate_backoff(attempt)
                logger.warning(f"请求错误: {exc},{delay:.1f}s 后重试")
                time.sleep(delay)

            except (
                AuthenticationError,
                PermissionDeniedError,
                NotFoundError,
                ValidationError,
            ):
                # These errors are never retried.
                raise

        # Reached only when every attempt ended in a retry decision.
        if last_error:
            raise APIError(f"请求失败: {last_error}") from last_error
        raise APIError("请求失败: 未知错误")

    def get(self, path: str, **kwargs) -> httpx.Response:
        """Send a GET request; keyword arguments are passed to ``request``."""
        return self.request("GET", path, **kwargs)

    def post(self, path: str, **kwargs) -> httpx.Response:
        """Send a POST request; keyword arguments are passed to ``request``."""
        return self.request("POST", path, **kwargs)

    def put(self, path: str, **kwargs) -> httpx.Response:
        """Send a PUT request; keyword arguments are passed to ``request``."""
        return self.request("PUT", path, **kwargs)

    def delete(self, path: str, **kwargs) -> httpx.Response:
        """Send a DELETE request; keyword arguments are passed to ``request``."""
        return self.request("DELETE", path, **kwargs)

    def close(self) -> None:
        """Close the underlying client if this object created it."""
        if self._owns_client:
            self._client.close()

    def __enter__(self) -> HTTPClient:
        return self

    def __exit__(self, *args) -> None:
        self.close()
346
+
347
+
348
+ __all__ = [
349
+ "HTTPClient",
350
+ "raise_for_status",
351
+ ]
@@ -0,0 +1,14 @@
1
+ """API 模块
2
+
3
+ 提供各种 API 的实现类。
4
+ """
5
+
6
+ from .extract import Extract
7
+ from .parse import Parse
8
+ from .pipeline import PipelineAPI
9
+
10
+ __all__ = [
11
+ "Parse",
12
+ "Extract",
13
+ "PipelineAPI",
14
+ ]
@@ -0,0 +1,109 @@
1
+ """Extract API - 信息抽取
2
+
3
+ 执行 parse + extract Pipeline,用于结构化信息抽取。
4
+
5
+ Example:
6
+ >>> result = client.extract.extract(
7
+ ... file=file_bytes,
8
+ ... filename="invoice.pdf",
9
+ ... extract_config=ExtractConfig(schema={"type": "object", ...})
10
+ ... )
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ from typing import TYPE_CHECKING
17
+
18
+ from .._base import BaseAPI
19
+ from ..models.extract import ExtractConfig
20
+ from ..models.parse import ParseConfig, ParseResponse
21
+
22
+ if TYPE_CHECKING:
23
+ pass
24
+
25
+
26
class Extract(BaseAPI):
    """Extract API - structured information extraction.

    Runs the parse + extract pipeline to pull structured information out of
    a document.

    Example:
        >>> schema = {
        ...     "type": "object",
        ...     "properties": {
        ...         "invoice_number": {"type": "string"},
        ...         "total_amount": {"type": "number"}
        ...     }
        ... }
        >>> result = client.extract.extract(
        ...     file=file_bytes,
        ...     filename="invoice.pdf",
        ...     extract_config=ExtractConfig(schema=schema)
        ... )
    """

    _base_path = "/api/xparse"

    def extract(
        self,
        *,
        file: bytes,
        filename: str,
        parse_config: ParseConfig | None = None,
        extract_config: ExtractConfig | None = None,
    ) -> ParseResponse:
        """Extract structured information from a document synchronously.

        Each supplied config is serialised with ``model_dump`` and sent as a
        JSON string form field next to the uploaded file.

        Args:
            file: File content.
            filename: File name.
            parse_config: Parse configuration (optional).
            extract_config: Extraction configuration (required; carries the
                schema).

        Returns:
            ParseResponse: The extraction result.

        Raises:
            ValueError: If ``extract_config`` is not provided.

        Example:
            >>> result = client.extract.extract(
            ...     file=file_bytes,
            ...     filename="invoice.pdf",
            ...     extract_config=ExtractConfig(
            ...         schema={
            ...             "type": "object",
            ...             "properties": {
            ...                 "invoice_number": {"type": "string"},
            ...                 "total_amount": {"type": "number"}
            ...             }
            ...         },
            ...         generate_citations=True
            ...     )
            ... )
        """
        form_fields = {}

        if parse_config:
            form_fields["parse_config"] = json.dumps(
                parse_config.model_dump(), ensure_ascii=False
            )

        # The extraction config is mandatory even though the signature
        # defaults it to None.
        if not extract_config:
            raise ValueError("extract_config is required")
        form_fields["extract_config"] = json.dumps(
            extract_config.model_dump(), ensure_ascii=False
        )

        upload = {"file": (filename, file)}
        response = self._post("/extract/sync", files=upload, data=form_fields)
        return self._parse_response(response, ParseResponse)
106
+
107
+
108
+
109
+ __all__ = ["Extract"]
@@ -0,0 +1,225 @@
1
+ """Local API - 本地批处理
2
+
3
+ 提供本地同步批处理工作流执行功能。
4
+
5
+ Example:
6
+ >>> from xparse_client import XParseClient
7
+ >>> from xparse_client.connectors import LocalSource, MilvusDestination
8
+ >>> from xparse_client.models import PipelineStage, ParseConfig
9
+ >>>
10
+ >>> client = XParseClient(app_id="...", secret_code="...")
11
+ >>>
12
+ >>> result = client.local.run_workflow(
13
+ ... source=LocalSource(directory="./docs", pattern=["*.pdf"]),
14
+ ... destination=MilvusDestination(
15
+ ... db_path="./vectors.db",
16
+ ... collection_name="documents",
17
+ ... dimension=1024
18
+ ... ),
19
+ ... stages=[
20
+ ... PipelineStage(type="parse", config=ParseConfig(provider="textin")),
21
+ ... ]
22
+ ... )
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from typing import TYPE_CHECKING, Any, Callable, Literal
28
+
29
+ from .._base import BaseAPI
30
+ from ..models.local import FailedFile, WorkflowResult
31
+ from ..models.pipeline import PipelineConfig, PipelineStage
32
+
33
+ if TYPE_CHECKING:
34
+ pass
35
+
36
+
37
class Local(BaseAPI):
    """Local API - local batch processing.

    Runs a synchronous batch workflow locally: the files are iterated in the
    caller's process and the API is invoked once per file.

    Difference from ``workflows``:
        - ``local.run_workflow``: synchronous and blocking; needs a locally
          running Python process.
        - ``workflows``: asynchronous and non-blocking; executed on the
          server side, with cron scheduling support.

    Attributes:
        _base_path: API path prefix.
    """

    _base_path = "/api/xparse"

    def run_workflow(
        self,
        source,  # Union[LocalSource, S3Source, FtpSource, SmbSource]
        destination,  # Union[LocalDestination, S3Destination, MilvusDestination, QdrantDestination]
        stages: list[PipelineStage | dict[str, Any]],
        *,
        pipeline_config: PipelineConfig | None = None,
        progress_callback: Callable[[int, int, str], None] | None = None,
        on_error: Literal["stop", "continue", "retry"] = "stop",
        max_retries: int = 3,
    ) -> WorkflowResult:
        """Run a local, synchronous batch workflow.

        Every file listed by ``source`` is read, sent through the Pipeline
        API and written to ``destination``. The call blocks until all files
        have been processed.

        For asynchronous execution, cron-style scheduling, or running
        without a local process, configure a server-side workflow through
        ``client.workflows`` instead.

        Args:
            source: Data source (local connector object).
            destination: Output destination (local connector object).
            stages: List of processing stages.
            pipeline_config: Pipeline configuration (includes the settings
                for saving intermediate results).
            progress_callback: Progress callback
                ``(current, total, message) -> None``; called before each
                file is processed.
            on_error: Error handling strategy. ``"stop"`` re-raises the
                first failure. Any other value records the failure and moves
                on to the next file; in this implementation ``"retry"`` is
                handled exactly like ``"continue"``.
            max_retries: Maximum retry count for ``on_error="retry"``. This
                implementation does not read it; failed files are always
                recorded with ``retry_count=0``.

        Returns:
            WorkflowResult: Totals, the list of failed files and the elapsed
            time in seconds.

        Example:
            >>> result = client.local.run_workflow(
            ...     source=LocalSource(directory="./docs"),
            ...     destination=MilvusDestination(...),
            ...     stages=[PipelineStage(type="parse", config=ParseConfig())]
            ... )
            >>> print(f"成功: {result.success}/{result.total}")

        See also:
            client.workflows.create() - create a remote workflow
            client.workflows.run() - run a remote workflow
        """
        import time
        from pathlib import Path

        from ..exceptions import APIError

        def to_plain(items):
            # Items exposing ``model_dump`` are dumped; anything else is
            # passed through unchanged.
            return [
                item.model_dump() if hasattr(item, "model_dump") else item
                for item in items
            ]

        def describe(name, path, outcome, **extra):
            # Metadata handed to a destination together with the elements.
            info = {"filename": name, "file_path": path, **extra}
            if outcome.stats and hasattr(outcome.stats, "record_id"):
                info["record_id"] = outcome.stats.record_id
            return info

        started_at = time.time()
        succeeded = 0
        failures = []

        # Enumerate every file up front so the total is known.
        paths = source.list_files()
        file_count = len(paths)

        for position, file_path in enumerate(paths, start=1):
            if progress_callback:
                progress_callback(position, file_count, f"处理文件 {file_path}")

            try:
                content, data_source = source.read_file(file_path)

                # Only the last path component is used as the file name.
                filename = Path(file_path).name

                from .pipeline import PipelineAPI

                outcome = PipelineAPI(self._config, self._http).execute(
                    file=content,
                    filename=filename,
                    stages=stages,
                    config=pipeline_config,
                    data_source=data_source,
                )

                # Forward intermediate stage results when requested.
                wants_intermediate = (
                    pipeline_config
                    and pipeline_config.include_intermediate_results
                    and pipeline_config.intermediate_results_destination
                    and hasattr(outcome, "intermediate_results")
                    and outcome.intermediate_results
                )
                if wants_intermediate:
                    for stage_result in outcome.intermediate_results:
                        stage_name = stage_result.get("stage")
                        stage_elements = stage_result.get("elements", [])
                        if not (stage_name and stage_elements):
                            continue
                        pipeline_config.intermediate_results_destination.write(
                            to_plain(stage_elements),
                            describe(filename, file_path, outcome, stage=stage_name),
                        )

                # Write the final elements (an empty list when there are none).
                final_elements = getattr(outcome, "elements", None)
                payload = to_plain(final_elements) if final_elements else []
                destination.write(payload, describe(filename, file_path, outcome))

                succeeded += 1

            except (APIError, Exception) as exc:
                reason = str(exc)

                if on_error == "stop":
                    # Abort the whole run on the first failure.
                    raise

                # Record the failure and carry on with the next file.
                failures.append(
                    FailedFile(
                        file_path=file_path,
                        error=reason,
                        retry_count=0,  # retries already happen inside the HTTP client
                    )
                )

        return WorkflowResult(
            total=file_count,
            success=succeeded,
            failed=len(failures),
            failed_files=failures,
            duration=time.time() - started_at,
        )
223
+
224
+
225
+ __all__ = ["Local"]