xparse-client 0.2.11__py3-none-any.whl → 0.3.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +215 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b3.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +1 -0
  69. example/run_pipeline.py +0 -506
  70. example/run_pipeline_test.py +0 -458
  71. xparse_client/pipeline/__init__.py +0 -3
  72. xparse_client/pipeline/config.py +0 -129
  73. xparse_client/pipeline/destinations.py +0 -487
  74. xparse_client/pipeline/pipeline.py +0 -622
  75. xparse_client/pipeline/sources.py +0 -585
  76. xparse_client-0.2.11.dist-info/METADATA +0 -1050
  77. xparse_client-0.2.11.dist-info/RECORD +0 -13
xparse_client/_http.py ADDED
@@ -0,0 +1,350 @@
1
+ """HTTP 客户端抽象层
2
+
3
+ 基于 httpx 的 HTTP 客户端封装。
4
+
5
+ Features:
6
+ - 自动重试(指数退避)
7
+ - 统一的错误处理
8
+ - 请求日志
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import time
15
+ from typing import Any
16
+
17
+ import httpx
18
+
19
+ from ._config import RetryConfiguration, SDKConfiguration
20
+ from .exceptions import (
21
+ APIError,
22
+ AuthenticationError,
23
+ NotFoundError,
24
+ PermissionDeniedError,
25
+ RateLimitError,
26
+ RequestTimeoutError,
27
+ ServerError,
28
+ ValidationError,
29
+ )
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
def _extract_error_message(response: httpx.Response) -> str:
    """Extract a human-readable error message from a response.

    Supported body formats:
    - JSON object holding one of the keys ``message``, ``error``, ``detail``,
      ``msg`` or ``error_message`` (checked in that order).
    - JSON object whose ``error`` value is itself an object: its ``message``
      entry is used when present, otherwise the nested object is stringified.
    - Any other JSON object: the whole object is stringified.
    - Non-JSON body: the stripped text, truncated to 500 characters.

    Args:
        response: HTTP response.

    Returns:
        The error message. Falls back to ``"HTTP <status_code>"`` when the
        body is empty or is JSON that is not an object.
    """
    try:
        data = response.json()
    except Exception:
        # Deliberately broad: this runs while an error is already being
        # reported, so any decode failure just falls back to the raw text.
        text = response.text.strip()
        if text:
            return text[:500]  # truncate overly long bodies
        return f"HTTP {response.status_code}"

    if isinstance(data, dict):
        # Try the common error fields in priority order.
        for key in ("message", "error", "detail", "msg", "error_message"):
            if key not in data:
                continue
            value = data[key]
            # A nested error object must be unwrapped here: returning
            # str(value) first would make the nested-message case unreachable.
            if key == "error" and isinstance(value, dict):
                return str(value.get("message", value))
            return str(value)
        return str(data)

    return f"HTTP {response.status_code}"
64
+
65
+
66
def _extract_request_id(response: httpx.Response) -> str | None:
    """Look up the request id in the response headers.

    Args:
        response: HTTP response.

    Returns:
        The value of the first request-id style header that is present,
        or None when none of the known headers exists.
    """
    # Known header spellings, probed in this order.
    candidates = ("x-request-id", "x-req-id", "request-id", "X-Request-Id")
    present = (
        response.headers[name] for name in candidates if name in response.headers
    )
    return next(present, None)
80
+
81
+
82
def raise_for_status(response: httpx.Response) -> None:
    """Raise the matching SDK exception for an HTTP or business-level failure.

    Two layers are checked:
    1. The HTTP status code (anything that is not a success status).
    2. The business ``code`` field of a JSON object body (HTTP success but
       ``code`` present and different from 200).

    Args:
        response: HTTP response.

    Raises:
        ValidationError: HTTP 400 or business code 400.
        AuthenticationError: HTTP 401 or business code 401.
        PermissionDeniedError: HTTP 403 or business code 403.
        NotFoundError: HTTP 404 or business code 404.
        RateLimitError: HTTP 429 (carries ``retry_after`` when the
            ``Retry-After`` header is a plain integer).
        ServerError: HTTP status or numeric business code >= 500.
        APIError: Any other failure, including non-numeric business codes.
    """
    # Layer 1: HTTP status code.
    if not response.is_success:
        status_code = response.status_code
        request_id = _extract_request_id(response)
        message = _extract_error_message(response)
        response_body = response.text[:1000] if response.text else None

        common_kwargs = {
            "status_code": status_code,
            "request_id": request_id,
            "response_body": response_body,
        }

        if status_code == 400:
            raise ValidationError(message, details=common_kwargs)
        elif status_code == 401:
            raise AuthenticationError(message, **common_kwargs)
        elif status_code == 403:
            raise PermissionDeniedError(message, **common_kwargs)
        elif status_code == 404:
            raise NotFoundError(message, **common_kwargs)
        elif status_code == 429:
            retry_after = response.headers.get("Retry-After")
            # Only the delta-seconds form is understood; HTTP-date values
            # are reported as "unknown" (None).
            retry_after_int = (
                int(retry_after) if retry_after and retry_after.isdigit() else None
            )
            raise RateLimitError(message, retry_after=retry_after_int, **common_kwargs)
        elif status_code >= 500:
            raise ServerError(message, **common_kwargs)
        else:
            raise APIError(message, **common_kwargs)

    # Layer 2: business code (HTTP success but the operation failed).
    # Only the JSON decode is guarded, so that exceptions raised below can
    # never be swallowed by this handler.
    # NOTE(review): whether any SDK exception derives from ValueError/KeyError
    # is not visible here; with the guard kept narrow it no longer matters.
    try:
        data = response.json()
    except (ValueError, KeyError):
        # Body is not JSON: treat the response as successful.
        return

    if not isinstance(data, dict):
        return

    code = data.get("code")
    if code is None or code == 200:
        return

    request_id = data.get("x_request_id") or _extract_request_id(response)
    message = data.get("message", f"业务错误 code: {code}")
    response_body = response.text[:1000] if response.text else None

    common_kwargs = {
        "request_id": request_id,
        "response_body": response_body,
    }

    # Map the business code onto the exception hierarchy.
    if code == 400:
        raise ValidationError(message, details=common_kwargs)
    elif code == 401:
        raise AuthenticationError(message, **common_kwargs)
    elif code == 403:
        raise PermissionDeniedError(message, **common_kwargs)
    elif code == 404:
        raise NotFoundError(message, **common_kwargs)
    elif isinstance(code, (int, float)) and code >= 500:
        # The numeric guard prevents a TypeError when the service returns a
        # non-numeric code such as "A0001".
        raise ServerError(message, **common_kwargs)
    else:
        raise APIError(message, **common_kwargs)
164
+
165
+
166
class HTTPClient:
    """HTTP client built on httpx with automatic retries and error mapping.

    Attributes:
        config: SDK configuration.
        retry_config: Retry configuration.
        _client: The underlying httpx.Client instance.

    Example:
        >>> config = SDKConfiguration(app_id="xxx", secret_code="xxx")
        >>> http = HTTPClient(config)
        >>> response = http.request("GET", "/api/xparse/parse")
    """

    def __init__(
        self,
        config: SDKConfiguration,
        retry_config: RetryConfiguration | None = None,
        client: httpx.Client | None = None,
    ) -> None:
        """Initialise the HTTP client.

        Args:
            config: SDK configuration.
            retry_config: Retry configuration (optional). When omitted it is
                derived from ``config``.
            client: Custom httpx.Client (optional, intended for tests).
        """
        self.config = config

        if retry_config:
            self.retry_config = retry_config
        else:
            self.retry_config = RetryConfiguration(
                max_retries=config.max_retries,
                backoff_base=config.backoff_base,
                backoff_max=config.backoff_max,
            )

        # Reuse the injected client, or build one from the configuration.
        if client:
            self._client = client
        else:
            self._client = httpx.Client(
                base_url=config.get_base_url(),
                headers=config.get_auth_headers(),
                timeout=httpx.Timeout(config.timeout),
            )
        # Only a client created here is closed by close().
        self._owns_client = client is None

    def request(
        self,
        method: str,
        path: str,
        *,
        json: dict[str, Any] | None = None,
        data: dict[str, Any] | None = None,
        files: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
        timeout: float | None = None,
    ) -> httpx.Response:
        """Send an HTTP request with automatic retries and error handling.

        Args:
            method: HTTP method.
            path: Request path.
            json: JSON request body.
            data: Form data.
            files: File data.
            params: Query parameters.
            headers: Extra request headers.
            timeout: Request timeout (overrides the default).

        Returns:
            httpx.Response

        Raises:
            APIError: The API call failed.
            RequestTimeoutError: The request timed out on the last attempt.
        """
        # Forward only the optional arguments the caller actually supplied.
        optional_args = {
            "json": json,
            "data": data,
            "files": files,
            "params": params,
            "headers": headers,
            "timeout": timeout,
        }
        request_kwargs: dict[str, Any] = {"method": method, "url": path}
        request_kwargs.update(
            (name, value) for name, value in optional_args.items() if value is not None
        )

        last_exception: Exception | None = None

        for attempt in range(self.retry_config.max_retries + 1):
            try:
                logger.debug(
                    f"HTTP {method} {path} (attempt {attempt + 1}/{self.retry_config.max_retries + 1})"
                )

                response = self._client.request(**request_kwargs)

                # No retry wanted: raise the mapped error or hand back the response.
                if not self.retry_config.should_retry(response.status_code, attempt):
                    raise_for_status(response)
                    return response

                # Retryable status: back off, then fall through to the next attempt.
                backoff = self.retry_config.calculate_backoff(attempt)
                logger.warning(
                    f"请求失败 (status={response.status_code}),{backoff:.1f}s 后重试"
                )
                time.sleep(backoff)

            except httpx.TimeoutException as e:
                last_exception = e
                if attempt >= self.retry_config.max_retries:
                    raise RequestTimeoutError(
                        f"请求超时: {path}",
                        timeout_seconds=timeout or self.config.timeout,
                    ) from e
                backoff = self.retry_config.calculate_backoff(attempt)
                logger.warning(f"请求超时,{backoff:.1f}s 后重试")
                time.sleep(backoff)

            except httpx.RequestError as e:
                last_exception = e
                if attempt >= self.retry_config.max_retries:
                    raise APIError(f"请求失败: {e}") from e
                backoff = self.retry_config.calculate_backoff(attempt)
                logger.warning(f"请求错误: {e},{backoff:.1f}s 后重试")
                time.sleep(backoff)

            except (
                AuthenticationError,
                PermissionDeniedError,
                NotFoundError,
                ValidationError,
            ):
                # These errors are never retried.
                raise

        # Reached only when every attempt ended in a retry decision.
        if last_exception:
            raise APIError(f"请求失败: {last_exception}") from last_exception
        raise APIError("请求失败: 未知错误")

    def get(self, path: str, **kwargs) -> httpx.Response:
        """Send a GET request."""
        return self.request("GET", path, **kwargs)

    def post(self, path: str, **kwargs) -> httpx.Response:
        """Send a POST request."""
        return self.request("POST", path, **kwargs)

    def put(self, path: str, **kwargs) -> httpx.Response:
        """Send a PUT request."""
        return self.request("PUT", path, **kwargs)

    def delete(self, path: str, **kwargs) -> httpx.Response:
        """Send a DELETE request."""
        return self.request("DELETE", path, **kwargs)

    def close(self) -> None:
        """Close the underlying client if this instance created it."""
        if not self._owns_client:
            return
        self._client.close()

    def __enter__(self) -> HTTPClient:
        return self

    def __exit__(self, *args) -> None:
        self.close()
345
+
346
+
347
+ __all__ = [
348
+ "HTTPClient",
349
+ "raise_for_status",
350
+ ]
@@ -0,0 +1,14 @@
1
+ """API 模块
2
+
3
+ 提供各种 API 的实现类。
4
+ """
5
+
6
+ from .extract import Extract
7
+ from .parse import Parse
8
+ from .pipeline import PipelineAPI
9
+
10
+ __all__ = [
11
+ "Parse",
12
+ "Extract",
13
+ "PipelineAPI",
14
+ ]
@@ -0,0 +1,109 @@
1
+ """Extract API - 信息抽取
2
+
3
+ 执行 parse + extract Pipeline,用于结构化信息抽取。
4
+
5
+ Example:
6
+ >>> result = client.extract.extract(
7
+ ... file=file_bytes,
8
+ ... filename="invoice.pdf",
9
+ ... extract_config=ExtractConfig(schema={"type": "object", ...})
10
+ ... )
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ from typing import TYPE_CHECKING
17
+
18
+ from .._base import BaseAPI
19
+ from ..models.extract import ExtractConfig
20
+ from ..models.parse import ParseConfig, ParseResponse
21
+
22
+ if TYPE_CHECKING:
23
+ pass
24
+
25
+
26
class Extract(BaseAPI):
    """Extract API - structured information extraction.

    Runs the parse + extract pipeline to pull structured information out of
    a document.

    Example:
        >>> schema = {
        ...     "type": "object",
        ...     "properties": {
        ...         "invoice_number": {"type": "string"},
        ...         "total_amount": {"type": "number"}
        ...     }
        ... }
        >>> result = client.extract.extract(
        ...     file=file_bytes,
        ...     filename="invoice.pdf",
        ...     extract_config=ExtractConfig(schema=schema)
        ... )
    """

    _base_path = "/api/xparse"

    def extract(
        self,
        *,
        file: bytes,
        filename: str,
        parse_config: ParseConfig | None = None,
        extract_config: ExtractConfig | None = None,
    ) -> ParseResponse:
        """Synchronously extract structured information from a document.

        Args:
            file: File content.
            filename: File name.
            parse_config: Parse configuration (optional).
            extract_config: Extraction configuration (required; carries the
                schema).

        Returns:
            ParseResponse: The extraction result.

        Raises:
            ValueError: If ``extract_config`` is not provided.

        Example:
            >>> result = client.extract.extract(
            ...     file=file_bytes,
            ...     filename="invoice.pdf",
            ...     extract_config=ExtractConfig(
            ...         schema={
            ...             "type": "object",
            ...             "properties": {
            ...                 "invoice_number": {"type": "string"},
            ...                 "total_amount": {"type": "number"}
            ...             }
            ...         },
            ...         generate_citations=True
            ...     )
            ... )
        """
        # Each configuration travels as a JSON string in a multipart form field.
        form_fields = {}
        if parse_config:
            form_fields["parse_config"] = json.dumps(
                parse_config.model_dump(), ensure_ascii=False
            )

        if not extract_config:
            raise ValueError("extract_config is required")
        form_fields["extract_config"] = json.dumps(
            extract_config.model_dump(), ensure_ascii=False
        )

        upload = {"file": (filename, file)}
        response = self._post("/extract/sync", files=upload, data=form_fields)
        return self._parse_response(response, ParseResponse)
106
+
107
+
108
+
109
+ __all__ = ["Extract"]
@@ -0,0 +1,215 @@
1
+ """Local API - 本地批处理
2
+
3
+ 提供本地同步批处理工作流执行功能。
4
+
5
+ Example:
6
+ >>> from xparse_client import XParseClient
7
+ >>> from xparse_client.connectors import LocalSource, MilvusDestination
8
+ >>> from xparse_client.models import PipelineStage, ParseConfig
9
+ >>>
10
+ >>> client = XParseClient(app_id="...", secret_code="...")
11
+ >>>
12
+ >>> result = client.local.run_workflow(
13
+ ... source=LocalSource(directory="./docs", pattern=["*.pdf"]),
14
+ ... destination=MilvusDestination(
15
+ ... db_path="./vectors.db",
16
+ ... collection_name="documents",
17
+ ... dimension=1024
18
+ ... ),
19
+ ... stages=[
20
+ ... PipelineStage(type="parse", config=ParseConfig(provider="textin")),
21
+ ... ]
22
+ ... )
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from typing import TYPE_CHECKING, Callable, Literal
28
+
29
+ from .._base import BaseAPI
30
+ from ..models.local import FailedFile, WorkflowResult
31
+ from ..models.pipeline import PipelineConfig, PipelineStage
32
+
33
+ if TYPE_CHECKING:
34
+ pass
35
+
36
+
37
+ class Local(BaseAPI):
38
+ """Local API - 本地批处理
39
+
40
+ 提供本地同步批处理工作流执行,在用户代码中遍历文件并调用 API。
41
+
42
+ 与 workflows 的区别:
43
+ - local.run_workflow: 同步阻塞执行,需要本地运行 Python
44
+ - workflows: 异步非阻塞,在服务端执行,支持 cron 定时
45
+
46
+ Attributes:
47
+ _base_path: API 路径前缀
48
+ """
49
+
50
+ _base_path = "/api/xparse"
51
+
52
+ def run_workflow(
53
+ self,
54
+ source, # Union[LocalSource, S3Source, FtpSource, SmbSource]
55
+ destination, # Union[LocalDestination, S3Destination, MilvusDestination, QdrantDestination]
56
+ stages: list[PipelineStage],
57
+ *,
58
+ pipeline_config: PipelineConfig | None = None,
59
+ progress_callback: Callable[[int, int, str], None] | None = None,
60
+ on_error: Literal["stop", "continue", "retry"] = "stop",
61
+ max_retries: int = 3,
62
+ ) -> WorkflowResult:
63
+ """执行本地同步批处理工作流
64
+
65
+ 在你的代码中遍历 source 文件,逐个调用 API,写入 destination。
66
+ 这是同步执行的,会阻塞直到所有文件处理完成。
67
+
68
+ 如果需要:
69
+ - 异步执行(不阻塞)
70
+ - 定时任务(cron)
71
+ - 无需本地常驻
72
+
73
+ 请使用 client.workflows 在服务端配置工作流。
74
+
75
+ Args:
76
+ source: 数据源(本地 Connector 对象)
77
+ destination: 输出目的地(本地 Connector 对象)
78
+ stages: 处理阶段列表
79
+ pipeline_config: Pipeline 配置(包含中间结果保存配置)
80
+ progress_callback: 进度回调函数 (current, total, message) -> None
81
+ on_error: 错误处理策略 ("stop"|"continue"|"retry")
82
+ max_retries: 最大重试次数(on_error="retry" 时生效)
83
+
84
+ Returns:
85
+ WorkflowResult: 工作流执行结果
86
+
87
+ Example:
88
+ >>> result = client.local.run_workflow(
89
+ ... source=LocalSource(directory="./docs"),
90
+ ... destination=MilvusDestination(...),
91
+ ... stages=[PipelineStage(type="parse", config=ParseConfig())]
92
+ ... )
93
+ >>> print(f"成功: {result.success}/{result.total}")
94
+
95
+ 参考:
96
+ client.workflows.create() - 创建远程工作流
97
+ client.workflows.run() - 运行远程工作流
98
+ """
99
+ import time
100
+ from pathlib import Path
101
+
102
+ from ..exceptions import APIError
103
+
104
+ # 统计信息
105
+ start_time = time.time()
106
+ total = 0
107
+ success = 0
108
+ failed = 0
109
+ failed_files = []
110
+
111
+ # 列出所有文件
112
+ files = source.list_files()
113
+ total = len(files)
114
+
115
+ # 处理每个文件
116
+ for idx, file_path in enumerate(files, start=1):
117
+ # 进度回调
118
+ if progress_callback:
119
+ progress_callback(idx, total, f"处理文件 {file_path}")
120
+
121
+ try:
122
+ # 读取文件
123
+ file_bytes, data_source = source.read_file(file_path)
124
+
125
+ # 获取文件名(只取最后一部分)
126
+ filename = Path(file_path).name
127
+
128
+ # 调用 Pipeline API
129
+ from .pipeline import PipelineAPI
130
+
131
+ pipeline_api = PipelineAPI(self._config, self._http)
132
+ result = pipeline_api.execute(
133
+ file=file_bytes,
134
+ filename=filename,
135
+ stages=stages,
136
+ config=pipeline_config,
137
+ data_source=data_source,
138
+ )
139
+
140
+ # 处理中间结果
141
+ if (
142
+ pipeline_config
143
+ and pipeline_config.include_intermediate_results
144
+ and pipeline_config.intermediate_results_destination
145
+ and hasattr(result, "intermediate_results")
146
+ and result.intermediate_results
147
+ ):
148
+ for stage_result in result.intermediate_results:
149
+ stage_name = stage_result.get("stage")
150
+ elements = stage_result.get("elements", [])
151
+ if stage_name and elements:
152
+ metadata_with_stage = {
153
+ "filename": filename,
154
+ "file_path": file_path,
155
+ "stage": stage_name,
156
+ }
157
+ if result.stats and hasattr(result.stats, "record_id"):
158
+ metadata_with_stage["record_id"] = result.stats.record_id
159
+
160
+ pipeline_config.intermediate_results_destination.write(
161
+ elements, metadata_with_stage
162
+ )
163
+
164
+ # 准备写入数据
165
+ # 如果是向量数据库,写入 embeddings
166
+ if hasattr(result, "elements") and result.elements:
167
+ elements_data = []
168
+ for elem in result.elements:
169
+ elem_dict = elem.model_dump() if hasattr(elem, "model_dump") else elem
170
+ elements_data.append(elem_dict)
171
+ else:
172
+ elements_data = []
173
+
174
+ # 写入目的地
175
+ metadata = {
176
+ "filename": filename,
177
+ "file_path": file_path,
178
+ }
179
+ if result.stats and hasattr(result.stats, "record_id"):
180
+ metadata["record_id"] = result.stats.record_id
181
+
182
+ destination.write(elements_data, metadata)
183
+
184
+ success += 1
185
+
186
+ except (APIError, Exception) as e:
187
+ error_msg = str(e)
188
+
189
+ if on_error == "stop":
190
+ # 立即抛出异常,停止处理
191
+ raise
192
+ else:
193
+ # continue: 记录失败并继续下一个文件
194
+ failed += 1
195
+ failed_files.append(
196
+ FailedFile(
197
+ file_path=file_path,
198
+ error=error_msg,
199
+ retry_count=0, # HTTP客户端内部已经处理了重试
200
+ )
201
+ )
202
+
203
+ # 计算总耗时
204
+ duration = time.time() - start_time
205
+
206
+ return WorkflowResult(
207
+ total=total,
208
+ success=success,
209
+ failed=failed,
210
+ failed_files=failed_files,
211
+ duration=duration,
212
+ )
213
+
214
+
215
+ __all__ = ["Local"]