xparse-client 0.3.0b1__tar.gz → 0.3.0b6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xparse_client-0.3.0b1/xparse_client.egg-info → xparse_client-0.3.0b6}/PKG-INFO +4 -4
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/README.md +3 -3
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/example/2_async_job.py +1 -1
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/example/4_advanced_workflow.py +3 -3
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/pyproject.toml +1 -1
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/models/test_pipeline_stages.py +2 -2
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/_base.py +15 -6
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/_http.py +10 -9
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/api/extract.py +4 -8
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/api/local.py +52 -12
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/api/pipeline.py +3 -1
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/qdrant.py +1 -1
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/sources/ftp.py +2 -2
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/sources/smb.py +1 -1
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/extract.py +5 -3
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/parse.py +2 -6
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/pipeline.py +3 -1
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6/xparse_client.egg-info}/PKG-INFO +4 -4
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/CHANGELOG.md +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/LICENSE +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/MANIFEST.in +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/example/1_basic_api_usage.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/example/3_local_workflow.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/example/README.md +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/example/config_example.json +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/setup.cfg +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/conftest.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/__init__.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/api/__init__.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/api/test_extract.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/api/test_local.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/api/test_parse.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/api/test_pipeline.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/api/test_workflows.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/connectors/test_ftp.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/connectors/test_local_connectors.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/connectors/test_milvus.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/connectors/test_qdrant.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/connectors/test_s3.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/connectors/test_smb.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/connectors/test_utils.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/models/test_local.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/models/test_workflows.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/test_base.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/test_client.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/test_config.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/test_exceptions.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/test_http.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/__init__.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/_client.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/_config.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/api/__init__.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/api/parse.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/api/workflows.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/__init__.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/_utils.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/__init__.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/base.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/local.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/milvus.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/s3.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/sources/__init__.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/sources/base.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/sources/local.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/sources/s3.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/exceptions.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/__init__.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/chunk.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/embed.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/local.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/workflows.py +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client.egg-info/SOURCES.txt +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client.egg-info/dependency_links.txt +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client.egg-info/requires.txt +0 -0
- {xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xparse-client
|
|
3
|
-
Version: 0.3.0b1
|
|
3
|
+
Version: 0.3.0b6
|
|
4
4
|
Summary: 面向 Agent 和 RAG 的文档处理 Pipeline 客户端
|
|
5
5
|
Author-email: INTSIG-TEXTIN <support@textin.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -213,17 +213,17 @@ schema = {
|
|
|
213
213
|
"author": {"type": "string", "description": "作者"},
|
|
214
214
|
"date": {"type": "string", "description": "日期"}
|
|
215
215
|
},
|
|
216
|
-
"required": ["title"]
|
|
216
|
+
"required": ["title", "author", "date"]
|
|
217
217
|
}
|
|
218
218
|
|
|
219
219
|
with open("document.pdf", "rb") as f:
|
|
220
220
|
result = client.extract.extract(
|
|
221
221
|
file=f,
|
|
222
222
|
filename="document.pdf",
|
|
223
|
-
|
|
223
|
+
extract_config=ExtractConfig(schema=schema)
|
|
224
224
|
)
|
|
225
225
|
|
|
226
|
-
print(result.
|
|
226
|
+
print(result.result)
|
|
227
227
|
```
|
|
228
228
|
|
|
229
229
|
### 3. 本地批处理
|
|
@@ -164,17 +164,17 @@ schema = {
|
|
|
164
164
|
"author": {"type": "string", "description": "作者"},
|
|
165
165
|
"date": {"type": "string", "description": "日期"}
|
|
166
166
|
},
|
|
167
|
-
"required": ["title"]
|
|
167
|
+
"required": ["title", "author", "date"]
|
|
168
168
|
}
|
|
169
169
|
|
|
170
170
|
with open("document.pdf", "rb") as f:
|
|
171
171
|
result = client.extract.extract(
|
|
172
172
|
file=f,
|
|
173
173
|
filename="document.pdf",
|
|
174
|
-
|
|
174
|
+
extract_config=ExtractConfig(schema=schema)
|
|
175
175
|
)
|
|
176
176
|
|
|
177
|
-
print(result.
|
|
177
|
+
print(result.result)
|
|
178
178
|
```
|
|
179
179
|
|
|
180
180
|
### 3. 本地批处理
|
|
@@ -91,12 +91,12 @@ def example_1_config_file():
|
|
|
91
91
|
XParseClient.from_env()
|
|
92
92
|
|
|
93
93
|
# 演示如何从配置构建 source 和 destination(不实际创建)
|
|
94
|
-
print(
|
|
95
|
-
print(
|
|
94
|
+
print("\n✅ 配置加载成功!")
|
|
95
|
+
print(" - Source 配置:")
|
|
96
96
|
print(f" 目录: {loaded_config['source']['directory']}")
|
|
97
97
|
print(f" 模式: {loaded_config['source']['pattern']}")
|
|
98
98
|
print(f" 递归: {loaded_config['source']['recursive']}")
|
|
99
|
-
print(
|
|
99
|
+
print(" - Destination 配置:")
|
|
100
100
|
print(f" 输出目录: {loaded_config['destination']['output_dir']}")
|
|
101
101
|
|
|
102
102
|
# 构建 stages
|
|
@@ -89,8 +89,8 @@ def test_extract_stage_with_config():
|
|
|
89
89
|
assert stage.type == "extract"
|
|
90
90
|
assert stage.config.generate_citations is True
|
|
91
91
|
|
|
92
|
-
#
|
|
93
|
-
dumped = stage.model_dump()
|
|
92
|
+
# 序列化检查(使用 by_alias=True 来确保 schema_ 序列化为 schema)
|
|
93
|
+
dumped = stage.model_dump(by_alias=True)
|
|
94
94
|
assert "schema" in dumped["config"]
|
|
95
95
|
assert dumped["config"]["schema"] == schema
|
|
96
96
|
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
8
|
from abc import ABC
|
|
9
|
-
from typing import TYPE_CHECKING, Any, TypeVar
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
|
|
10
10
|
|
|
11
11
|
import httpx
|
|
12
12
|
|
|
@@ -14,7 +14,16 @@ if TYPE_CHECKING:
|
|
|
14
14
|
from ._config import SDKConfiguration
|
|
15
15
|
from ._http import HTTPClient
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
|
|
18
|
+
class PydanticModel(Protocol):
|
|
19
|
+
"""Pydantic 模型协议"""
|
|
20
|
+
|
|
21
|
+
@classmethod
|
|
22
|
+
def model_validate(cls, obj: Any) -> Any:
|
|
23
|
+
...
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
T = TypeVar("T", bound=PydanticModel)
|
|
18
27
|
|
|
19
28
|
|
|
20
29
|
class BaseAPI(ABC): # noqa: B024
|
|
@@ -130,7 +139,7 @@ class BaseAPI(ABC): # noqa: B024
|
|
|
130
139
|
if isinstance(data, dict) and "data" in data:
|
|
131
140
|
data = data["data"]
|
|
132
141
|
|
|
133
|
-
return model_class.model_validate(data)
|
|
142
|
+
return cast(T, model_class.model_validate(data))
|
|
134
143
|
|
|
135
144
|
def _parse_list_response(
|
|
136
145
|
self,
|
|
@@ -156,7 +165,7 @@ class BaseAPI(ABC): # noqa: B024
|
|
|
156
165
|
if not isinstance(data, list):
|
|
157
166
|
data = [data]
|
|
158
167
|
|
|
159
|
-
return [model_class.model_validate(item) for item in data]
|
|
168
|
+
return [cast(T, model_class.model_validate(item)) for item in data]
|
|
160
169
|
|
|
161
170
|
def _parse_raw_response(self, response: httpx.Response) -> dict[str, Any]:
|
|
162
171
|
"""解析原始 JSON 响应
|
|
@@ -171,9 +180,9 @@ class BaseAPI(ABC): # noqa: B024
|
|
|
171
180
|
|
|
172
181
|
# 处理标准响应格式
|
|
173
182
|
if isinstance(data, dict) and "data" in data:
|
|
174
|
-
return data["data"]
|
|
183
|
+
return cast(dict[str, Any], data["data"])
|
|
175
184
|
|
|
176
|
-
return data
|
|
185
|
+
return cast(dict[str, Any], data)
|
|
177
186
|
|
|
178
187
|
|
|
179
188
|
__all__ = ["BaseAPI"]
|
|
@@ -53,7 +53,8 @@ def _extract_error_message(response: httpx.Response) -> str:
|
|
|
53
53
|
return str(data[key])
|
|
54
54
|
# 如果有嵌套的 error 对象
|
|
55
55
|
if "error" in data and isinstance(data["error"], dict):
|
|
56
|
-
|
|
56
|
+
error_dict = data["error"]
|
|
57
|
+
return str(error_dict.get("message", error_dict))
|
|
57
58
|
return str(data)
|
|
58
59
|
except Exception:
|
|
59
60
|
# 非 JSON 响应,返回文本内容
|
|
@@ -106,7 +107,7 @@ def raise_for_status(response: httpx.Response) -> None:
|
|
|
106
107
|
message = _extract_error_message(response)
|
|
107
108
|
response_body = response.text[:1000] if response.text else None
|
|
108
109
|
|
|
109
|
-
common_kwargs = {
|
|
110
|
+
common_kwargs: dict[str, Any] = {
|
|
110
111
|
"status_code": status_code,
|
|
111
112
|
"request_id": request_id,
|
|
112
113
|
"response_body": response_body,
|
|
@@ -140,24 +141,24 @@ def raise_for_status(response: httpx.Response) -> None:
|
|
|
140
141
|
message = data.get("message", f"业务错误 code: {code}")
|
|
141
142
|
response_body = response.text[:1000] if response.text else None
|
|
142
143
|
|
|
143
|
-
|
|
144
|
+
error_kwargs: dict[str, Any] = {
|
|
144
145
|
"request_id": request_id,
|
|
145
146
|
"response_body": response_body,
|
|
146
147
|
}
|
|
147
148
|
|
|
148
149
|
# 根据业务 code 映射到相应异常
|
|
149
150
|
if code == 400:
|
|
150
|
-
raise ValidationError(message, details=
|
|
151
|
+
raise ValidationError(message, details=error_kwargs)
|
|
151
152
|
elif code == 401:
|
|
152
|
-
raise AuthenticationError(message, **
|
|
153
|
+
raise AuthenticationError(message, **error_kwargs)
|
|
153
154
|
elif code == 403:
|
|
154
|
-
raise PermissionDeniedError(message, **
|
|
155
|
+
raise PermissionDeniedError(message, **error_kwargs)
|
|
155
156
|
elif code == 404:
|
|
156
|
-
raise NotFoundError(message, **
|
|
157
|
+
raise NotFoundError(message, **error_kwargs)
|
|
157
158
|
elif code >= 500:
|
|
158
|
-
raise ServerError(message, **
|
|
159
|
+
raise ServerError(message, **error_kwargs)
|
|
159
160
|
else:
|
|
160
|
-
raise APIError(message, **
|
|
161
|
+
raise APIError(message, **error_kwargs)
|
|
161
162
|
except (ValueError, KeyError):
|
|
162
163
|
# 如果无法解析 JSON 或没有 code 字段,认为是成功
|
|
163
164
|
pass
|
|
@@ -79,7 +79,8 @@ class Extract(BaseAPI):
|
|
|
79
79
|
... "properties": {
|
|
80
80
|
... "invoice_number": {"type": "string"},
|
|
81
81
|
... "total_amount": {"type": "number"}
|
|
82
|
-
... }
|
|
82
|
+
... },
|
|
83
|
+
... "required": ["invoice_number", "total_amount"]
|
|
83
84
|
... },
|
|
84
85
|
... generate_citations=True
|
|
85
86
|
... )
|
|
@@ -89,15 +90,11 @@ class Extract(BaseAPI):
|
|
|
89
90
|
data = {}
|
|
90
91
|
|
|
91
92
|
if parse_config:
|
|
92
|
-
data["parse_config"] = json.dumps(
|
|
93
|
-
parse_config.model_dump(), ensure_ascii=False
|
|
94
|
-
)
|
|
93
|
+
data["parse_config"] = json.dumps(parse_config.model_dump(), ensure_ascii=False)
|
|
95
94
|
|
|
96
95
|
# 处理 extract_config
|
|
97
96
|
if extract_config:
|
|
98
|
-
data["extract_config"] = json.dumps(
|
|
99
|
-
extract_config.model_dump(), ensure_ascii=False
|
|
100
|
-
)
|
|
97
|
+
data["extract_config"] = json.dumps(extract_config.model_dump(), ensure_ascii=False)
|
|
101
98
|
else:
|
|
102
99
|
raise ValueError("extract_config is required")
|
|
103
100
|
|
|
@@ -105,5 +102,4 @@ class Extract(BaseAPI):
|
|
|
105
102
|
return self._parse_response(response, ParseResponse)
|
|
106
103
|
|
|
107
104
|
|
|
108
|
-
|
|
109
105
|
__all__ = ["Extract"]
|
|
@@ -24,11 +24,11 @@ Example:
|
|
|
24
24
|
|
|
25
25
|
from __future__ import annotations
|
|
26
26
|
|
|
27
|
-
from typing import TYPE_CHECKING, Callable, Literal
|
|
27
|
+
from typing import TYPE_CHECKING, Any, Callable, Literal
|
|
28
28
|
|
|
29
29
|
from .._base import BaseAPI
|
|
30
30
|
from ..models.local import FailedFile, WorkflowResult
|
|
31
|
-
from ..models.pipeline import PipelineStage
|
|
31
|
+
from ..models.pipeline import PipelineConfig, PipelineStage
|
|
32
32
|
|
|
33
33
|
if TYPE_CHECKING:
|
|
34
34
|
pass
|
|
@@ -53,8 +53,9 @@ class Local(BaseAPI):
|
|
|
53
53
|
self,
|
|
54
54
|
source, # Union[LocalSource, S3Source, FtpSource, SmbSource]
|
|
55
55
|
destination, # Union[LocalDestination, S3Destination, MilvusDestination, QdrantDestination]
|
|
56
|
-
stages: list[PipelineStage],
|
|
56
|
+
stages: list[PipelineStage | dict[str, Any]],
|
|
57
57
|
*,
|
|
58
|
+
pipeline_config: PipelineConfig | None = None,
|
|
58
59
|
progress_callback: Callable[[int, int, str], None] | None = None,
|
|
59
60
|
on_error: Literal["stop", "continue", "retry"] = "stop",
|
|
60
61
|
max_retries: int = 3,
|
|
@@ -75,6 +76,7 @@ class Local(BaseAPI):
|
|
|
75
76
|
source: 数据源(本地 Connector 对象)
|
|
76
77
|
destination: 输出目的地(本地 Connector 对象)
|
|
77
78
|
stages: 处理阶段列表
|
|
79
|
+
pipeline_config: Pipeline 配置(包含中间结果保存配置)
|
|
78
80
|
progress_callback: 进度回调函数 (current, total, message) -> None
|
|
79
81
|
on_error: 错误处理策略 ("stop"|"continue"|"retry")
|
|
80
82
|
max_retries: 最大重试次数(on_error="retry" 时生效)
|
|
@@ -125,20 +127,56 @@ class Local(BaseAPI):
|
|
|
125
127
|
|
|
126
128
|
# 调用 Pipeline API
|
|
127
129
|
from .pipeline import PipelineAPI
|
|
130
|
+
|
|
128
131
|
pipeline_api = PipelineAPI(self._config, self._http)
|
|
129
132
|
result = pipeline_api.execute(
|
|
130
133
|
file=file_bytes,
|
|
131
134
|
filename=filename,
|
|
132
135
|
stages=stages,
|
|
136
|
+
config=pipeline_config,
|
|
133
137
|
data_source=data_source,
|
|
134
138
|
)
|
|
135
139
|
|
|
140
|
+
# 处理中间结果
|
|
141
|
+
if (
|
|
142
|
+
pipeline_config
|
|
143
|
+
and pipeline_config.include_intermediate_results
|
|
144
|
+
and pipeline_config.intermediate_results_destination
|
|
145
|
+
and hasattr(result, "intermediate_results")
|
|
146
|
+
and result.intermediate_results
|
|
147
|
+
):
|
|
148
|
+
for stage_result in result.intermediate_results:
|
|
149
|
+
stage_name = stage_result.get("stage")
|
|
150
|
+
elements = stage_result.get("elements", [])
|
|
151
|
+
if stage_name and elements:
|
|
152
|
+
# 转换 elements 为字典列表
|
|
153
|
+
elements_data = []
|
|
154
|
+
for elem in elements:
|
|
155
|
+
if hasattr(elem, "model_dump"):
|
|
156
|
+
elements_data.append(elem.model_dump())
|
|
157
|
+
elif isinstance(elem, dict):
|
|
158
|
+
elements_data.append(elem)
|
|
159
|
+
else:
|
|
160
|
+
elements_data.append(elem)
|
|
161
|
+
|
|
162
|
+
metadata_with_stage = {
|
|
163
|
+
"filename": filename,
|
|
164
|
+
"file_path": file_path,
|
|
165
|
+
"stage": stage_name,
|
|
166
|
+
}
|
|
167
|
+
if result.stats and hasattr(result.stats, "record_id"):
|
|
168
|
+
metadata_with_stage["record_id"] = result.stats.record_id
|
|
169
|
+
|
|
170
|
+
pipeline_config.intermediate_results_destination.write(
|
|
171
|
+
elements_data, metadata_with_stage
|
|
172
|
+
)
|
|
173
|
+
|
|
136
174
|
# 准备写入数据
|
|
137
175
|
# 如果是向量数据库,写入 embeddings
|
|
138
|
-
if hasattr(result,
|
|
176
|
+
if hasattr(result, "elements") and result.elements:
|
|
139
177
|
elements_data = []
|
|
140
178
|
for elem in result.elements:
|
|
141
|
-
elem_dict = elem.model_dump() if hasattr(elem,
|
|
179
|
+
elem_dict = elem.model_dump() if hasattr(elem, "model_dump") else elem
|
|
142
180
|
elements_data.append(elem_dict)
|
|
143
181
|
else:
|
|
144
182
|
elements_data = []
|
|
@@ -148,8 +186,8 @@ class Local(BaseAPI):
|
|
|
148
186
|
"filename": filename,
|
|
149
187
|
"file_path": file_path,
|
|
150
188
|
}
|
|
151
|
-
if hasattr(result,
|
|
152
|
-
metadata["record_id"] = result.record_id
|
|
189
|
+
if result.stats and hasattr(result.stats, "record_id"):
|
|
190
|
+
metadata["record_id"] = result.stats.record_id
|
|
153
191
|
|
|
154
192
|
destination.write(elements_data, metadata)
|
|
155
193
|
|
|
@@ -164,11 +202,13 @@ class Local(BaseAPI):
|
|
|
164
202
|
else:
|
|
165
203
|
# continue: 记录失败并继续下一个文件
|
|
166
204
|
failed += 1
|
|
167
|
-
failed_files.append(
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
205
|
+
failed_files.append(
|
|
206
|
+
FailedFile(
|
|
207
|
+
file_path=file_path,
|
|
208
|
+
error=error_msg,
|
|
209
|
+
retry_count=0, # HTTP客户端内部已经处理了重试
|
|
210
|
+
)
|
|
211
|
+
)
|
|
172
212
|
|
|
173
213
|
# 计算总耗时
|
|
174
214
|
duration = time.time() - start_time
|
|
@@ -120,7 +120,9 @@ class PipelineAPI(BaseAPI):
|
|
|
120
120
|
}
|
|
121
121
|
|
|
122
122
|
if config:
|
|
123
|
-
|
|
123
|
+
# 只序列化 API 需要的字段(排除 intermediate_results_destination)
|
|
124
|
+
config_dict = config.model_dump(exclude={"intermediate_results_destination"})
|
|
125
|
+
data["config"] = json.dumps(config_dict, ensure_ascii=False)
|
|
124
126
|
if data_source:
|
|
125
127
|
data["data_source"] = json.dumps(data_source, ensure_ascii=False)
|
|
126
128
|
|
{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/qdrant.py
RENAMED
|
@@ -90,7 +90,7 @@ class QdrantDestination(Destination, VectorDestinationMixin):
|
|
|
90
90
|
self.dimension = dimension
|
|
91
91
|
self._PointStruct = PointStruct
|
|
92
92
|
|
|
93
|
-
client_kwargs = {"url": url}
|
|
93
|
+
client_kwargs: dict[str, Any] = {"url": url}
|
|
94
94
|
if api_key:
|
|
95
95
|
client_kwargs["api_key"] = api_key
|
|
96
96
|
if prefer_grpc:
|
|
@@ -90,7 +90,7 @@ class FtpSource(Source):
|
|
|
90
90
|
SourceError: 列出文件失败
|
|
91
91
|
"""
|
|
92
92
|
try:
|
|
93
|
-
files = []
|
|
93
|
+
files: list[str] = []
|
|
94
94
|
current_dir = self.client.pwd()
|
|
95
95
|
|
|
96
96
|
if self.recursive:
|
|
@@ -178,7 +178,7 @@ class FtpSource(Source):
|
|
|
178
178
|
|
|
179
179
|
# 回退到 LIST 命令
|
|
180
180
|
try:
|
|
181
|
-
lines = []
|
|
181
|
+
lines: list[str] = []
|
|
182
182
|
self.client.retrlines("LIST", lines.append)
|
|
183
183
|
for line in lines:
|
|
184
184
|
parts = line.split()
|
|
@@ -11,7 +11,7 @@ class ExtractConfig(BaseModel):
|
|
|
11
11
|
"""抽取配置
|
|
12
12
|
|
|
13
13
|
Attributes:
|
|
14
|
-
|
|
14
|
+
schema_: JSON Schema 定义抽取的结构(序列化为 schema)
|
|
15
15
|
generate_citations: 是否生成引用
|
|
16
16
|
stamp: 是否添加时间戳
|
|
17
17
|
|
|
@@ -23,11 +23,12 @@ class ExtractConfig(BaseModel):
|
|
|
23
23
|
... "total_amount": {"type": "number"}
|
|
24
24
|
... }
|
|
25
25
|
... }
|
|
26
|
-
>>> config = ExtractConfig(
|
|
26
|
+
>>> config = ExtractConfig(schema_=schema)
|
|
27
27
|
"""
|
|
28
28
|
|
|
29
|
-
|
|
29
|
+
schema_: dict[str, Any] = Field(
|
|
30
30
|
default_factory=dict,
|
|
31
|
+
alias="schema",
|
|
31
32
|
description="JSON Schema 定义抽取的结构"
|
|
32
33
|
)
|
|
33
34
|
generate_citations: bool = False
|
|
@@ -35,6 +36,7 @@ class ExtractConfig(BaseModel):
|
|
|
35
36
|
|
|
36
37
|
model_config = {
|
|
37
38
|
"extra": "allow",
|
|
39
|
+
"populate_by_name": True, # 允许使用 schema_ 和 schema 两种名称
|
|
38
40
|
}
|
|
39
41
|
|
|
40
42
|
|
|
@@ -63,17 +63,13 @@ class ParseResponse(BaseModel):
|
|
|
63
63
|
|
|
64
64
|
Attributes:
|
|
65
65
|
elements: 解析后的元素列表(Parse/Chunk/Embed 返回)
|
|
66
|
-
|
|
66
|
+
result: 提取结果(Extract 返回)
|
|
67
67
|
success_count: 成功数量
|
|
68
|
-
consume_time: 耗时
|
|
69
|
-
record_id: 记录 ID
|
|
70
68
|
"""
|
|
71
69
|
|
|
72
70
|
elements: list[Element] = Field(default_factory=list)
|
|
73
|
-
|
|
71
|
+
result: dict[str, Any] | None = None # Extract API 返回
|
|
74
72
|
success_count: int | None = None
|
|
75
|
-
consume_time: str | None = None
|
|
76
|
-
record_id: str | None = None
|
|
77
73
|
|
|
78
74
|
model_config = {"extra": "allow"}
|
|
79
75
|
|
|
@@ -99,11 +99,13 @@ class PipelineConfig(BaseModel):
|
|
|
99
99
|
|
|
100
100
|
Attributes:
|
|
101
101
|
include_intermediate_results: 是否包含中间结果
|
|
102
|
+
intermediate_results_destination: 中间结果保存目的地(仅支持 LocalDestination)
|
|
102
103
|
"""
|
|
103
104
|
|
|
104
105
|
include_intermediate_results: bool = False
|
|
106
|
+
intermediate_results_destination: Any = None # LocalDestination 对象
|
|
105
107
|
|
|
106
|
-
model_config = {"extra": "allow"}
|
|
108
|
+
model_config = {"extra": "allow", "arbitrary_types_allowed": True}
|
|
107
109
|
|
|
108
110
|
|
|
109
111
|
class PipelineResponse(BaseModel):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xparse-client
|
|
3
|
-
Version: 0.3.0b1
|
|
3
|
+
Version: 0.3.0b6
|
|
4
4
|
Summary: 面向 Agent 和 RAG 的文档处理 Pipeline 客户端
|
|
5
5
|
Author-email: INTSIG-TEXTIN <support@textin.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -213,17 +213,17 @@ schema = {
|
|
|
213
213
|
"author": {"type": "string", "description": "作者"},
|
|
214
214
|
"date": {"type": "string", "description": "日期"}
|
|
215
215
|
},
|
|
216
|
-
"required": ["title"]
|
|
216
|
+
"required": ["title", "author", "date"]
|
|
217
217
|
}
|
|
218
218
|
|
|
219
219
|
with open("document.pdf", "rb") as f:
|
|
220
220
|
result = client.extract.extract(
|
|
221
221
|
file=f,
|
|
222
222
|
filename="document.pdf",
|
|
223
|
-
|
|
223
|
+
extract_config=ExtractConfig(schema=schema)
|
|
224
224
|
)
|
|
225
225
|
|
|
226
|
-
print(result.
|
|
226
|
+
print(result.result)
|
|
227
227
|
```
|
|
228
228
|
|
|
229
229
|
### 3. 本地批处理
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/connectors/test_local_connectors.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/__init__.py
RENAMED
|
File without changes
|
{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/base.py
RENAMED
|
File without changes
|
{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/local.py
RENAMED
|
File without changes
|
{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/milvus.py
RENAMED
|
File without changes
|
|
File without changes
|
{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/sources/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|