xparse-client 0.2.11__py3-none-any.whl → 0.3.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +111 -20
- xparse_client/_base.py +179 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +350 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +215 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +134 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +136 -0
- xparse_client/models/pipeline.py +134 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b3.dist-info/RECORD +68 -0
- {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +1 -0
- example/run_pipeline.py +0 -506
- example/run_pipeline_test.py +0 -458
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -129
- xparse_client/pipeline/destinations.py +0 -487
- xparse_client/pipeline/pipeline.py +0 -622
- xparse_client/pipeline/sources.py +0 -585
- xparse_client-0.2.11.dist-info/METADATA +0 -1050
- xparse_client-0.2.11.dist-info/RECORD +0 -13
|
@@ -0,0 +1,1075 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xparse-client
|
|
3
|
+
Version: 0.3.0b3
|
|
4
|
+
Summary: 面向 Agent 和 RAG 的文档处理 Pipeline 客户端
|
|
5
|
+
Author-email: INTSIG-TEXTIN <support@textin.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/textin/xparse-client
|
|
8
|
+
Project-URL: Repository, https://github.com/textin/xparse-client
|
|
9
|
+
Project-URL: Documentation, https://github.com/textin/xparse-client#readme
|
|
10
|
+
Keywords: xparse,pipeline,rag,document,parsing,textin
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Topic :: Text Processing :: General
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: httpx<1.0.0,>=0.24.0
|
|
25
|
+
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
26
|
+
Requires-Dist: eval-type-backport>=0.2.0; python_version < "3.10"
|
|
27
|
+
Provides-Extra: s3
|
|
28
|
+
Requires-Dist: boto3>=1.26.0; extra == "s3"
|
|
29
|
+
Provides-Extra: milvus
|
|
30
|
+
Requires-Dist: pymilvus>=2.3.0; extra == "milvus"
|
|
31
|
+
Requires-Dist: milvus-lite; sys_platform != "win32" and extra == "milvus"
|
|
32
|
+
Provides-Extra: qdrant
|
|
33
|
+
Requires-Dist: qdrant-client>=1.7.0; extra == "qdrant"
|
|
34
|
+
Provides-Extra: smb
|
|
35
|
+
Requires-Dist: pysmb>=1.2.0; extra == "smb"
|
|
36
|
+
Provides-Extra: legacy
|
|
37
|
+
Requires-Dist: boto3>=1.26.0; extra == "legacy"
|
|
38
|
+
Requires-Dist: pymilvus>=2.3.0; extra == "legacy"
|
|
39
|
+
Requires-Dist: milvus-lite; sys_platform != "win32" and extra == "legacy"
|
|
40
|
+
Requires-Dist: qdrant-client>=1.7.0; extra == "legacy"
|
|
41
|
+
Requires-Dist: pysmb>=1.2.0; extra == "legacy"
|
|
42
|
+
Provides-Extra: all
|
|
43
|
+
Requires-Dist: boto3>=1.26.0; extra == "all"
|
|
44
|
+
Requires-Dist: pymilvus>=2.3.0; extra == "all"
|
|
45
|
+
Requires-Dist: milvus-lite; sys_platform != "win32" and extra == "all"
|
|
46
|
+
Requires-Dist: qdrant-client>=1.7.0; extra == "all"
|
|
47
|
+
Requires-Dist: pysmb>=1.2.0; extra == "all"
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
|
|
50
|
+
# xParse Client
|
|
51
|
+
|
|
52
|
+
<h3 align="center">
|
|
53
|
+
面向 Agent 和 RAG 的新一代文档处理 Python SDK
|
|
54
|
+
</h3>
|
|
55
|
+
|
|
56
|
+
<div align="center">
|
|
57
|
+
|
|
58
|
+
[PyPI version](https://badge.fury.io/py/xparse-client)
|
|
59
|
+
[Python versions](https://pypi.org/project/xparse-client/)
|
|
60
|
+
[License: MIT](LICENSE)
|
|
61
|
+
|
|
62
|
+
</div>
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## ⚠️ 重要提示
|
|
67
|
+
|
|
68
|
+
**Pipeline 类已在 v0.3.0 版本中完全移除。**
|
|
69
|
+
|
|
70
|
+
如果你的代码中使用了 `Pipeline` 类,请立即查看 [完整迁移指南](docs/MIGRATION_FROM_PIPELINE.md)。
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## 目录
|
|
75
|
+
|
|
76
|
+
- [SDK 安装](#sdk-安装)
|
|
77
|
+
- [快速开始](#快速开始)
|
|
78
|
+
- [核心特性](#核心特性)
|
|
79
|
+
- [错误处理](#错误处理)
|
|
80
|
+
- [资源管理](#资源管理)
|
|
81
|
+
- [配置说明](#配置说明)
|
|
82
|
+
- [认证配置](#认证配置)
|
|
83
|
+
- [数据源配置](#数据源配置)
|
|
84
|
+
- [目的地配置](#目的地配置)
|
|
85
|
+
- [高级配置](#高级配置)
|
|
86
|
+
- [使用示例](#使用示例)
|
|
87
|
+
- [调试与日志](#调试与日志)
|
|
88
|
+
- [本地开发](#本地开发)
|
|
89
|
+
- [相关资源](#相关资源)
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## SDK 安装
|
|
94
|
+
|
|
95
|
+
> [!NOTE]
|
|
96
|
+
> **Python 版本要求**
|
|
97
|
+
>
|
|
98
|
+
> 本 SDK 支持 Python 3.9 及以上版本。一旦某个 Python 版本达到其[官方生命周期结束日期](https://devguide.python.org/versions/),将提供 3 个月的宽限期供用户升级。
|
|
99
|
+
|
|
100
|
+
SDK 支持多种包管理器安装。
|
|
101
|
+
|
|
102
|
+
### uv(推荐)
|
|
103
|
+
|
|
104
|
+
[uv](https://docs.astral.sh/uv/) 是一个快速的 Python 包管理器,推荐用于现代 Python 项目。
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
uv add xparse-client
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### pip
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
pip install xparse-client
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### poetry
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
poetry add xparse-client
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### 可选依赖
|
|
123
|
+
|
|
124
|
+
根据使用的 Source / Destination 类型安装额外依赖(可选 extras:`s3`、`smb`、`milvus`、`qdrant`、`all`):
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# Milvus 向量数据库支持
|
|
128
|
+
pip install xparse-client[milvus]
|
|
129
|
+
|
|
130
|
+
# Qdrant 向量数据库支持
|
|
131
|
+
pip install xparse-client[qdrant]
|
|
132
|
+
|
|
133
|
+
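# S3 兼容存储支持(S3Source / S3Destination)
pip install xparse-client[s3]

# SMB 数据源支持(SmbSource)
pip install xparse-client[smb]

# 完整安装(包含所有可选依赖)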
|
|
134
|
+
pip install xparse-client[all]
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Shell 脚本使用
|
|
138
|
+
|
|
139
|
+
使用 `uv` 可以快速编写独立的 Python 脚本,无需创建完整项目:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
#!/usr/bin/env -S uv run --script
|
|
143
|
+
# /// script
|
|
144
|
+
# requires-python = ">=3.9"
|
|
145
|
+
# dependencies = [
|
|
146
|
+
# "xparse-client",
|
|
147
|
+
# ]
|
|
148
|
+
# ///
|
|
149
|
+
|
|
150
|
+
from xparse_client import XParseClient
|
|
151
|
+
|
|
152
|
+
# 初始化客户端
|
|
153
|
+
client = XParseClient.from_env()
|
|
154
|
+
|
|
155
|
+
# 处理文档
|
|
156
|
+
with open("document.pdf", "rb") as f:
|
|
157
|
+
result = client.parse.partition(file=f, filename="document.pdf")
|
|
158
|
+
print(f"解析出 {len(result.elements)} 个元素")
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
保存为 `process.py` 后,直接运行:
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
uv run process.py
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## 快速开始
|
|
170
|
+
|
|
171
|
+
### 1. 环境配置
|
|
172
|
+
|
|
173
|
+
首先设置 TextIn 认证信息:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
export TEXTIN_APP_ID="your-app-id"
|
|
177
|
+
export TEXTIN_SECRET_CODE="your-secret-code"
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
可以在 [TextIn 开发者控制台](https://www.textin.com/console/dashboard/setting) 获取认证凭证。
|
|
181
|
+
|
|
182
|
+
### 2. 单文件处理
|
|
183
|
+
|
|
184
|
+
**场景 1:解析文档**
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from xparse_client import XParseClient
|
|
188
|
+
from xparse_client.models import ParseConfig
|
|
189
|
+
|
|
190
|
+
client = XParseClient.from_env()
|
|
191
|
+
|
|
192
|
+
# 解析文档
|
|
193
|
+
with open("document.pdf", "rb") as f:
|
|
194
|
+
result = client.parse.partition(
|
|
195
|
+
file=f,
|
|
196
|
+
filename="document.pdf",
|
|
197
|
+
config=ParseConfig(provider="textin")
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
print(f"解析出 {len(result.elements)} 个元素")
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
**场景 2:提取结构化数据**
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from xparse_client.models import ExtractConfig
|
|
207
|
+
|
|
208
|
+
# 定义提取 schema
|
|
209
|
+
schema = {
|
|
210
|
+
"type": "object",
|
|
211
|
+
"properties": {
|
|
212
|
+
"title": {"type": "string", "description": "文档标题"},
|
|
213
|
+
"author": {"type": "string", "description": "作者"},
|
|
214
|
+
"date": {"type": "string", "description": "日期"}
|
|
215
|
+
},
|
|
216
|
+
"required": ["title"]
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
with open("document.pdf", "rb") as f:
|
|
220
|
+
result = client.extract.extract(
|
|
221
|
+
file=f,
|
|
222
|
+
filename="document.pdf",
|
|
223
|
+
config=ExtractConfig(schema=schema)
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
print(result.extracted_data)
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### 3. 本地批处理
|
|
230
|
+
|
|
231
|
+
批量处理本地文件并写入向量数据库:
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
from xparse_client import XParseClient
|
|
235
|
+
from xparse_client.connectors import LocalSource, MilvusDestination
|
|
236
|
+
from xparse_client.models import ParseStage, ChunkStage, EmbedStage
|
|
237
|
+
from xparse_client.models import ParseConfig, ChunkConfig, EmbedConfig
|
|
238
|
+
|
|
239
|
+
client = XParseClient.from_env()
|
|
240
|
+
|
|
241
|
+
result = client.local.run_workflow(
|
|
242
|
+
source=LocalSource(directory="./docs", pattern=["*.pdf"]),
|
|
243
|
+
destination=MilvusDestination(
|
|
244
|
+
db_path="./vectors.db",
|
|
245
|
+
collection_name="documents",
|
|
246
|
+
dimension=1024
|
|
247
|
+
),
|
|
248
|
+
stages=[
|
|
249
|
+
ParseStage(config=ParseConfig(provider="textin")),
|
|
250
|
+
ChunkStage(config=ChunkConfig(strategy="by_title")),
|
|
251
|
+
EmbedStage(config=EmbedConfig(provider="qwen"))
|
|
252
|
+
]
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
print(f"处理完成: {result.success}/{result.total}")
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### 4. 异步任务处理
|
|
259
|
+
|
|
260
|
+
处理大文件时使用服务端异步任务:
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
# 创建异步任务
|
|
264
|
+
with open("large_document.pdf", "rb") as f:
|
|
265
|
+
job = client.parse.create_async_job(
|
|
266
|
+
file=f,
|
|
267
|
+
filename="large_document.pdf",
|
|
268
|
+
config=ParseConfig(provider="textin")
|
|
269
|
+
)
|
|
270
|
+
|
|
271
|
+
print(f"任务已创建: {job.job_id}")
|
|
272
|
+
|
|
273
|
+
# 等待任务完成
|
|
274
|
+
result = client.parse.wait_for_result(
|
|
275
|
+
job_id=job.job_id,
|
|
276
|
+
timeout=300.0,
|
|
277
|
+
poll_interval=5.0
|
|
278
|
+
)
|
|
279
|
+
|
|
280
|
+
print(f"任务完成,解析出 {len(result.result.elements)} 个元素")
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### 同步 vs 异步
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
# 同步调用(适合小文件)
|
|
287
|
+
result = client.parse.partition(file=f, filename="doc.pdf")
|
|
288
|
+
|
|
289
|
+
# 异步调用(相同接口,加上 _async 后缀)
|
|
290
|
+
import asyncio
|
|
291
|
+
|
|
292
|
+
async def main():
|
|
293
|
+
result = await client.parse.partition_async(file=f, filename="doc.pdf")
|
|
294
|
+
|
|
295
|
+
asyncio.run(main())
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
---
|
|
299
|
+
|
|
300
|
+
## 核心特性
|
|
301
|
+
|
|
302
|
+
### 📥 灵活的数据源
|
|
303
|
+
|
|
304
|
+
- **S3 兼容存储**:支持 MinIO、AWS S3、阿里云 OSS、腾讯云 COS、火山引擎 TOS、华为云 OBS
|
|
305
|
+
- **本地文件系统**:支持本地目录批量处理
|
|
306
|
+
- **FTP/SMB**:支持 FTP 和 SMB 协议文件系统
|
|
307
|
+
|
|
308
|
+
详细配置请查看:[云厂商配置指南](docs/CLOUD_PROVIDERS.md)
|
|
309
|
+
|
|
310
|
+
### 📤 灵活的输出目的地
|
|
311
|
+
|
|
312
|
+
- **向量数据库**:Milvus、Zilliz Cloud、Qdrant
|
|
313
|
+
- **文件存储**:本地文件系统、S3 兼容存储
|
|
314
|
+
|
|
315
|
+
### 🔄 统一的 Pipeline API
|
|
316
|
+
|
|
317
|
+
使用 `/api/xparse/pipeline` 一次性完成 parse → chunk → embed 全流程:
|
|
318
|
+
|
|
319
|
+
```python
|
|
320
|
+
result = client.pipeline.execute(
|
|
321
|
+
file=f,
|
|
322
|
+
filename="document.pdf",
|
|
323
|
+
stages=[ParseStage(...), ChunkStage(...), EmbedStage(...)]
|
|
324
|
+
)
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### 📊 详细的处理统计
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
print(f"原始元素: {result.stats.original_elements}")
|
|
331
|
+
print(f"分块后: {result.stats.chunked_elements}")
|
|
332
|
+
print(f"向量化: {result.stats.embedded_elements}")
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
### 🔧 易于扩展
|
|
336
|
+
|
|
337
|
+
基于抽象类设计,可轻松添加自定义 Source 和 Destination:
|
|
338
|
+
|
|
339
|
+
```python
|
|
340
|
+
from typing import Any, Dict, List, Tuple

from xparse_client.connectors.sources import Source
|
|
341
|
+
|
|
342
|
+
class MyCustomSource(Source):
|
|
343
|
+
def list_files(self) -> List[str]:
|
|
344
|
+
# 实现文件列表逻辑
|
|
345
|
+
pass
|
|
346
|
+
|
|
347
|
+
def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
|
|
348
|
+
# 实现文件读取逻辑
|
|
349
|
+
pass
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
---
|
|
353
|
+
|
|
354
|
+
## 错误处理
|
|
355
|
+
|
|
356
|
+
### 错误类层次结构
|
|
357
|
+
|
|
358
|
+
SDK 提供了完善的错误类型系统:
|
|
359
|
+
|
|
360
|
+
| 错误类 | 说明 | 使用场景 |
|
|
361
|
+
|--------|------|----------|
|
|
362
|
+
| `XParseError` | 基础错误类 | 捕获所有 SDK 错误 |
|
|
363
|
+
| `AuthenticationError` | 认证失败 | app_id 或 secret_code 错误 |
|
|
364
|
+
| `RateLimitError` | 超过速率限制 | 需要等待或提升配额 |
|
|
365
|
+
| `APIError` | API 请求错误 | 服务端错误 |
|
|
366
|
+
| `ValidationError` | 参数验证错误 | 检查输入参数 |
|
|
367
|
+
|
|
368
|
+
### 错误处理示例
|
|
369
|
+
|
|
370
|
+
```python
|
|
371
|
+
from xparse_client import XParseClient
|
|
372
|
+
from xparse_client.exceptions import (
|
|
373
|
+
XParseError,
|
|
374
|
+
RateLimitError,
|
|
375
|
+
AuthenticationError,
|
|
376
|
+
APIError
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
client = XParseClient.from_env()
|
|
380
|
+
|
|
381
|
+
try:
|
|
382
|
+
with open("document.pdf", "rb") as f:
|
|
383
|
+
result = client.parse.partition(file=f, filename="document.pdf")
|
|
384
|
+
|
|
385
|
+
except AuthenticationError as e:
|
|
386
|
+
print(f"认证失败: {e.message}")
|
|
387
|
+
print("请检查 TEXTIN_APP_ID 和 TEXTIN_SECRET_CODE 环境变量")
|
|
388
|
+
|
|
389
|
+
except RateLimitError as e:
|
|
390
|
+
print(f"超过速率限制: {e.message}")
|
|
391
|
+
print(f"建议等待 {e.retry_after} 秒后重试")
|
|
392
|
+
# 自动重试
|
|
393
|
+
import time
|
|
394
|
+
time.sleep(e.retry_after)
|
|
395
|
+
with open("document.pdf", "rb") as f:  # 之前的文件句柄已随 with 块退出而关闭,重试前需重新打开
    result = client.parse.partition(file=f, filename="document.pdf")
|
|
396
|
+
|
|
397
|
+
except APIError as e:
|
|
398
|
+
print(f"API 错误: {e.message}")
|
|
399
|
+
print(f"请求 ID: {e.request_id}") # 用于技术支持排查
|
|
400
|
+
print(f"状态码: {e.status_code}")
|
|
401
|
+
|
|
402
|
+
except XParseError as e:
|
|
403
|
+
print(f"SDK 错误: {e.message}")
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
### 获取请求 ID
|
|
407
|
+
|
|
408
|
+
当遇到问题需要技术支持时,提供 `request_id` 可以帮助快速定位问题:
|
|
409
|
+
|
|
410
|
+
```python
|
|
411
|
+
try:
|
|
412
|
+
result = client.parse.partition(...)
|
|
413
|
+
except APIError as e:
|
|
414
|
+
# 记录 request_id
|
|
415
|
+
logger.error(f"请求失败,request_id: {e.request_id}")
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
---
|
|
419
|
+
|
|
420
|
+
## 资源管理
|
|
421
|
+
|
|
422
|
+
`XParseClient` 实现了上下文管理器协议,会自动管理底层 HTTP 连接和资源释放。
|
|
423
|
+
|
|
424
|
+
### 使用上下文管理器(推荐)
|
|
425
|
+
|
|
426
|
+
```python
|
|
427
|
+
from xparse_client import XParseClient
|
|
428
|
+
|
|
429
|
+
def main():
|
|
430
|
+
with XParseClient.from_env() as client:
|
|
431
|
+
# 应用逻辑
|
|
432
|
+
result = client.parse.partition(...)
|
|
433
|
+
# 退出时自动关闭连接
|
|
434
|
+
```
|
|
435
|
+
|
|
436
|
+
### 异步场景
|
|
437
|
+
|
|
438
|
+
```python
|
|
439
|
+
async def amain():
|
|
440
|
+
async with XParseClient.from_env() as client:
|
|
441
|
+
result = await client.parse.partition_async(...)
|
|
442
|
+
```
|
|
443
|
+
|
|
444
|
+
### 最佳实践
|
|
445
|
+
|
|
446
|
+
**✅ 推荐做法:**
|
|
447
|
+
|
|
448
|
+
```python
|
|
449
|
+
# 长时间运行的程序,复用 client 实例
|
|
450
|
+
with XParseClient.from_env() as client:
|
|
451
|
+
for file_path in file_list:
|
|
452
|
+
result = client.parse.partition(...)
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
**⚠️ 不推荐:**
|
|
456
|
+
|
|
457
|
+
```python
|
|
458
|
+
# 避免频繁创建和销毁 client
|
|
459
|
+
for file_path in file_list:
|
|
460
|
+
client = XParseClient.from_env() # ❌ 每次都创建新实例
|
|
461
|
+
result = client.parse.partition(...)
|
|
462
|
+
```
|
|
463
|
+
|
|
464
|
+
### 何时可以不使用上下文管理器
|
|
465
|
+
|
|
466
|
+
- 短生命周期脚本(处理单个文件后即退出)
|
|
467
|
+
- Jupyter Notebook 交互式环境
|
|
468
|
+
- 快速原型开发
|
|
469
|
+
|
|
470
|
+
在这些场景下,Python 的垃圾回收机制会自动清理资源:
|
|
471
|
+
|
|
472
|
+
```python
|
|
473
|
+
# 短脚本中可以不使用 with
|
|
474
|
+
client = XParseClient.from_env()
|
|
475
|
+
result = client.parse.partition(...)
|
|
476
|
+
# 脚本退出时自动清理
|
|
477
|
+
```
|
|
478
|
+
|
|
479
|
+
---
|
|
480
|
+
|
|
481
|
+
## 配置说明
|
|
482
|
+
|
|
483
|
+
### 认证配置
|
|
484
|
+
|
|
485
|
+
**方式 1:环境变量(推荐)**
|
|
486
|
+
|
|
487
|
+
```bash
|
|
488
|
+
export TEXTIN_APP_ID="your-app-id"
|
|
489
|
+
export TEXTIN_SECRET_CODE="your-secret-code"
|
|
490
|
+
```
|
|
491
|
+
|
|
492
|
+
```python
|
|
493
|
+
client = XParseClient.from_env()
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
**方式 2:直接传参**
|
|
497
|
+
|
|
498
|
+
```python
|
|
499
|
+
client = XParseClient(
|
|
500
|
+
app_id="your-app-id",
|
|
501
|
+
secret_code="your-secret-code"
|
|
502
|
+
)
|
|
503
|
+
```
|
|
504
|
+
|
|
505
|
+
**方式 3:配置文件(需额外安装 `python-dotenv`:`pip install python-dotenv`)**
|
|
506
|
+
|
|
507
|
+
```python
|
|
508
|
+
import os
|
|
509
|
+
from dotenv import load_dotenv
|
|
510
|
+
|
|
511
|
+
load_dotenv() # 从 .env 文件加载
|
|
512
|
+
client = XParseClient.from_env()
|
|
513
|
+
```
|
|
514
|
+
|
|
515
|
+
`.env` 文件内容:
|
|
516
|
+
|
|
517
|
+
```bash
|
|
518
|
+
TEXTIN_APP_ID=your-app-id
|
|
519
|
+
TEXTIN_SECRET_CODE=your-secret-code
|
|
520
|
+
```
|
|
521
|
+
|
|
522
|
+
### 数据源配置
|
|
523
|
+
|
|
524
|
+
#### 本地文件系统
|
|
525
|
+
|
|
526
|
+
```python
|
|
527
|
+
from xparse_client.connectors import LocalSource
|
|
528
|
+
|
|
529
|
+
source = LocalSource(
|
|
530
|
+
directory='./documents',
|
|
531
|
+
pattern=['*.pdf', '*.docx'], # 可选,文件类型过滤
|
|
532
|
+
recursive=True # 可选,递归子目录
|
|
533
|
+
)
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
#### S3 兼容存储
|
|
537
|
+
|
|
538
|
+
```python
|
|
539
|
+
from xparse_client.connectors import S3Source
|
|
540
|
+
|
|
541
|
+
source = S3Source(
|
|
542
|
+
endpoint='https://s3.amazonaws.com',
|
|
543
|
+
access_key='your-access-key',
|
|
544
|
+
secret_key='your-secret-key',
|
|
545
|
+
bucket='my-bucket',
|
|
546
|
+
prefix='documents/', # 可选,指定前缀
|
|
547
|
+
region='us-east-1',
|
|
548
|
+
pattern=['*.pdf'], # 可选,文件类型过滤
|
|
549
|
+
recursive=True # 可选,递归子目录
|
|
550
|
+
)
|
|
551
|
+
```
|
|
552
|
+
|
|
553
|
+
**支持的云厂商:**
|
|
554
|
+
- AWS S3
|
|
555
|
+
- MinIO
|
|
556
|
+
- 阿里云 OSS
|
|
557
|
+
- 腾讯云 COS
|
|
558
|
+
- 火山引擎 TOS
|
|
559
|
+
- 华为云 OBS
|
|
560
|
+
|
|
561
|
+
详细配置请查看:[云厂商配置指南](docs/CLOUD_PROVIDERS.md)
|
|
562
|
+
|
|
563
|
+
#### FTP 数据源
|
|
564
|
+
|
|
565
|
+
```python
|
|
566
|
+
from xparse_client.connectors import FtpSource
|
|
567
|
+
|
|
568
|
+
source = FtpSource(
|
|
569
|
+
host='ftp.example.com',
|
|
570
|
+
port=21,
|
|
571
|
+
username='user',
|
|
572
|
+
password='pass',
|
|
573
|
+
pattern=['*.pdf'],
|
|
574
|
+
recursive=True
|
|
575
|
+
)
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
#### SMB 数据源
|
|
579
|
+
|
|
580
|
+
```python
|
|
581
|
+
from xparse_client.connectors import SmbSource
|
|
582
|
+
|
|
583
|
+
source = SmbSource(
|
|
584
|
+
host='smb.example.com',
|
|
585
|
+
share_name='documents',
|
|
586
|
+
username='user',
|
|
587
|
+
password='pass',
|
|
588
|
+
domain='WORKGROUP',
|
|
589
|
+
pattern=['**/*.pdf'],
|
|
590
|
+
recursive=True
|
|
591
|
+
)
|
|
592
|
+
```
|
|
593
|
+
|
|
594
|
+
### 目的地配置
|
|
595
|
+
|
|
596
|
+
#### 本地文件系统
|
|
597
|
+
|
|
598
|
+
```python
|
|
599
|
+
from xparse_client.connectors import LocalDestination
|
|
600
|
+
|
|
601
|
+
destination = LocalDestination(
|
|
602
|
+
output_dir='./output'
|
|
603
|
+
)
|
|
604
|
+
```
|
|
605
|
+
|
|
606
|
+
输出文件格式:`{filename}_{timestamp}.json`
|
|
607
|
+
|
|
608
|
+
#### Milvus 本地数据库
|
|
609
|
+
|
|
610
|
+
```python
|
|
611
|
+
from xparse_client.connectors import MilvusDestination
|
|
612
|
+
|
|
613
|
+
destination = MilvusDestination(
|
|
614
|
+
db_path='./vectors.db',
|
|
615
|
+
collection_name='documents',
|
|
616
|
+
dimension=1024 # 必须与 embed 模型维度一致
|
|
617
|
+
)
|
|
618
|
+
```
|
|
619
|
+
|
|
620
|
+
**Collection 必需字段:**
|
|
621
|
+
- `element_id` - 元素唯一标识
|
|
622
|
+
- `text` - 文本内容
|
|
623
|
+
- `embeddings` - 向量
|
|
624
|
+
- `record_id` - 记录 ID
|
|
625
|
+
|
|
626
|
+
#### Zilliz Cloud
|
|
627
|
+
|
|
628
|
+
```python
|
|
629
|
+
destination = MilvusDestination(
|
|
630
|
+
db_path='https://your-instance.cloud.zilliz.com.cn',
|
|
631
|
+
collection_name='documents',
|
|
632
|
+
dimension=1024,
|
|
633
|
+
api_key='your-api-key'
|
|
634
|
+
)
|
|
635
|
+
```
|
|
636
|
+
|
|
637
|
+
#### Qdrant
|
|
638
|
+
|
|
639
|
+
```python
|
|
640
|
+
from xparse_client.connectors import QdrantDestination
|
|
641
|
+
|
|
642
|
+
# 本地 Qdrant
|
|
643
|
+
destination = QdrantDestination(
|
|
644
|
+
url='http://localhost:6333',
|
|
645
|
+
collection_name='documents',
|
|
646
|
+
dimension=1024
|
|
647
|
+
)
|
|
648
|
+
|
|
649
|
+
# Qdrant Cloud
|
|
650
|
+
destination = QdrantDestination(
|
|
651
|
+
url='https://your-cluster.cloud.qdrant.io',
|
|
652
|
+
collection_name='documents',
|
|
653
|
+
dimension=1024,
|
|
654
|
+
api_key='your-api-key'
|
|
655
|
+
)
|
|
656
|
+
```
|
|
657
|
+
|
|
658
|
+
#### S3 兼容存储
|
|
659
|
+
|
|
660
|
+
```python
|
|
661
|
+
from xparse_client.connectors import S3Destination
|
|
662
|
+
|
|
663
|
+
destination = S3Destination(
|
|
664
|
+
endpoint='https://s3.amazonaws.com',
|
|
665
|
+
access_key='your-access-key',
|
|
666
|
+
secret_key='your-secret-key',
|
|
667
|
+
bucket='my-bucket',
|
|
668
|
+
prefix='output/',
|
|
669
|
+
region='us-east-1'
|
|
670
|
+
)
|
|
671
|
+
```
|
|
672
|
+
|
|
673
|
+
### 高级配置
|
|
674
|
+
|
|
675
|
+
#### 超时和重试
|
|
676
|
+
|
|
677
|
+
```python
|
|
678
|
+
client = XParseClient(
|
|
679
|
+
app_id="...",
|
|
680
|
+
secret_code="...",
|
|
681
|
+
timeout=120.0, # 请求超时时间(秒),默认 630
|
|
682
|
+
max_retries=3, # 最大重试次数,默认 3
|
|
683
|
+
retry_delay=1.0 # 重试间隔(秒),默认 1.0
|
|
684
|
+
)
|
|
685
|
+
```
|
|
686
|
+
|
|
687
|
+
#### 自定义 API 地址
|
|
688
|
+
|
|
689
|
+
```python
|
|
690
|
+
client = XParseClient(
|
|
691
|
+
app_id="...",
|
|
692
|
+
secret_code="...",
|
|
693
|
+
base_url="https://custom-api.example.com/api/xparse"
|
|
694
|
+
)
|
|
695
|
+
```
|
|
696
|
+
|
|
697
|
+
#### 自定义 HTTP 客户端
|
|
698
|
+
|
|
699
|
+
```python
|
|
700
|
+
import httpx
|
|
701
|
+
|
|
702
|
+
http_client = httpx.Client(
|
|
703
|
+
headers={"x-custom-header": "value"},
|
|
704
|
+
proxy="http://proxy.example.com:8080"  # httpx>=0.26 使用 proxy 参数;proxies 已在 httpx 0.28 中移除
|
|
705
|
+
)
|
|
706
|
+
|
|
707
|
+
client = XParseClient(
|
|
708
|
+
app_id="...",
|
|
709
|
+
secret_code="...",
|
|
710
|
+
client=http_client
|
|
711
|
+
)
|
|
712
|
+
```
|
|
713
|
+
|
|
714
|
+
---
|
|
715
|
+
|
|
716
|
+
## 使用示例
|
|
717
|
+
|
|
718
|
+
完整示例代码请查看 [example/](example/) 目录。
|
|
719
|
+
|
|
720
|
+
### 示例 1:基础 API 使用
|
|
721
|
+
|
|
722
|
+
解析文档、提取结构化数据、Pipeline 完整流程。
|
|
723
|
+
|
|
724
|
+
```python
|
|
725
|
+
from xparse_client import XParseClient
|
|
726
|
+
from xparse_client.models import ParseConfig
|
|
727
|
+
|
|
728
|
+
client = XParseClient.from_env()
|
|
729
|
+
|
|
730
|
+
# 解析单个文档
|
|
731
|
+
with open("document.pdf", "rb") as f:
|
|
732
|
+
result = client.parse.partition(
|
|
733
|
+
file=f,
|
|
734
|
+
filename="document.pdf",
|
|
735
|
+
config=ParseConfig(provider="textin")
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
print(f"解析出 {len(result.elements)} 个元素")
|
|
739
|
+
```
|
|
740
|
+
|
|
741
|
+
完整示例:[example/1_basic_api_usage.py](example/1_basic_api_usage.py)
|
|
742
|
+
|
|
743
|
+
### 示例 2:服务端异步任务
|
|
744
|
+
|
|
745
|
+
处理大文件时使用服务端异步任务,支持轮询和自动等待。
|
|
746
|
+
|
|
747
|
+
```python
|
|
748
|
+
# 创建异步任务
|
|
749
|
+
job = client.parse.create_async_job(file=f, filename="large.pdf")
|
|
750
|
+
|
|
751
|
+
# 方式 1:自动等待完成
|
|
752
|
+
result = client.parse.wait_for_result(job_id=job.job_id, timeout=300)
|
|
753
|
+
|
|
754
|
+
# 方式 2:手动轮询
|
|
755
|
+
while True:
|
|
756
|
+
status = client.parse.get_result(job_id=job.job_id)
|
|
757
|
+
if status.status == "completed":
|
|
758
|
+
break
|
|
759
|
+
time.sleep(5)
|
|
760
|
+
```
|
|
761
|
+
|
|
762
|
+
完整示例:[example/2_async_job.py](example/2_async_job.py)
|
|
763
|
+
|
|
764
|
+
### 示例 3:本地批处理工作流
|
|
765
|
+
|
|
766
|
+
批量处理本地文件,支持进度回调和错误处理。
|
|
767
|
+
|
|
768
|
+
```python
|
|
769
|
+
result = client.local.run_workflow(
|
|
770
|
+
source=LocalSource(directory="./docs", pattern=["*.pdf"]),
|
|
771
|
+
destination=MilvusDestination(db_path="./vectors.db", ...),
|
|
772
|
+
stages=[ParseStage(...), ChunkStage(...), EmbedStage(...)],
|
|
773
|
+
progress_callback=lambda c, t, m: print(f"[{c}/{t}] {m}"),
|
|
774
|
+
on_error="continue", # 遇到错误继续处理
|
|
775
|
+
max_retries=3
|
|
776
|
+
)
|
|
777
|
+
|
|
778
|
+
print(f"成功: {result.success}, 失败: {result.failed}")
|
|
779
|
+
```
|
|
780
|
+
|
|
781
|
+
完整示例:[example/3_local_workflow.py](example/3_local_workflow.py)
|
|
782
|
+
|
|
783
|
+
### 示例 4:生产环境最佳实践
|
|
784
|
+
|
|
785
|
+
包含错误处理、日志记录、进度回调、自定义 Source 的完整工作流。
|
|
786
|
+
|
|
787
|
+
```python
|
|
788
|
+
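import logging
from pathlib import Path
from typing import List

from xparse_client.connectors.sources import Source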
|
|
789
|
+
|
|
790
|
+
logging.basicConfig(level=logging.INFO)
|
|
791
|
+
logger = logging.getLogger(__name__)
|
|
792
|
+
|
|
793
|
+
# 自定义 Source(按文件大小过滤)
|
|
794
|
+
class FilteredLocalSource(Source):
|
|
795
|
+
def __init__(self, directory, min_size=0, max_size=10*1024*1024):
|
|
796
|
+
self.directory = Path(directory)
|
|
797
|
+
self.min_size = min_size
|
|
798
|
+
self.max_size = max_size
|
|
799
|
+
|
|
800
|
+
def list_files(self) -> List[str]:
|
|
801
|
+
files = []
|
|
802
|
+
for f in self.directory.glob("**/*.pdf"):
|
|
803
|
+
size = f.stat().st_size
|
|
804
|
+
if self.min_size <= size <= self.max_size:
|
|
805
|
+
files.append(str(f))
|
|
806
|
+
return files
|
|
807
|
+
|
|
808
|
+
# ... read_file 实现
|
|
809
|
+
```
|
|
810
|
+
|
|
811
|
+
完整示例:[example/4_advanced_workflow.py](example/4_advanced_workflow.py)
|
|
812
|
+
|
|
813
|
+
---
|
|
814
|
+
|
|
815
|
+
## 调试与日志
|
|
816
|
+
|
|
817
|
+
### 启用调试日志
|
|
818
|
+
|
|
819
|
+
```python
|
|
820
|
+
from xparse_client import XParseClient
|
|
821
|
+
import logging
|
|
822
|
+
|
|
823
|
+
# 配置日志级别
|
|
824
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
825
|
+
|
|
826
|
+
# 传入自定义 logger
|
|
827
|
+
client = XParseClient(
|
|
828
|
+
app_id="...",
|
|
829
|
+
secret_code="...",
|
|
830
|
+
debug_logger=logging.getLogger("xparse_client")
|
|
831
|
+
)
|
|
832
|
+
```
|
|
833
|
+
|
|
834
|
+
### 日志级别说明
|
|
835
|
+
|
|
836
|
+
| 级别 | 用途 | 输出内容 |
|
|
837
|
+
|------|------|----------|
|
|
838
|
+
| `DEBUG` | 开发调试 | 详细的请求/响应日志 |
|
|
839
|
+
| `INFO` | 正常运行 | 关键操作日志(默认) |
|
|
840
|
+
| `WARNING` | 警告信息 | 潜在问题提示 |
|
|
841
|
+
| `ERROR` | 错误信息 | 错误详情和堆栈 |
|
|
842
|
+
|
|
843
|
+
### 日志示例
|
|
844
|
+
|
|
845
|
+
```python
|
|
846
|
+
import logging
|
|
847
|
+
|
|
848
|
+
# 配置文件日志
|
|
849
|
+
logging.basicConfig(
|
|
850
|
+
level=logging.INFO,
|
|
851
|
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
852
|
+
handlers=[
|
|
853
|
+
logging.FileHandler('xparse.log'),
|
|
854
|
+
logging.StreamHandler()
|
|
855
|
+
]
|
|
856
|
+
)
|
|
857
|
+
|
|
858
|
+
client = XParseClient.from_env()
|
|
859
|
+
|
|
860
|
+
# 日志会自动记录关键操作
|
|
861
|
+
result = client.parse.partition(...)
|
|
862
|
+
# 输出: 2024-01-15 10:30:00 - xparse_client - INFO - Parsing document.pdf
|
|
863
|
+
```
|
|
864
|
+
|
|
865
|
+
---
|
|
866
|
+
|
|
867
|
+
## 本地开发
|
|
868
|
+
|
|
869
|
+
### 环境准备
|
|
870
|
+
|
|
871
|
+
推荐使用 `uv` 管理开发环境:
|
|
872
|
+
|
|
873
|
+
```bash
|
|
874
|
+
# 克隆仓库
|
|
875
|
+
git clone https://github.com/intsig-textin/xparse-python-client.git
|
|
876
|
+
cd xparse-python-client
|
|
877
|
+
|
|
878
|
+
# 安装开发依赖
|
|
879
|
+
uv sync --dev
|
|
880
|
+
|
|
881
|
+
# 运行测试
|
|
882
|
+
make test
|
|
883
|
+
|
|
884
|
+
# 代码格式化
|
|
885
|
+
make format
|
|
886
|
+
|
|
887
|
+
# 运行示例
|
|
888
|
+
python example/1_basic_api_usage.py
|
|
889
|
+
```
|
|
890
|
+
|
|
891
|
+
### 项目结构
|
|
892
|
+
|
|
893
|
+
```
|
|
894
|
+
xparse-client/
|
|
895
|
+
├── xparse_client/ # 主包
|
|
896
|
+
│ ├── __init__.py
|
|
897
|
+
│ ├── _client.py # XParseClient 主类
|
|
898
|
+
│ ├── models/ # 数据模型
|
|
899
|
+
│ └── connectors/ # Source/Destination
|
|
900
|
+
├── tests/ # 测试
|
|
901
|
+
│ ├── unit/ # 单元测试
|
|
902
|
+
│ └── integration/ # 集成测试
|
|
903
|
+
├── example/ # 示例代码
|
|
904
|
+
├── docs/ # 文档
|
|
905
|
+
└── Makefile # 开发命令
|
|
906
|
+
```
|
|
907
|
+
|
|
908
|
+
### 常用命令
|
|
909
|
+
|
|
910
|
+
```bash
|
|
911
|
+
# 运行所有测试
|
|
912
|
+
make test
|
|
913
|
+
|
|
914
|
+
# 运行单元测试
|
|
915
|
+
make test-unit
|
|
916
|
+
|
|
917
|
+
# 代码覆盖率
|
|
918
|
+
make test-cov
|
|
919
|
+
|
|
920
|
+
# 代码格式化
|
|
921
|
+
make format
|
|
922
|
+
|
|
923
|
+
# 代码检查
|
|
924
|
+
make lint
|
|
925
|
+
|
|
926
|
+
# 清理缓存
|
|
927
|
+
make clean
|
|
928
|
+
```
|
|
929
|
+
|
|
930
|
+
### 贡献流程
|
|
931
|
+
|
|
932
|
+
1. Fork 本仓库
|
|
933
|
+
2. 创建特性分支:`git checkout -b feature/amazing-feature`
|
|
934
|
+
3. 编写代码和测试
|
|
935
|
+
4. 确保测试通过:`make test`
|
|
936
|
+
5. 提交更改:`git commit -m 'Add amazing feature'`
|
|
937
|
+
6. 推送到分支:`git push origin feature/amazing-feature`
|
|
938
|
+
7. 提交 Pull Request
|
|
939
|
+
|
|
940
|
+
### 测试要求
|
|
941
|
+
|
|
942
|
+
- 所有新功能必须包含单元测试
|
|
943
|
+
- 确保所有测试通过:`make test`
|
|
944
|
+
- 代码覆盖率不低于 80%:`make test-cov`
|
|
945
|
+
- 遵循代码风格规范:`make format && make lint`
|
|
946
|
+
|
|
947
|
+
---
|
|
948
|
+
|
|
949
|
+
## 版本成熟度
|
|
950
|
+
|
|
951
|
+
**当前版本:v0.3.0b3(Beta 预发布版)**
|
|
952
|
+
|
|
953
|
+
- ✅ 核心 API 已稳定
|
|
954
|
+
- ✅ 生产环境可用
|
|
955
|
+
- ⚠️ 破坏性变更请查看 [CHANGELOG.md](CHANGELOG.md)
|
|
956
|
+
|
|
957
|
+
我们遵循[语义化版本规范](https://semver.org/lang/zh-CN/)(SemVer),主版本号变更可能包含破坏性变更。建议锁定版本号:
|
|
958
|
+
|
|
959
|
+
```bash
|
|
960
|
+
# 锁定主版本
|
|
961
|
+
pip install "xparse-client>=0.3,<1.0"
|
|
962
|
+
|
|
963
|
+
# 锁定次版本
|
|
964
|
+
pip install "xparse-client==0.3.*"
|
|
965
|
+
```
|
|
966
|
+
|
|
967
|
+
---
|
|
968
|
+
|
|
969
|
+
## 相关资源
|
|
970
|
+
|
|
971
|
+
### 📖 文档
|
|
972
|
+
|
|
973
|
+
- [完整文档](https://docs.textin.com/pipeline/overview) - 官方文档
|
|
974
|
+
- [API 接口规范](docs/API_REFERENCE.md) - Pipeline API 详细说明
|
|
975
|
+
- [云厂商配置指南](docs/CLOUD_PROVIDERS.md) - S3/OSS/COS 等配置
|
|
976
|
+
- [迁移指南](docs/MIGRATION_FROM_PIPELINE.md) - 从 Pipeline 类迁移
|
|
977
|
+
|
|
978
|
+
### 🔗 链接
|
|
979
|
+
|
|
980
|
+
- [GitHub 仓库](https://github.com/intsig-textin/xparse-python-client)
|
|
981
|
+
- [PyPI 页面](https://pypi.org/project/xparse-client/)
|
|
982
|
+
- [问题反馈](https://github.com/intsig-textin/xparse-python-client/issues)
|
|
983
|
+
- [讨论区](https://github.com/intsig-textin/xparse-python-client/discussions)
|
|
984
|
+
- [更新日志](CHANGELOG.md)
|
|
985
|
+
- [TextIn 开发者控制台](https://www.textin.com/console/dashboard/setting)
|
|
986
|
+
|
|
987
|
+
### 💰 计费
|
|
988
|
+
|
|
989
|
+
Pipeline 接口按页计费,具体价格请查看:[通用文档解析](https://www.textin.com/market/detail/xparse)
|
|
990
|
+
|
|
991
|
+
---
|
|
992
|
+
|
|
993
|
+
## 许可证
|
|
994
|
+
|
|
995
|
+
[MIT License](LICENSE)
|
|
996
|
+
|
|
997
|
+
---
|
|
998
|
+
|
|
999
|
+
## 故障排查
|
|
1000
|
+
|
|
1001
|
+
### 常见问题
|
|
1002
|
+
|
|
1003
|
+
#### 1. 认证失败
|
|
1004
|
+
|
|
1005
|
+
**错误**:`AuthenticationError: Invalid app_id or secret_code`
|
|
1006
|
+
|
|
1007
|
+
**解决方案**:
|
|
1008
|
+
1. 检查环境变量:`echo $TEXTIN_APP_ID`
|
|
1009
|
+
2. 登录 [TextIn 控制台](https://www.textin.com/console/dashboard/setting) 确认凭证
|
|
1010
|
+
3. 确保没有多余的空格或引号
|
|
1011
|
+
|
|
1012
|
+
#### 2. 文件过大
|
|
1013
|
+
|
|
1014
|
+
**错误**:`ValidationError: File size exceeds maximum limit of 100MB`
|
|
1015
|
+
|
|
1016
|
+
**解决方案**:
|
|
1017
|
+
1. 压缩文件
|
|
1018
|
+
2. 分割成多个文件
|
|
1019
|
+
3. 联系技术支持提升限制
|
|
1020
|
+
|
|
1021
|
+
#### 3. 向量维度不匹配
|
|
1022
|
+
|
|
1023
|
+
**错误**:`MilvusException: dimension mismatch`
|
|
1024
|
+
|
|
1025
|
+
**解决方案**:
|
|
1026
|
+
确保 Destination 的 dimension 参数与 embed 模型一致(当前所有模型都是 1024 维):
|
|
1027
|
+
|
|
1028
|
+
```python
|
|
1029
|
+
destination = MilvusDestination(
|
|
1030
|
+
dimension=1024 # 必须与 embed 模型维度一致
|
|
1031
|
+
)
|
|
1032
|
+
```
|
|
1033
|
+
|
|
1034
|
+
#### 4. 连接超时
|
|
1035
|
+
|
|
1036
|
+
**错误**:`TimeoutException: Request timeout`
|
|
1037
|
+
|
|
1038
|
+
**解决方案**:
|
|
1039
|
+
1. 检查网络连接
|
|
1040
|
+
2. 增加超时时间:`client = XParseClient(..., timeout=300.0)`
|
|
1041
|
+
3. 使用异步任务处理大文件
|
|
1042
|
+
|
|
1043
|
+
#### 5. Rate Limit
|
|
1044
|
+
|
|
1045
|
+
**错误**:`RateLimitError: Rate limit exceeded`
|
|
1046
|
+
|
|
1047
|
+
**解决方案**:
|
|
1048
|
+
1. 等待 `retry_after` 秒后重试
|
|
1049
|
+
2. 联系客服提升配额
|
|
1050
|
+
3. 使用批量处理降低请求频率
|
|
1051
|
+
|
|
1052
|
+
### 获取帮助
|
|
1053
|
+
|
|
1054
|
+
如果遇到问题,可以通过以下方式获取帮助:
|
|
1055
|
+
|
|
1056
|
+
1. **查看文档**:[https://docs.textin.com/pipeline/overview](https://docs.textin.com/pipeline/overview)
|
|
1057
|
+
2. **提交 Issue**:[GitHub Issues](https://github.com/intsig-textin/xparse-python-client/issues)
|
|
1058
|
+
3. **联系技术支持**:提供 `request_id` 可以加快问题定位
|
|
1059
|
+
|
|
1060
|
+
```python
|
|
1061
|
+
try:
|
|
1062
|
+
result = client.parse.partition(...)
|
|
1063
|
+
except APIError as e:
|
|
1064
|
+
print(f"请联系技术支持,request_id: {e.request_id}")
|
|
1065
|
+
```
|
|
1066
|
+
|
|
1067
|
+
---
|
|
1068
|
+
|
|
1069
|
+
<div align="center">
|
|
1070
|
+
|
|
1071
|
+
**感谢使用 xParse Client!**
|
|
1072
|
+
|
|
1073
|
+
[⭐ Star on GitHub](https://github.com/intsig-textin/xparse-python-client) | [📖 Read the Docs](https://docs.textin.com/pipeline/overview) | [💬 Discussions](https://github.com/intsig-textin/xparse-python-client/discussions)
|
|
1074
|
+
|
|
1075
|
+
</div>
|