xparse-client 0.2.19__py3-none-any.whl → 0.3.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +215 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b3.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +2 -0
  69. xparse_client/pipeline/__init__.py +0 -3
  70. xparse_client/pipeline/config.py +0 -129
  71. xparse_client/pipeline/destinations.py +0 -489
  72. xparse_client/pipeline/pipeline.py +0 -690
  73. xparse_client/pipeline/sources.py +0 -583
  74. xparse_client-0.2.19.dist-info/METADATA +0 -1050
  75. xparse_client-0.2.19.dist-info/RECORD +0 -11
@@ -0,0 +1,1075 @@
1
+ Metadata-Version: 2.4
2
+ Name: xparse-client
3
+ Version: 0.3.0b3
4
+ Summary: 面向 Agent 和 RAG 的文档处理 Pipeline 客户端
5
+ Author-email: INTSIG-TEXTIN <support@textin.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/textin/xparse-client
8
+ Project-URL: Repository, https://github.com/textin/xparse-client
9
+ Project-URL: Documentation, https://github.com/textin/xparse-client#readme
10
+ Keywords: xparse,pipeline,rag,document,parsing,textin
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Text Processing :: General
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: httpx<1.0.0,>=0.24.0
25
+ Requires-Dist: pydantic<3.0.0,>=2.0.0
26
+ Requires-Dist: eval-type-backport>=0.2.0; python_version < "3.10"
27
+ Provides-Extra: s3
28
+ Requires-Dist: boto3>=1.26.0; extra == "s3"
29
+ Provides-Extra: milvus
30
+ Requires-Dist: pymilvus>=2.3.0; extra == "milvus"
31
+ Requires-Dist: milvus-lite; sys_platform != "win32" and extra == "milvus"
32
+ Provides-Extra: qdrant
33
+ Requires-Dist: qdrant-client>=1.7.0; extra == "qdrant"
34
+ Provides-Extra: smb
35
+ Requires-Dist: pysmb>=1.2.0; extra == "smb"
36
+ Provides-Extra: legacy
37
+ Requires-Dist: boto3>=1.26.0; extra == "legacy"
38
+ Requires-Dist: pymilvus>=2.3.0; extra == "legacy"
39
+ Requires-Dist: milvus-lite; sys_platform != "win32" and extra == "legacy"
40
+ Requires-Dist: qdrant-client>=1.7.0; extra == "legacy"
41
+ Requires-Dist: pysmb>=1.2.0; extra == "legacy"
42
+ Provides-Extra: all
43
+ Requires-Dist: boto3>=1.26.0; extra == "all"
44
+ Requires-Dist: pymilvus>=2.3.0; extra == "all"
45
+ Requires-Dist: milvus-lite; sys_platform != "win32" and extra == "all"
46
+ Requires-Dist: qdrant-client>=1.7.0; extra == "all"
47
+ Requires-Dist: pysmb>=1.2.0; extra == "all"
48
+ Dynamic: license-file
49
+
50
+ # xParse Client
51
+
52
+ <h3 align="center">
53
+ 面向 Agent 和 RAG 的新一代文档处理 Python SDK
54
+ </h3>
55
+
56
+ <div align="center">
57
+
58
+ [![PyPI version](https://badge.fury.io/py/xparse-client.svg)](https://badge.fury.io/py/xparse-client)
59
+ [![Python](https://img.shields.io/pypi/pyversions/xparse-client.svg)](https://pypi.org/project/xparse-client/)
60
+ [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
61
+
62
+ </div>
63
+
64
+ ---
65
+
66
+ ## ⚠️ 重要提示
67
+
68
+ **Pipeline 类已在 v0.3.0 版本中完全移除。**
69
+
70
+ 如果你的代码中使用了 `Pipeline` 类,请立即查看 [完整迁移指南](docs/MIGRATION_FROM_PIPELINE.md)。
71
+
72
+ ---
73
+
74
+ ## 目录
75
+
76
+ - [SDK 安装](#sdk-安装)
77
+ - [快速开始](#快速开始)
78
+ - [核心特性](#核心特性)
79
+ - [错误处理](#错误处理)
80
+ - [资源管理](#资源管理)
81
+ - [配置说明](#配置说明)
82
+ - [认证配置](#认证配置)
83
+ - [数据源配置](#数据源配置)
84
+ - [目的地配置](#目的地配置)
85
+ - [高级配置](#高级配置)
86
+ - [使用示例](#使用示例)
87
+ - [调试与日志](#调试与日志)
88
+ - [本地开发](#本地开发)
89
+ - [相关资源](#相关资源)
90
+
91
+ ---
92
+
93
+ ## SDK 安装
94
+
95
+ > [!NOTE]
96
+ > **Python 版本要求**
97
+ >
98
+ > 本 SDK 支持 Python 3.9 及以上版本。一旦某个 Python 版本达到其[官方生命周期结束日期](https://devguide.python.org/versions/),将提供 3 个月的宽限期供用户升级。
99
+
100
+ SDK 支持多种包管理器安装。
101
+
102
+ ### uv(推荐)
103
+
104
+ [uv](https://docs.astral.sh/uv/) 是一个快速的 Python 包管理器,推荐用于现代 Python 项目。
105
+
106
+ ```bash
107
+ uv add xparse-client
108
+ ```
109
+
110
+ ### pip
111
+
112
+ ```bash
113
+ pip install xparse-client
114
+ ```
115
+
116
+ ### poetry
117
+
118
+ ```bash
119
+ poetry add xparse-client
120
+ ```
121
+
122
+ ### 可选依赖
123
+
124
+ 根据使用的 Source / Destination 类型安装额外依赖(可用的 extra:`s3`、`milvus`、`qdrant`、`smb`、`all`):
125
+
126
+ ```bash
127
+ # Milvus 向量数据库支持
128
+ pip install xparse-client[milvus]
129
+
130
+ # Qdrant 向量数据库支持
131
+ pip install xparse-client[qdrant]
132
+
133
+ # 完整安装(包含所有可选依赖)
134
+ pip install xparse-client[all]
135
+ ```
136
+
137
+ ### Shell 脚本使用
138
+
139
+ 使用 `uv` 可以快速编写独立的 Python 脚本,无需创建完整项目:
140
+
141
+ ```python
142
+ #!/usr/bin/env -S uv run --script
143
+ # /// script
144
+ # requires-python = ">=3.9"
145
+ # dependencies = [
146
+ # "xparse-client",
147
+ # ]
148
+ # ///
149
+
150
+ from xparse_client import XParseClient
151
+
152
+ # 初始化客户端
153
+ client = XParseClient.from_env()
154
+
155
+ # 处理文档
156
+ with open("document.pdf", "rb") as f:
157
+ result = client.parse.partition(file=f, filename="document.pdf")
158
+ print(f"解析出 {len(result.elements)} 个元素")
159
+ ```
160
+
161
+ 保存为 `process.py` 后,直接运行:
162
+
163
+ ```bash
164
+ uv run process.py
165
+ ```
166
+
167
+ ---
168
+
169
+ ## 快速开始
170
+
171
+ ### 1. 环境配置
172
+
173
+ 首先设置 TextIn 认证信息:
174
+
175
+ ```bash
176
+ export TEXTIN_APP_ID="your-app-id"
177
+ export TEXTIN_SECRET_CODE="your-secret-code"
178
+ ```
179
+
180
+ 可以在 [TextIn 开发者控制台](https://www.textin.com/console/dashboard/setting) 获取认证凭证。
181
+
182
+ ### 2. 单文件处理
183
+
184
+ **场景 1:解析文档**
185
+
186
+ ```python
187
+ from xparse_client import XParseClient
188
+ from xparse_client.models import ParseConfig
189
+
190
+ client = XParseClient.from_env()
191
+
192
+ # 解析文档
193
+ with open("document.pdf", "rb") as f:
194
+ result = client.parse.partition(
195
+ file=f,
196
+ filename="document.pdf",
197
+ config=ParseConfig(provider="textin")
198
+ )
199
+
200
+ print(f"解析出 {len(result.elements)} 个元素")
201
+ ```
202
+
203
+ **场景 2:提取结构化数据**
204
+
205
+ ```python
206
+ from xparse_client.models import ExtractConfig
207
+
208
+ # 定义提取 schema
209
+ schema = {
210
+ "type": "object",
211
+ "properties": {
212
+ "title": {"type": "string", "description": "文档标题"},
213
+ "author": {"type": "string", "description": "作者"},
214
+ "date": {"type": "string", "description": "日期"}
215
+ },
216
+ "required": ["title"]
217
+ }
218
+
219
+ with open("document.pdf", "rb") as f:
220
+ result = client.extract.extract(
221
+ file=f,
222
+ filename="document.pdf",
223
+ config=ExtractConfig(schema=schema)
224
+ )
225
+
226
+ print(result.extracted_data)
227
+ ```
228
+
229
+ ### 3. 本地批处理
230
+
231
+ 批量处理本地文件并写入向量数据库:
232
+
233
+ ```python
234
+ from xparse_client import XParseClient
235
+ from xparse_client.connectors import LocalSource, MilvusDestination
236
+ from xparse_client.models import ParseStage, ChunkStage, EmbedStage
237
+ from xparse_client.models import ParseConfig, ChunkConfig, EmbedConfig
238
+
239
+ client = XParseClient.from_env()
240
+
241
+ result = client.local.run_workflow(
242
+ source=LocalSource(directory="./docs", pattern=["*.pdf"]),
243
+ destination=MilvusDestination(
244
+ db_path="./vectors.db",
245
+ collection_name="documents",
246
+ dimension=1024
247
+ ),
248
+ stages=[
249
+ ParseStage(config=ParseConfig(provider="textin")),
250
+ ChunkStage(config=ChunkConfig(strategy="by_title")),
251
+ EmbedStage(config=EmbedConfig(provider="qwen"))
252
+ ]
253
+ )
254
+
255
+ print(f"处理完成: {result.success}/{result.total}")
256
+ ```
257
+
258
+ ### 4. 异步任务处理
259
+
260
+ 处理大文件时使用服务端异步任务:
261
+
262
+ ```python
263
+ # 创建异步任务
264
+ with open("large_document.pdf", "rb") as f:
265
+ job = client.parse.create_async_job(
266
+ file=f,
267
+ filename="large_document.pdf",
268
+ config=ParseConfig(provider="textin")
269
+ )
270
+
271
+ print(f"任务已创建: {job.job_id}")
272
+
273
+ # 等待任务完成
274
+ result = client.parse.wait_for_result(
275
+ job_id=job.job_id,
276
+ timeout=300.0,
277
+ poll_interval=5.0
278
+ )
279
+
280
+ print(f"任务完成,解析出 {len(result.result.elements)} 个元素")
281
+ ```
282
+
283
+ ### 同步 vs 异步
284
+
285
+ ```python
286
+ # 同步调用(适合小文件)
287
+ result = client.parse.partition(file=f, filename="doc.pdf")
288
+
289
+ # 异步调用(相同接口,加上 _async 后缀)
290
+ import asyncio
291
+
292
+ async def main():
293
+ result = await client.parse.partition_async(file=f, filename="doc.pdf")
294
+
295
+ asyncio.run(main())
296
+ ```
297
+
298
+ ---
299
+
300
+ ## 核心特性
301
+
302
+ ### 📥 灵活的数据源
303
+
304
+ - **S3 兼容存储**:支持 MinIO、AWS S3、阿里云 OSS、腾讯云 COS、火山引擎 TOS、华为云 OBS
305
+ - **本地文件系统**:支持本地目录批量处理
306
+ - **FTP/SMB**:支持 FTP 和 SMB 协议文件系统
307
+
308
+ 详细配置请查看:[云厂商配置指南](docs/CLOUD_PROVIDERS.md)
309
+
310
+ ### 📤 灵活的输出目的地
311
+
312
+ - **向量数据库**:Milvus、Zilliz Cloud、Qdrant
313
+ - **文件存储**:本地文件系统、S3 兼容存储
314
+
315
+ ### 🔄 统一的 Pipeline API
316
+
317
+ 使用 `/api/xparse/pipeline` 一次性完成 parse → chunk → embed 全流程:
318
+
319
+ ```python
320
+ result = client.pipeline.execute(
321
+ file=f,
322
+ filename="document.pdf",
323
+ stages=[ParseStage(...), ChunkStage(...), EmbedStage(...)]
324
+ )
325
+ ```
326
+
327
+ ### 📊 详细的处理统计
328
+
329
+ ```python
330
+ print(f"原始元素: {result.stats.original_elements}")
331
+ print(f"分块后: {result.stats.chunked_elements}")
332
+ print(f"向量化: {result.stats.embedded_elements}")
333
+ ```
334
+
335
+ ### 🔧 易于扩展
336
+
337
+ 基于抽象类设计,可轻松添加自定义 Source 和 Destination:
338
+
339
+ ```python
340
+ from xparse_client.connectors.sources import Source
341
+
342
+ class MyCustomSource(Source):
343
+ def list_files(self) -> List[str]:
344
+ # 实现文件列表逻辑
345
+ pass
346
+
347
+ def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
348
+ # 实现文件读取逻辑
349
+ pass
350
+ ```
351
+
352
+ ---
353
+
354
+ ## 错误处理
355
+
356
+ ### 错误类层次结构
357
+
358
+ SDK 提供了完善的错误类型系统:
359
+
360
+ | 错误类 | 说明 | 使用场景 |
361
+ |--------|------|----------|
362
+ | `XParseError` | 基础错误类 | 捕获所有 SDK 错误 |
363
+ | `AuthenticationError` | 认证失败 | app_id 或 secret_code 错误 |
364
+ | `RateLimitError` | 超过速率限制 | 需要等待或提升配额 |
365
+ | `APIError` | API 请求错误 | 服务端错误 |
366
+ | `ValidationError` | 参数验证错误 | 检查输入参数 |
367
+
368
+ ### 错误处理示例
369
+
370
+ ```python
371
+ from xparse_client import XParseClient
372
+ from xparse_client.exceptions import (
373
+ XParseError,
374
+ RateLimitError,
375
+ AuthenticationError,
376
+ APIError
377
+ )
378
+
379
+ client = XParseClient.from_env()
380
+
381
+ try:
382
+ with open("document.pdf", "rb") as f:
383
+ result = client.parse.partition(file=f, filename="document.pdf")
384
+
385
+ except AuthenticationError as e:
386
+ print(f"认证失败: {e.message}")
387
+ print("请检查 TEXTIN_APP_ID 和 TEXTIN_SECRET_CODE 环境变量")
388
+
389
+ except RateLimitError as e:
390
+ print(f"超过速率限制: {e.message}")
391
+ print(f"建议等待 {e.retry_after} 秒后重试")
392
+ # 自动重试
393
+ import time
394
+ time.sleep(e.retry_after)
395
+ with open("document.pdf", "rb") as f: result = client.parse.partition(file=f, filename="document.pdf")  # 原文件句柄已随 with 块关闭,重试前需重新打开
396
+
397
+ except APIError as e:
398
+ print(f"API 错误: {e.message}")
399
+ print(f"请求 ID: {e.request_id}") # 用于技术支持排查
400
+ print(f"状态码: {e.status_code}")
401
+
402
+ except XParseError as e:
403
+ print(f"SDK 错误: {e.message}")
404
+ ```
405
+
406
+ ### 获取请求 ID
407
+
408
+ 当遇到问题需要技术支持时,提供 `request_id` 可以帮助快速定位问题:
409
+
410
+ ```python
411
+ try:
412
+ result = client.parse.partition(...)
413
+ except APIError as e:
414
+ # 记录 request_id
415
+ logger.error(f"请求失败,request_id: {e.request_id}")
416
+ ```
417
+
418
+ ---
419
+
420
+ ## 资源管理
421
+
422
+ `XParseClient` 实现了上下文管理器协议,会自动管理底层 HTTP 连接和资源释放。
423
+
424
+ ### 使用上下文管理器(推荐)
425
+
426
+ ```python
427
+ from xparse_client import XParseClient
428
+
429
+ def main():
430
+ with XParseClient.from_env() as client:
431
+ # 应用逻辑
432
+ result = client.parse.partition(...)
433
+ # 退出时自动关闭连接
434
+ ```
435
+
436
+ ### 异步场景
437
+
438
+ ```python
439
+ async def amain():
440
+ async with XParseClient.from_env() as client:
441
+ result = await client.parse.partition_async(...)
442
+ ```
443
+
444
+ ### 最佳实践
445
+
446
+ **✅ 推荐做法:**
447
+
448
+ ```python
449
+ # 长时间运行的程序,复用 client 实例
450
+ with XParseClient.from_env() as client:
451
+ for file_path in file_list:
452
+ result = client.parse.partition(...)
453
+ ```
454
+
455
+ **⚠️ 不推荐:**
456
+
457
+ ```python
458
+ # 避免频繁创建和销毁 client
459
+ for file_path in file_list:
460
+ client = XParseClient.from_env() # ❌ 每次都创建新实例
461
+ result = client.parse.partition(...)
462
+ ```
463
+
464
+ ### 何时可以不使用上下文管理器
465
+
466
+ - 短生命周期脚本(处理单个文件后即退出)
467
+ - Jupyter Notebook 交互式环境
468
+ - 快速原型开发
469
+
470
+ 在这些场景下,Python 的垃圾回收机制会自动清理资源:
471
+
472
+ ```python
473
+ # 短脚本中可以不使用 with
474
+ client = XParseClient.from_env()
475
+ result = client.parse.partition(...)
476
+ # 脚本退出时自动清理
477
+ ```
478
+
479
+ ---
480
+
481
+ ## 配置说明
482
+
483
+ ### 认证配置
484
+
485
+ **方式 1:环境变量(推荐)**
486
+
487
+ ```bash
488
+ export TEXTIN_APP_ID="your-app-id"
489
+ export TEXTIN_SECRET_CODE="your-secret-code"
490
+ ```
491
+
492
+ ```python
493
+ client = XParseClient.from_env()
494
+ ```
495
+
496
+ **方式 2:直接传参**
497
+
498
+ ```python
499
+ client = XParseClient(
500
+ app_id="your-app-id",
501
+ secret_code="your-secret-code"
502
+ )
503
+ ```
504
+
505
+ **方式 3:配置文件**(需额外安装 `python-dotenv`:`pip install python-dotenv`)
506
+
507
+ ```python
508
+ import os
509
+ from dotenv import load_dotenv
510
+
511
+ load_dotenv() # 从 .env 文件加载
512
+ client = XParseClient.from_env()
513
+ ```
514
+
515
+ `.env` 文件内容:
516
+
517
+ ```bash
518
+ TEXTIN_APP_ID=your-app-id
519
+ TEXTIN_SECRET_CODE=your-secret-code
520
+ ```
521
+
522
+ ### 数据源配置
523
+
524
+ #### 本地文件系统
525
+
526
+ ```python
527
+ from xparse_client.connectors import LocalSource
528
+
529
+ source = LocalSource(
530
+ directory='./documents',
531
+ pattern=['*.pdf', '*.docx'], # 可选,文件类型过滤
532
+ recursive=True # 可选,递归子目录
533
+ )
534
+ ```
535
+
536
+ #### S3 兼容存储
537
+
538
+ ```python
539
+ from xparse_client.connectors import S3Source
540
+
541
+ source = S3Source(
542
+ endpoint='https://s3.amazonaws.com',
543
+ access_key='your-access-key',
544
+ secret_key='your-secret-key',
545
+ bucket='my-bucket',
546
+ prefix='documents/', # 可选,指定前缀
547
+ region='us-east-1',
548
+ pattern=['*.pdf'], # 可选,文件类型过滤
549
+ recursive=True # 可选,递归子目录
550
+ )
551
+ ```
552
+
553
+ **支持的云厂商:**
554
+ - AWS S3
555
+ - MinIO
556
+ - 阿里云 OSS
557
+ - 腾讯云 COS
558
+ - 火山引擎 TOS
559
+ - 华为云 OBS
560
+
561
+ 详细配置请查看:[云厂商配置指南](docs/CLOUD_PROVIDERS.md)
562
+
563
+ #### FTP 数据源
564
+
565
+ ```python
566
+ from xparse_client.connectors import FtpSource
567
+
568
+ source = FtpSource(
569
+ host='ftp.example.com',
570
+ port=21,
571
+ username='user',
572
+ password='pass',
573
+ pattern=['*.pdf'],
574
+ recursive=True
575
+ )
576
+ ```
577
+
578
+ #### SMB 数据源
579
+
580
+ ```python
581
+ from xparse_client.connectors import SmbSource
582
+
583
+ source = SmbSource(
584
+ host='smb.example.com',
585
+ share_name='documents',
586
+ username='user',
587
+ password='pass',
588
+ domain='WORKGROUP',
589
+ pattern=['**/*.pdf'],
590
+ recursive=True
591
+ )
592
+ ```
593
+
594
+ ### 目的地配置
595
+
596
+ #### 本地文件系统
597
+
598
+ ```python
599
+ from xparse_client.connectors import LocalDestination
600
+
601
+ destination = LocalDestination(
602
+ output_dir='./output'
603
+ )
604
+ ```
605
+
606
+ 输出文件格式:`{filename}_{timestamp}.json`
607
+
608
+ #### Milvus 本地数据库
609
+
610
+ ```python
611
+ from xparse_client.connectors import MilvusDestination
612
+
613
+ destination = MilvusDestination(
614
+ db_path='./vectors.db',
615
+ collection_name='documents',
616
+ dimension=1024 # 必须与 embed 模型维度一致
617
+ )
618
+ ```
619
+
620
+ **Collection 必需字段:**
621
+ - `element_id` - 元素唯一标识
622
+ - `text` - 文本内容
623
+ - `embeddings` - 向量
624
+ - `record_id` - 记录 ID
625
+
626
+ #### Zilliz Cloud
627
+
628
+ ```python
629
+ destination = MilvusDestination(
630
+ db_path='https://your-instance.cloud.zilliz.com.cn',
631
+ collection_name='documents',
632
+ dimension=1024,
633
+ api_key='your-api-key'
634
+ )
635
+ ```
636
+
637
+ #### Qdrant
638
+
639
+ ```python
640
+ from xparse_client.connectors import QdrantDestination
641
+
642
+ # 本地 Qdrant
643
+ destination = QdrantDestination(
644
+ url='http://localhost:6333',
645
+ collection_name='documents',
646
+ dimension=1024
647
+ )
648
+
649
+ # Qdrant Cloud
650
+ destination = QdrantDestination(
651
+ url='https://your-cluster.cloud.qdrant.io',
652
+ collection_name='documents',
653
+ dimension=1024,
654
+ api_key='your-api-key'
655
+ )
656
+ ```
657
+
658
+ #### S3 兼容存储
659
+
660
+ ```python
661
+ from xparse_client.connectors import S3Destination
662
+
663
+ destination = S3Destination(
664
+ endpoint='https://s3.amazonaws.com',
665
+ access_key='your-access-key',
666
+ secret_key='your-secret-key',
667
+ bucket='my-bucket',
668
+ prefix='output/',
669
+ region='us-east-1'
670
+ )
671
+ ```
672
+
673
+ ### 高级配置
674
+
675
+ #### 超时和重试
676
+
677
+ ```python
678
+ client = XParseClient(
679
+ app_id="...",
680
+ secret_code="...",
681
+ timeout=120.0, # 请求超时时间(秒),默认 630
682
+ max_retries=3, # 最大重试次数,默认 3
683
+ retry_delay=1.0 # 重试间隔(秒),默认 1.0
684
+ )
685
+ ```
686
+
687
+ #### 自定义 API 地址
688
+
689
+ ```python
690
+ client = XParseClient(
691
+ app_id="...",
692
+ secret_code="...",
693
+ base_url="https://custom-api.example.com/api/xparse"
694
+ )
695
+ ```
696
+
697
+ #### 自定义 HTTP 客户端
698
+
699
+ ```python
700
+ import httpx
701
+
702
+ http_client = httpx.Client(
703
+ headers={"x-custom-header": "value"},
704
+ proxy="http://proxy.example.com:8080"  # httpx >= 0.26 使用 proxy=;更早版本请使用 proxies=
705
+ )
706
+
707
+ client = XParseClient(
708
+ app_id="...",
709
+ secret_code="...",
710
+ client=http_client
711
+ )
712
+ ```
713
+
714
+ ---
715
+
716
+ ## 使用示例
717
+
718
+ 完整示例代码请查看 [example/](example/) 目录。
719
+
720
+ ### 示例 1:基础 API 使用
721
+
722
+ 解析文档、提取结构化数据、Pipeline 完整流程。
723
+
724
+ ```python
725
+ from xparse_client import XParseClient
726
+ from xparse_client.models import ParseConfig
727
+
728
+ client = XParseClient.from_env()
729
+
730
+ # 解析单个文档
731
+ with open("document.pdf", "rb") as f:
732
+ result = client.parse.partition(
733
+ file=f,
734
+ filename="document.pdf",
735
+ config=ParseConfig(provider="textin")
736
+ )
737
+
738
+ print(f"解析出 {len(result.elements)} 个元素")
739
+ ```
740
+
741
+ 完整示例:[example/1_basic_api_usage.py](example/1_basic_api_usage.py)
742
+
743
+ ### 示例 2:服务端异步任务
744
+
745
+ 处理大文件时使用服务端异步任务,支持轮询和自动等待。
746
+
747
+ ```python
748
+ # 创建异步任务
749
+ job = client.parse.create_async_job(file=f, filename="large.pdf")
750
+
751
+ # 方式 1:自动等待完成
752
+ result = client.parse.wait_for_result(job_id=job.job_id, timeout=300)
753
+
754
+ # 方式 2:手动轮询
755
+ while True:
756
+ status = client.parse.get_result(job_id=job.job_id)
757
+ if status.status == "completed":
758
+ break
759
+ time.sleep(5)
760
+ ```
761
+
762
+ 完整示例:[example/2_async_job.py](example/2_async_job.py)
763
+
764
+ ### 示例 3:本地批处理工作流
765
+
766
+ 批量处理本地文件,支持进度回调和错误处理。
767
+
768
+ ```python
769
+ result = client.local.run_workflow(
770
+ source=LocalSource(directory="./docs", pattern=["*.pdf"]),
771
+ destination=MilvusDestination(db_path="./vectors.db", ...),
772
+ stages=[ParseStage(...), ChunkStage(...), EmbedStage(...)],
773
+ progress_callback=lambda c, t, m: print(f"[{c}/{t}] {m}"),
774
+ on_error="continue", # 遇到错误继续处理
775
+ max_retries=3
776
+ )
777
+
778
+ print(f"成功: {result.success}, 失败: {result.failed}")
779
+ ```
780
+
781
+ 完整示例:[example/3_local_workflow.py](example/3_local_workflow.py)
782
+
783
+ ### 示例 4:生产环境最佳实践
784
+
785
+ 包含错误处理、日志记录、进度回调、自定义 Source 的完整工作流。
786
+
787
+ ```python
788
+ import logging
789
+
790
+ logging.basicConfig(level=logging.INFO)
791
+ logger = logging.getLogger(__name__)
792
+
793
+ # 自定义 Source(按文件大小过滤)
794
+ class FilteredLocalSource(Source):
795
+ def __init__(self, directory, min_size=0, max_size=10*1024*1024):
796
+ self.directory = Path(directory)
797
+ self.min_size = min_size
798
+ self.max_size = max_size
799
+
800
+ def list_files(self) -> List[str]:
801
+ files = []
802
+ for f in self.directory.glob("**/*.pdf"):
803
+ size = f.stat().st_size
804
+ if self.min_size <= size <= self.max_size:
805
+ files.append(str(f))
806
+ return files
807
+
808
+ # ... read_file 实现
809
+ ```
810
+
811
+ 完整示例:[example/4_advanced_workflow.py](example/4_advanced_workflow.py)
812
+
813
+ ---
814
+
815
+ ## 调试与日志
816
+
817
+ ### 启用调试日志
818
+
819
+ ```python
820
+ from xparse_client import XParseClient
821
+ import logging
822
+
823
+ # 配置日志级别
824
+ logging.basicConfig(level=logging.DEBUG)
825
+
826
+ # 传入自定义 logger
827
+ client = XParseClient(
828
+ app_id="...",
829
+ secret_code="...",
830
+ debug_logger=logging.getLogger("xparse_client")
831
+ )
832
+ ```
833
+
834
+ ### 日志级别说明
835
+
836
+ | 级别 | 用途 | 输出内容 |
837
+ |------|------|----------|
838
+ | `DEBUG` | 开发调试 | 详细的请求/响应日志 |
839
+ | `INFO` | 正常运行 | 关键操作日志(默认) |
840
+ | `WARNING` | 警告信息 | 潜在问题提示 |
841
+ | `ERROR` | 错误信息 | 错误详情和堆栈 |
842
+
843
+ ### 日志示例
844
+
845
+ ```python
846
+ import logging
847
+
848
+ # 配置文件日志
849
+ logging.basicConfig(
850
+ level=logging.INFO,
851
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
852
+ handlers=[
853
+ logging.FileHandler('xparse.log'),
854
+ logging.StreamHandler()
855
+ ]
856
+ )
857
+
858
+ client = XParseClient.from_env()
859
+
860
+ # 日志会自动记录关键操作
861
+ result = client.parse.partition(...)
862
+ # 输出: 2024-01-15 10:30:00 - xparse_client - INFO - Parsing document.pdf
863
+ ```
864
+
865
+ ---
866
+
867
+ ## 本地开发
868
+
869
+ ### 环境准备
870
+
871
+ 推荐使用 `uv` 管理开发环境:
872
+
873
+ ```bash
874
+ # 克隆仓库
875
+ git clone https://github.com/intsig-textin/xparse-python-client.git
876
+ cd xparse-python-client
877
+
878
+ # 安装开发依赖
879
+ uv sync --dev
880
+
881
+ # 运行测试
882
+ make test
883
+
884
+ # 代码格式化
885
+ make format
886
+
887
+ # 运行示例
888
+ python example/1_basic_api_usage.py
889
+ ```
890
+
891
+ ### 项目结构
892
+
893
+ ```
894
+ xparse-client/
895
+ ├── xparse_client/ # 主包
896
+ │ ├── __init__.py
897
+ │ ├── _client.py # XParseClient 主类
898
+ │ ├── models/ # 数据模型
899
+ │ └── connectors/ # Source/Destination
900
+ ├── tests/ # 测试
901
+ │ ├── unit/ # 单元测试
902
+ │ └── integration/ # 集成测试
903
+ ├── example/ # 示例代码
904
+ ├── docs/ # 文档
905
+ └── Makefile # 开发命令
906
+ ```
907
+
908
+ ### 常用命令
909
+
910
+ ```bash
911
+ # 运行所有测试
912
+ make test
913
+
914
+ # 运行单元测试
915
+ make test-unit
916
+
917
+ # 代码覆盖率
918
+ make test-cov
919
+
920
+ # 代码格式化
921
+ make format
922
+
923
+ # 代码检查
924
+ make lint
925
+
926
+ # 清理缓存
927
+ make clean
928
+ ```
929
+
930
+ ### 贡献流程
931
+
932
+ 1. Fork 本仓库
933
+ 2. 创建特性分支:`git checkout -b feature/amazing-feature`
934
+ 3. 编写代码和测试
935
+ 4. 确保测试通过:`make test`
936
+ 5. 提交更改:`git commit -m 'Add amazing feature'`
937
+ 6. 推送到分支:`git push origin feature/amazing-feature`
938
+ 7. 提交 Pull Request
939
+
940
+ ### 测试要求
941
+
942
+ - 所有新功能必须包含单元测试
943
+ - 确保所有测试通过:`make test`
944
+ - 代码覆盖率不低于 80%:`make test-cov`
945
+ - 遵循代码风格规范:`make format && make lint`
946
+
947
+ ---
948
+
949
+ ## 版本成熟度
950
+
951
+ **当前版本:v0.3.0b3(Beta 预发布版本)**
952
+
953
+ - ✅ 核心 API 已稳定
954
+ - ✅ 生产环境可用
955
+ - ⚠️ 破坏性变更请查看 [CHANGELOG.md](CHANGELOG.md)
956
+
957
+ 我们遵循[语义化版本规范](https://semver.org/lang/zh-CN/)(SemVer),主版本号变更可能包含破坏性变更。建议锁定版本号:
958
+
959
+ ```bash
960
+ # 锁定主版本
961
+ pip install "xparse-client>=0.3,<1.0"
962
+
963
+ # 锁定次版本
964
+ pip install "xparse-client==0.3.*"
965
+ ```
966
+
967
+ ---
968
+
969
+ ## 相关资源
970
+
971
+ ### 📖 文档
972
+
973
+ - [完整文档](https://docs.textin.com/pipeline/overview) - 官方文档
974
+ - [API 接口规范](docs/API_REFERENCE.md) - Pipeline API 详细说明
975
+ - [云厂商配置指南](docs/CLOUD_PROVIDERS.md) - S3/OSS/COS 等配置
976
+ - [迁移指南](docs/MIGRATION_FROM_PIPELINE.md) - 从 Pipeline 类迁移
977
+
978
+ ### 🔗 链接
979
+
980
+ - [GitHub 仓库](https://github.com/intsig-textin/xparse-python-client)
981
+ - [PyPI 页面](https://pypi.org/project/xparse-client/)
982
+ - [问题反馈](https://github.com/intsig-textin/xparse-python-client/issues)
983
+ - [讨论区](https://github.com/intsig-textin/xparse-python-client/discussions)
984
+ - [更新日志](CHANGELOG.md)
985
+ - [TextIn 开发者控制台](https://www.textin.com/console/dashboard/setting)
986
+
987
+ ### 💰 计费
988
+
989
+ Pipeline 接口按页计费,具体价格请查看:[通用文档解析](https://www.textin.com/market/detail/xparse)
990
+
991
+ ---
992
+
993
+ ## 许可证
994
+
995
+ [MIT License](LICENSE)
996
+
997
+ ---
998
+
999
+ ## 故障排查
1000
+
1001
+ ### 常见问题
1002
+
1003
+ #### 1. 认证失败
1004
+
1005
+ **错误**:`AuthenticationError: Invalid app_id or secret_code`
1006
+
1007
+ **解决方案**:
1008
+ 1. 检查环境变量:`echo $TEXTIN_APP_ID`
1009
+ 2. 登录 [TextIn 控制台](https://www.textin.com/console/dashboard/setting) 确认凭证
1010
+ 3. 确保没有多余的空格或引号
1011
+
1012
+ #### 2. 文件过大
1013
+
1014
+ **错误**:`ValidationError: File size exceeds maximum limit of 100MB`
1015
+
1016
+ **解决方案**:
1017
+ 1. 压缩文件
1018
+ 2. 分割成多个文件
1019
+ 3. 联系技术支持提升限制
1020
+
1021
+ #### 3. 向量维度不匹配
1022
+
1023
+ **错误**:`MilvusException: dimension mismatch`
1024
+
1025
+ **解决方案**:
1026
+ 确保 Destination 的 dimension 参数与 embed 模型一致(当前所有模型都是 1024 维):
1027
+
1028
+ ```python
1029
+ destination = MilvusDestination(
1030
+ dimension=1024 # 必须与 embed 模型维度一致
1031
+ )
1032
+ ```
1033
+
1034
+ #### 4. 连接超时
1035
+
1036
+ **错误**:`TimeoutException: Request timeout`
1037
+
1038
+ **解决方案**:
1039
+ 1. 检查网络连接
1040
+ 2. 增加超时时间:`client = XParseClient(..., timeout=300.0)`
1041
+ 3. 使用异步任务处理大文件
1042
+
1043
+ #### 5. Rate Limit
1044
+
1045
+ **错误**:`RateLimitError: Rate limit exceeded`
1046
+
1047
+ **解决方案**:
1048
+ 1. 等待 `retry_after` 秒后重试
1049
+ 2. 联系客服提升配额
1050
+ 3. 使用批量处理降低请求频率
1051
+
1052
+ ### 获取帮助
1053
+
1054
+ 如果遇到问题,可以通过以下方式获取帮助:
1055
+
1056
+ 1. **查看文档**:[https://docs.textin.com/pipeline/overview](https://docs.textin.com/pipeline/overview)
1057
+ 2. **提交 Issue**:[GitHub Issues](https://github.com/intsig-textin/xparse-python-client/issues)
1058
+ 3. **联系技术支持**:提供 `request_id` 可以加快问题定位
1059
+
1060
+ ```python
1061
+ try:
1062
+ result = client.parse.partition(...)
1063
+ except APIError as e:
1064
+ print(f"请联系技术支持,request_id: {e.request_id}")
1065
+ ```
1066
+
1067
+ ---
1068
+
1069
+ <div align="center">
1070
+
1071
+ **感谢使用 xParse Client!**
1072
+
1073
+ [⭐ Star on GitHub](https://github.com/intsig-textin/xparse-python-client) | [📖 Read the Docs](https://docs.textin.com/pipeline/overview) | [💬 Discussions](https://github.com/intsig-textin/xparse-python-client/discussions)
1074
+
1075
+ </div>