xparse-client 0.2.19__py3-none-any.whl → 0.3.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +215 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b3.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +2 -0
  69. xparse_client/pipeline/__init__.py +0 -3
  70. xparse_client/pipeline/config.py +0 -129
  71. xparse_client/pipeline/destinations.py +0 -489
  72. xparse_client/pipeline/pipeline.py +0 -690
  73. xparse_client/pipeline/sources.py +0 -583
  74. xparse_client-0.2.19.dist-info/METADATA +0 -1050
  75. xparse_client-0.2.19.dist-info/RECORD +0 -11
@@ -0,0 +1,198 @@
1
+ """示例 1: 基础 API 使用
2
+
3
+ 演示 xparse-client 的基本 API 调用:
4
+ - 单文件解析
5
+ - 文档提取
6
+ - Pipeline 执行
7
+
8
+ 运行方式:
9
+ export TEXTIN_APP_ID="your-app-id"
10
+ export TEXTIN_SECRET_CODE="your-secret-code"
11
+ python example/1_basic_api_usage.py
12
+ """
13
+
14
+ import os
15
+ from pathlib import Path
16
+
17
+ from xparse_client import XParseClient
18
+ from xparse_client.models import (
19
+ ChunkConfig,
20
+ ChunkStage,
21
+ EmbedConfig,
22
+ EmbedStage,
23
+ ExtractConfig,
24
+ ParseConfig,
25
+ ParseStage,
26
+ )
27
+
28
+
29
def example_1_parse_single_file():
    """Example 1.1: parse a single file.

    Writes a small text document into the current working directory, sends
    it to ``client.parse.partition`` using the ``textin`` provider, prints
    the number of returned elements, the record ID and a preview of the
    first three elements, and always removes the temporary file afterwards.
    """
    print("\n" + "=" * 60)
    print("示例 1.1: 解析单个文件")
    print("=" * 60)

    # Create the client.
    client = XParseClient.from_env()

    # Create the test file. The encoding is pinned to UTF-8 because the
    # platform default encoding may be unable to represent the CJK sample text.
    test_file = Path("test_document.txt")
    test_file.write_text("这是一个测试文档。\n包含多行内容。", encoding="utf-8")

    try:
        # Parse the document.
        with open(test_file, "rb") as f:
            result = client.parse.partition(
                file=f, filename=test_file.name, config=ParseConfig(provider="textin")
            )

        print("\n✅ 解析成功!")
        print(f" - 元素数量: {len(result.elements)}")
        print(f" - 记录 ID: {result.record_id}")

        # Show the first three elements (text truncated to 50 characters).
        print("\n 前 3 个元素:")
        for i, elem in enumerate(result.elements[:3], 1):
            print(f" {i}. [{elem.type}] {elem.text[:50]}...")

    finally:
        # Remove the temporary file whether or not the request succeeded.
        test_file.unlink(missing_ok=True)
60
+
61
+
62
def example_2_extract_structured_data():
    """Example 1.2: extract structured data from a document.

    Writes a small invoice-like text file, calls ``client.extract.extract``
    with a JSON-schema-style description of the wanted fields, prints the
    ``extracted_schema`` entry of the result when it is present, and always
    removes the temporary file afterwards.
    """
    print("\n" + "=" * 60)
    print("示例 1.2: 提取结构化数据")
    print("=" * 60)

    client = XParseClient.from_env()

    # Create a test file that contains structured information. The encoding
    # is pinned to UTF-8 because the platform default encoding may be unable
    # to represent the CJK sample text.
    test_file = Path("test_invoice.txt")
    test_file.write_text(
        """
发票信息
--------
客户名称: 张三
金额: 1000 元
日期: 2026-01-27
""",
        encoding="utf-8",
    )

    try:
        # Define the extraction schema; no field is marked as required.
        schema = {
            "type": "object",
            "properties": {
                "客户名称": {"type": "string", "description": "客户的名称"},
                "金额": {"type": "number", "description": "发票金额(数字)"},
                "日期": {"type": "string", "description": "发票日期"},
            },
            "required": [],
        }

        with open(test_file, "rb") as f:
            result = client.extract.extract(
                file=f, filename=test_file.name, extract_config=ExtractConfig(schema=schema)
            )

        print("\n✅ 提取成功!")
        # The extracted data is looked up under extract_result["extracted_schema"].
        if result.extract_result and "extracted_schema" in result.extract_result:
            print(f" 提取的数据: {result.extract_result['extracted_schema']}")
        else:
            print(" 未提取到数据")

    finally:
        # Remove the temporary file whether or not the request succeeded.
        test_file.unlink(missing_ok=True)
106
+
107
+
108
def example_3_pipeline_execution():
    """Example 1.3: run a complete pipeline (parse -> chunk -> embed).

    Writes a multi-section text document, runs it through
    ``client.pipeline.execute`` with a parse stage, a chunk stage and an
    embed stage, prints the element counts reported in ``result.stats`` and
    a preview of the first returned element, and always removes the
    temporary file afterwards.
    """
    print("\n" + "=" * 60)
    print("示例 1.3: Pipeline 完整流程 (parse → chunk → embed)")
    print("=" * 60)

    client = XParseClient.from_env()

    # Create the test file. The encoding is pinned to UTF-8 because the
    # platform default encoding may be unable to represent the CJK sample text.
    test_file = Path("test_long_doc.txt")
    test_file.write_text(
        """
第一章 引言

本文档是一个测试文档,用于演示 xparse-client 的 Pipeline 功能。
Pipeline 可以将文档处理流程串联起来,包括解析、分块和向量化。

第二章 功能介绍

xparse-client 提供了统一的 API 入口,支持多种处理方式。
你可以根据需求选择合适的 API。

第三章 总结

希望这个示例对你有帮助!
""",
        encoding="utf-8",
    )

    try:
        # Define the processing stages.
        stages = [
            ParseStage(config=ParseConfig(provider="textin")),
            ChunkStage(
                config=ChunkConfig(
                    strategy="by_title",  # chunk by title
                    max_characters=200,
                    overlap=20,
                )
            ),
            EmbedStage(config=EmbedConfig(provider="qwen", model_name="text-embedding-v3")),
        ]

        with open(test_file, "rb") as f:
            result = client.pipeline.execute(file=f, filename=test_file.name, stages=stages)

        print("\n✅ Pipeline 执行成功!")
        print(f" - 原始元素数: {result.stats.original_elements}")
        print(f" - 分块后元素数: {result.stats.chunked_elements}")
        print(f" - 向量化元素数: {result.stats.embedded_elements}")

        # Show the first embedded result, if any element was returned.
        if result.elements:
            first_elem = result.elements[0]
            print("\n 第一个分块:")
            print(f" - 文本: {first_elem.text[:100]}...")
            # A missing or empty embedding is reported as dimension 0.
            print(f" - 向量维度: {len(first_elem.embeddings) if first_elem.embeddings else 0}")

    finally:
        # Remove the temporary file whether or not the request succeeded.
        test_file.unlink(missing_ok=True)
165
+
166
+
167
def main():
    """Entry point: check credentials, then run the basic API examples in order.

    Returns early with a hint when either ``TEXTIN_APP_ID`` or
    ``TEXTIN_SECRET_CODE`` is unset or empty. Any exception raised by an
    example is reported together with its traceback instead of propagating.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("xparse-client 基础 API 使用示例")
    print(separator)

    # Both credentials must be present before any example can run.
    credentials_ready = all(
        os.getenv(variable) for variable in ("TEXTIN_APP_ID", "TEXTIN_SECRET_CODE")
    )
    if not credentials_ready:
        print("\n❌ 错误: 请先设置环境变量:")
        print(" export TEXTIN_APP_ID='your-app-id'")
        print(" export TEXTIN_SECRET_CODE='your-secret-code'")
        return

    try:
        # Run the examples one after another; the first failure stops the run.
        example_1_parse_single_file()
        example_2_extract_structured_data()
        example_3_pipeline_execution()

        print("\n" + separator)
        print("✅ 所有示例运行完成!")
        print(separator)

    except Exception as exc:
        print(f"\n❌ 错误: {exc}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()
example/2_async_job.py ADDED
@@ -0,0 +1,210 @@
1
+ """示例 2: 服务端异步任务
2
+
3
+ 演示如何使用服务端异步任务处理文档:
4
+ - 创建异步任务
5
+ - 轮询任务状态
6
+ - 等待任务完成
7
+ - 错误处理
8
+
9
+ 运行方式:
10
+ export TEXTIN_APP_ID="your-app-id"
11
+ export TEXTIN_SECRET_CODE="your-secret-code"
12
+ python example/2_async_job.py
13
+ """
14
+
15
+ import os
16
+ import time
17
+ from pathlib import Path
18
+
19
+ from xparse_client import XParseClient
20
+ from xparse_client.models import ParseConfig
21
+
22
+
23
def example_1_create_and_wait():
    """Example 2.1: create an async job and wait for it to finish.

    Writes a test document, submits it with
    ``client.parse.create_async_job``, then blocks in
    ``client.parse.wait_for_result`` (60 s timeout, 2 s poll interval) and
    prints the outcome. The temporary file is always removed afterwards.
    """
    print("\n" + "="*60)
    print("示例 2.1: 创建异步任务并等待完成")
    print("="*60)

    client = XParseClient.from_env()

    # Create the test file. The encoding is pinned to UTF-8 because the
    # platform default encoding may be unable to represent the CJK sample text.
    test_file = Path("test_async.txt")
    test_file.write_text("这是一个异步处理的测试文档。" * 100, encoding="utf-8")

    try:
        # Create the async job.
        print("\n📤 创建异步任务...")
        with open(test_file, "rb") as f:
            job = client.parse.create_async_job(
                file=f,
                filename=test_file.name,
                config=ParseConfig(provider="textin")
            )

        print("✅ 任务已创建:")
        print(f" - 任务 ID: {job.job_id}")

        # Wait for the job to finish.
        print("\n⏳ 等待任务完成...")
        result = client.parse.wait_for_result(
            job_id=job.job_id,
            timeout_seconds=60.0,
            poll_interval_seconds=2.0
        )

        if result.is_completed:
            print("\n✅ 任务完成!")
            print(f" - 任务 ID: {result.job_id}")
            print(f" - 文件 ID: {result.file_id}")
            print(f" - 结果 URL: {result.result_url}")
            print("\n 💡 提示: 需要下载 result_url 来获取解析结果")
        elif result.is_failed:
            print(f"\n❌ 任务失败: {result.error_message}")
        else:
            # Neither completed nor failed: report the last observed status.
            print(f"\n⏸️ 任务超时,当前状态: {result.status}")

    finally:
        # Remove the temporary file whether or not the job succeeded.
        test_file.unlink(missing_ok=True)
69
+
70
+
71
def example_2_manual_polling():
    """Example 2.2: poll the job status manually.

    Submits a test document as an async job and then checks
    ``client.parse.get_result`` up to ``max_attempts`` times, sleeping
    ``poll_interval`` seconds after every check that finds the job still
    unfinished. Stops early on completion or failure and prints a timeout
    message when all attempts are used up. The temporary file is always
    removed afterwards.
    """
    print("\n" + "="*60)
    print("示例 2.2: 手动轮询任务状态")
    print("="*60)

    client = XParseClient.from_env()

    # Create the test file. The encoding is pinned to UTF-8 because the
    # platform default encoding may be unable to represent the CJK sample text.
    test_file = Path("test_manual_poll.txt")
    test_file.write_text("手动轮询测试文档。" * 50, encoding="utf-8")

    try:
        # Create the async job.
        with open(test_file, "rb") as f:
            job = client.parse.create_async_job(
                file=f,
                filename=test_file.name,
                config=ParseConfig(provider="textin")
            )

        print(f"✅ 任务已创建: {job.job_id}")

        # Manual polling.
        max_attempts = 30
        poll_interval = 2.0

        for attempt in range(max_attempts):
            print(f"\n⏳ 第 {attempt + 1}/{max_attempts} 次检查...")

            # Query the job status.
            status = client.parse.get_result(job_id=job.job_id)

            if status.is_completed:
                print("\n✅ 任务完成!")
                # Only `attempt` sleeps have happened before this check, so
                # that is the best estimate of the time spent waiting.
                print(f" - 耗时: ~{attempt * poll_interval} 秒")
                print(f" - 结果 URL: {status.result_url}")
                break

            if status.is_failed:
                print(f"\n❌ 任务失败: {status.error_message}")
                break

            if status.status == "scheduled":
                print(" 状态: 队列中...")
            elif status.status == "in_progress":
                print(" 状态: 处理中...")
            else:
                # Unrecognised non-terminal status: show it verbatim.
                print(f" 状态: {status.status}")

            # Back off after every non-terminal check, including unrecognised
            # statuses, so the loop never issues back-to-back requests.
            time.sleep(poll_interval)

        else:
            # The loop finished without a break: the job never reached a
            # terminal state within the allotted attempts.
            print(f"\n⏸️ 超时: 任务未在 {max_attempts * poll_interval} 秒内完成")

    finally:
        # Remove the temporary file whether or not the job succeeded.
        test_file.unlink(missing_ok=True)
127
+
128
+
129
def example_3_error_handling():
    """Example 2.3: error handling for async jobs.

    Submits an ordinary job and waits for it with a deliberately tiny
    timeout (0.5 s) to demonstrate how a failed wait can be handled. Any
    exception raised while waiting is caught and reported by type name.
    The temporary file is always removed afterwards.
    """
    print("\n" + "="*60)
    print("示例 2.3: 异步任务错误处理")
    print("="*60)

    client = XParseClient.from_env()

    # Create an ordinary job to demonstrate timeout handling. The encoding is
    # pinned to UTF-8 because the platform default encoding may be unable to
    # represent the CJK sample text.
    test_file = Path("test_timeout.txt")
    test_file.write_text("测试超时处理", encoding="utf-8")

    try:
        print("\n📤 创建任务并设置很短的超时时间...")
        with open(test_file, "rb") as f:
            job = client.parse.create_async_job(
                file=f,
                filename=test_file.name,
                config=ParseConfig(provider="textin")
            )

        print(f"✅ 任务已创建: {job.job_id}")

        # Use a very short timeout to demonstrate timeout handling.
        try:
            result = client.parse.wait_for_result(
                job_id=job.job_id,
                timeout_seconds=0.5,  # very short timeout
                poll_interval_seconds=0.2
            )

            if result.is_completed:
                print("\n✅ 任务很快完成了!")
            elif result.is_failed:
                print("\n❌ 任务失败:")
                print(f" - 错误信息: {result.error_message}")
            else:
                # The wait returned without a terminal state; report it
                # rather than finishing silently.
                print(f"\n⏸️ 任务超时,当前状态: {result.status}")

        except Exception as e:
            # NOTE(review): this catches every exception, not only timeouts;
            # narrow it to the client's timeout exception once its name is
            # confirmed against xparse_client.exceptions.
            print(f"\n⏸️ 捕获到超时异常: {type(e).__name__}")
            print(" 这是正常的,演示了如何处理超时情况")

    finally:
        # Remove the temporary file whether or not the job succeeded.
        test_file.unlink(missing_ok=True)
172
+
173
+
174
def main():
    """Entry point: check credentials, print usage tips, run the async-job examples.

    Returns early with a hint when either ``TEXTIN_APP_ID`` or
    ``TEXTIN_SECRET_CODE`` is unset or empty. Any exception raised by an
    example is reported together with its traceback instead of propagating.
    """
    separator = "="*60
    print("\n" + separator)
    print("xparse-client 服务端异步任务示例")
    print(separator)

    # Both credentials must be present before any example can run.
    credentials_ready = all(
        os.getenv(variable) for variable in ("TEXTIN_APP_ID", "TEXTIN_SECRET_CODE")
    )
    if not credentials_ready:
        print("\n❌ 错误: 请先设置环境变量:")
        print(" export TEXTIN_APP_ID='your-app-id'")
        print(" export TEXTIN_SECRET_CODE='your-secret-code'")
        return

    # Usage tips, printed one per line.
    usage_tips = (
        "\n💡 提示:",
        " - 异步任务适合处理大文件或耗时操作",
        " - 创建任务后立即返回,不会阻塞",
        " - 可以使用 wait_for_result() 自动等待",
        " - 也可以手动轮询 get_result() 检查状态",
    )
    for tip in usage_tips:
        print(tip)

    try:
        # Run the examples one after another; the first failure stops the run.
        example_1_create_and_wait()
        example_2_manual_polling()
        example_3_error_handling()

        print("\n" + separator)
        print("✅ 所有示例运行完成!")
        print(separator)

    except Exception as exc:
        print(f"\n❌ 错误: {exc}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
@@ -0,0 +1,300 @@
1
+ """示例 3: 本地批量处理工作流
2
+
3
+ 演示本地批量处理的完整流程:
4
+ - 从本地目录读取文件
5
+ - 批量调用 Pipeline API
6
+ - 写入本地或向量数据库
7
+ - 查看处理统计
8
+
9
+ 运行方式:
10
+ export TEXTIN_APP_ID="your-app-id"
11
+ export TEXTIN_SECRET_CODE="your-secret-code"
12
+ python example/3_local_workflow.py
13
+ """
14
+
15
+ import os
16
+ from pathlib import Path
17
+
18
+ from xparse_client import XParseClient
19
+ from xparse_client.connectors import LocalDestination, LocalSource
20
+ from xparse_client.models import (
21
+ ChunkConfig,
22
+ ChunkStage,
23
+ EmbedConfig,
24
+ EmbedStage,
25
+ ParseConfig,
26
+ ParseStage,
27
+ )
28
+
29
+
30
def example_1_basic_workflow():
    """Example 3.1: basic local batch processing.

    Creates three text files under ``./test_docs``, runs them through
    ``client.local.run_workflow`` (parse -> chunk -> embed) with a
    ``LocalSource`` and a ``LocalDestination``, prints the totals reported
    on the result and the JSON files found in ``./test_output``, and finally
    deletes both directories.
    """
    import shutil

    print("\n" + "="*60)
    print("示例 3.1: 基础本地批量处理")
    print("="*60)

    client = XParseClient.from_env()

    # Prepare the test directories and files.
    source_dir = Path("./test_docs")
    output_dir = Path("./test_output")
    source_dir.mkdir(exist_ok=True)
    output_dir.mkdir(exist_ok=True)

    # Create several test files. The encoding is pinned to UTF-8 because the
    # platform default encoding may be unable to represent the CJK sample text.
    for i in range(3):
        (source_dir / f"doc_{i+1}.txt").write_text(
            f"这是第 {i+1} 个测试文档。\n" * 10,
            encoding="utf-8",
        )

    try:
        print(f"\n📁 源目录: {source_dir}")
        print(f"📁 输出目录: {output_dir}")

        # Define the processing stages.
        stages = [
            ParseStage(config=ParseConfig(provider="textin")),
            ChunkStage(config=ChunkConfig(
                strategy="basic",
                max_characters=500
            )),
            EmbedStage(config=EmbedConfig(provider="qwen"))
        ]

        # Run the workflow.
        print("\n⏳ 开始处理...")
        result = client.local.run_workflow(
            source=LocalSource(directory=str(source_dir), pattern=["*.txt"]),
            destination=LocalDestination(output_dir=str(output_dir)),
            stages=stages
        )

        # Show the result.
        print("\n✅ 处理完成!")
        print(f" - 总文件数: {result.total}")
        print(f" - 成功: {result.success}")
        print(f" - 失败: {result.failed}")
        print(f" - 耗时: {result.duration:.2f} 秒")

        # List the output files.
        output_files = list(output_dir.glob("*.json"))
        print(f"\n📄 输出文件 ({len(output_files)} 个):")
        for f in output_files:
            print(f" - {f.name}")

    finally:
        # Clean up the test files.
        if source_dir.exists():
            shutil.rmtree(source_dir)
        if output_dir.exists():
            shutil.rmtree(output_dir)
92
+
93
+
94
def example_2_with_progress():
    """Example 3.2: batch processing with a progress callback.

    Creates five text files under ``./test_docs_progress``, runs them
    through ``client.local.run_workflow`` (parse -> embed) while passing
    ``on_progress`` as ``progress_callback``, prints the success rate, and
    finally deletes the source and output directories.
    """
    import shutil

    print("\n" + "="*60)
    print("示例 3.2: 带进度回调的批量处理")
    print("="*60)

    client = XParseClient.from_env()

    # Prepare the test directories and files.
    source_dir = Path("./test_docs_progress")
    output_dir = Path("./test_output_progress")
    source_dir.mkdir(exist_ok=True)
    output_dir.mkdir(exist_ok=True)

    # Create five test files. The encoding is pinned to UTF-8 because the
    # platform default encoding may be unable to represent the CJK sample text.
    for i in range(5):
        (source_dir / f"document_{i+1}.txt").write_text(
            f"文档 {i+1} 的内容。\n" * 20,
            encoding="utf-8",
        )

    try:
        # Define the progress callback.
        def on_progress(current, total, message):
            # Guard against total == 0 so an empty batch cannot raise
            # ZeroDivisionError inside the callback.
            percentage = (current / total) * 100 if total else 0.0
            print(f" [{current}/{total}] {percentage:.1f}% - {message}")

        print(f"\n⏳ 开始处理 {len(list(source_dir.glob('*.txt')))} 个文件...")

        stages = [
            ParseStage(config=ParseConfig(provider="textin")),
            EmbedStage(config=EmbedConfig(provider="qwen"))
        ]

        result = client.local.run_workflow(
            source=LocalSource(directory=str(source_dir)),
            destination=LocalDestination(output_dir=str(output_dir)),
            stages=stages,
            progress_callback=on_progress  # attach the progress callback
        )

        print("\n✅ 全部完成!")
        # Report 0.0% instead of dividing by zero when nothing was processed.
        success_rate = result.success / result.total * 100 if result.total else 0.0
        print(f" 成功率: {result.success}/{result.total} ({success_rate:.1f}%)")

    finally:
        # Clean up.
        if source_dir.exists():
            shutil.rmtree(source_dir)
        if output_dir.exists():
            shutil.rmtree(output_dir)
144
+
145
+
146
def example_3_error_handling():
    """Example 3.3: error-handling strategy for batch processing.

    Creates two ordinary files and one empty file under
    ``./test_docs_errors``, runs ``client.local.run_workflow`` with
    ``on_error="continue"`` and ``max_retries=2``, prints the success and
    failure counts plus every entry of ``result.failed_files``, and finally
    deletes the source and output directories.
    """
    import shutil

    print("\n" + "="*60)
    print("示例 3.3: 错误处理策略")
    print("="*60)

    client = XParseClient.from_env()

    # Prepare the test directories.
    source_dir = Path("./test_docs_errors")
    output_dir = Path("./test_output_errors")
    source_dir.mkdir(exist_ok=True)
    output_dir.mkdir(exist_ok=True)

    # Create ordinary files and one that may fail. The encoding is pinned to
    # UTF-8 because the platform default encoding may be unable to represent
    # the CJK sample text.
    (source_dir / "good_1.txt").write_text("正常文档 1", encoding="utf-8")
    (source_dir / "good_2.txt").write_text("正常文档 2", encoding="utf-8")
    (source_dir / "empty.txt").write_text("", encoding="utf-8")  # an empty file may cause an error

    try:
        print("\n📋 错误处理策略: continue (遇到错误继续处理)")

        stages = [
            ParseStage(config=ParseConfig(provider="textin")),
        ]

        result = client.local.run_workflow(
            source=LocalSource(directory=str(source_dir)),
            destination=LocalDestination(output_dir=str(output_dir)),
            stages=stages,
            on_error="continue",  # keep going when a file fails
            max_retries=2  # retry a failed file up to 2 times
        )

        print("\n✅ 处理完成!")
        print(f" - 成功: {result.success}")
        print(f" - 失败: {result.failed}")

        if result.failed_files:
            print("\n❌ 失败的文件:")
            for failed in result.failed_files:
                print(f" - {failed.file_path}: {failed.error}")

    finally:
        # Clean up.
        if source_dir.exists():
            shutil.rmtree(source_dir)
        if output_dir.exists():
            shutil.rmtree(output_dir)
196
+
197
+
198
def example_4_milvus_workflow():
    """Example 3.4: write embeddings into a Milvus vector database.

    Skips itself when ``MilvusDestination`` cannot be imported. Otherwise it
    creates three text files under ``./test_docs_milvus``, runs them through
    ``client.local.run_workflow`` (parse -> chunk -> embed) with a
    ``MilvusDestination`` pointing at ``./demo_vectors.db``, prints a short
    summary, and finally removes both the source directory and the database
    path, even when the workflow raised.
    """
    import shutil

    print("\n" + "="*60)
    print("示例 3.4: 写入 Milvus 向量数据库")
    print("="*60)

    print("\n💡 提示: 此示例需要安装 Milvus 依赖")
    print(" pip install xparse-client[milvus]")

    try:
        from xparse_client.connectors import MilvusDestination
    except ImportError:
        print("\n⚠️ 跳过此示例: 未安装 pymilvus")
        return

    client = XParseClient.from_env()

    # Prepare the test directory.
    source_dir = Path("./test_docs_milvus")
    source_dir.mkdir(exist_ok=True)

    # Keep the exact string handed to MilvusDestination and derive the
    # cleanup path from it so both always refer to the same location.
    db_location = "./demo_vectors.db"
    db_path = Path(db_location)

    # Create the test files. The encoding is pinned to UTF-8 because the
    # platform default encoding may be unable to represent the CJK sample text.
    for i in range(3):
        (source_dir / f"vector_doc_{i+1}.txt").write_text(
            f"向量数据库测试文档 {i+1}。\n" * 15,
            encoding="utf-8",
        )

    try:
        print("\n📊 目标: Milvus 向量数据库")

        stages = [
            ParseStage(config=ParseConfig(provider="textin")),
            ChunkStage(config=ChunkConfig(
                strategy="by_title",
                max_characters=300
            )),
            EmbedStage(config=EmbedConfig(provider="qwen"))
        ]

        result = client.local.run_workflow(
            source=LocalSource(directory=str(source_dir)),
            destination=MilvusDestination(
                db_path=db_location,  # local Milvus
                collection_name="demo_collection",
                dimension=1024  # embedding dimension of the qwen model, per the original example
            ),
            stages=stages
        )

        print("\n✅ 向量已写入 Milvus!")
        print(" - Collection: demo_collection")
        print(f" - 向量数量: {result.success} 个文档的向量")

    finally:
        # Clean up the source files.
        if source_dir.exists():
            shutil.rmtree(source_dir)
        # Clean up the Milvus data. shutil.rmtree raises on a regular file,
        # and a local Milvus database is presumably a single file, so handle
        # both shapes. Doing this in `finally` also removes the database when
        # the workflow failed part-way.
        if db_path.is_dir():
            shutil.rmtree(db_path)
        else:
            db_path.unlink(missing_ok=True)
261
+
262
+
263
def main():
    """Entry point: check credentials, print usage tips, run the local workflow examples.

    Returns early with a hint when either ``TEXTIN_APP_ID`` or
    ``TEXTIN_SECRET_CODE`` is unset or empty. Any exception raised by an
    example is reported together with its traceback instead of propagating.
    """
    separator = "="*60
    print("\n" + separator)
    print("xparse-client 本地批量处理工作流示例")
    print(separator)

    # Both credentials must be present before any example can run.
    credentials_ready = all(
        os.getenv(variable) for variable in ("TEXTIN_APP_ID", "TEXTIN_SECRET_CODE")
    )
    if not credentials_ready:
        print("\n❌ 错误: 请先设置环境变量:")
        print(" export TEXTIN_APP_ID='your-app-id'")
        print(" export TEXTIN_SECRET_CODE='your-secret-code'")
        return

    # Usage tips, printed one per line.
    usage_tips = (
        "\n💡 提示:",
        " - 本地批量处理会在你的机器上同步执行",
        " - 适合中小规模的批量导入任务",
        " - 支持进度回调和错误处理",
        " - 可以写入本地文件或向量数据库",
    )
    for tip in usage_tips:
        print(tip)

    try:
        # Run the examples one after another; the first failure stops the run.
        example_1_basic_workflow()
        example_2_with_progress()
        example_3_error_handling()
        example_4_milvus_workflow()

        print("\n" + separator)
        print("✅ 所有示例运行完成!")
        print(separator)

    except Exception as exc:
        print(f"\n❌ 错误: {exc}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()