xparse-client 0.2.19__py3-none-any.whl → 0.3.0b8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +188 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +351 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +225 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +132 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b8.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b8.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/top_level.txt +2 -0
  69. xparse_client/pipeline/__init__.py +0 -3
  70. xparse_client/pipeline/config.py +0 -129
  71. xparse_client/pipeline/destinations.py +0 -489
  72. xparse_client/pipeline/pipeline.py +0 -690
  73. xparse_client/pipeline/sources.py +0 -583
  74. xparse_client-0.2.19.dist-info/METADATA +0 -1050
  75. xparse_client-0.2.19.dist-info/RECORD +0 -11
@@ -0,0 +1,327 @@
1
+ """示例 4: 高级批处理工作流
2
+
3
+ 演示高级批处理功能和最佳实践:
4
+ - 从配置文件加载
5
+ - 自定义 Source/Destination
6
+ - 完整的错误处理
7
+ - 日志记录
8
+
9
+ 运行方式:
10
+ export TEXTIN_APP_ID="your-app-id"
11
+ export TEXTIN_SECRET_CODE="your-secret-code"
12
+ python example/4_advanced_workflow.py
13
+ """
14
+
15
+ import json
16
+ import logging
17
+ import os
18
+ from datetime import datetime
19
+ from pathlib import Path
20
+
21
+ from xparse_client import XParseClient
22
+ from xparse_client.connectors import LocalDestination, LocalSource
23
+ from xparse_client.models import (
24
+ ChunkConfig,
25
+ ChunkStage,
26
+ EmbedConfig,
27
+ EmbedStage,
28
+ ParseConfig,
29
+ ParseStage,
30
+ )
31
+
32
# Configure root logging: INFO and above, with timestamp, logger name and level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
38
+
39
+
40
def example_1_config_file():
    """Example 4.1: build workflow stages from a JSON configuration file.

    Writes a sample configuration to ``workflow_config.json`` in the current
    working directory, reads it back, prints the source/destination settings
    and converts every entry under ``"stages"`` into the matching stage model.

    The temporary configuration file is always removed before returning,
    including when client creation or stage construction raises.
    """
    print("\n" + "="*60)
    print("示例 4.1: 从配置文件加载配置")
    print("="*60)

    # Sample configuration that is round-tripped through a file on disk.
    config = {
        "source": {
            "directory": "./data/input",
            "pattern": ["*.pdf", "*.docx", "*.txt"],
            "recursive": False
        },
        "destination": {
            "output_dir": "./data/output"
        },
        "stages": [
            {
                "type": "parse",
                "provider": "textin"
            },
            {
                "type": "chunk",
                "strategy": "by_title",
                "max_characters": 1000,
                "overlap": 50
            },
            {
                "type": "embed",
                "provider": "qwen",
                "model_name": "text-embedding-v3"
            }
        ],
        "options": {
            "on_error": "continue",
            "max_retries": 3
        }
    }

    config_file = Path("workflow_config.json")
    try:
        # ensure_ascii=False may emit non-ASCII text, so pin the encoding
        # instead of relying on the platform default.
        config_file.write_text(
            json.dumps(config, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

        print(f"\n📄 配置文件已创建: {config_file}")
        print("\n配置内容:")
        print(json.dumps(config, indent=2, ensure_ascii=False))

        # Load the configuration back from disk.
        print("\n⏳ 从配置文件加载...")
        loaded_config = json.loads(config_file.read_text(encoding="utf-8"))

        # Create the client; the instance itself is not used by this example.
        # NOTE(review): presumably this validates the environment credentials
        # -- confirm against XParseClient.from_env.
        XParseClient.from_env()

        # Show how source/destination would be configured (nothing is created).
        print("\n✅ 配置加载成功!")
        print(" - Source 配置:")
        print(f" 目录: {loaded_config['source']['directory']}")
        print(f" 模式: {loaded_config['source']['pattern']}")
        print(f" 递归: {loaded_config['source']['recursive']}")
        print(" - Destination 配置:")
        print(f" 输出目录: {loaded_config['destination']['output_dir']}")

        # Translate each stage entry into its model; unknown types are skipped.
        stages = []
        for stage_config in loaded_config["stages"]:
            if stage_config["type"] == "parse":
                stages.append(ParseStage(
                    config=ParseConfig(provider=stage_config["provider"])
                ))
            elif stage_config["type"] == "chunk":
                stages.append(ChunkStage(
                    config=ChunkConfig(
                        strategy=stage_config["strategy"],
                        max_characters=stage_config.get("max_characters", 1024),
                        overlap=stage_config.get("overlap", 0)
                    )
                ))
            elif stage_config["type"] == "embed":
                stages.append(EmbedStage(
                    config=EmbedConfig(
                        provider=stage_config["provider"],
                        model_name=stage_config.get("model_name", "text-embedding-v3")
                    )
                ))

        print(f" - Stages: {len(stages)} 个")
        for i, stage in enumerate(stages, 1):
            print(f" {i}. {stage.type}")
    finally:
        # Clean up even when something above failed.
        config_file.unlink(missing_ok=True)
131
+
132
+
133
def example_2_custom_source():
    """Example 4.2: a custom Source that filters files by size.

    Defines ``FilteredLocalSource``, creates three ``.txt`` fixtures of
    different sizes in ``./test_custom_source``, lists the ones whose size is
    within 100-5000 bytes and prints the result. The fixture directory is
    removed afterwards, including when fixture creation or listing fails.
    """
    print("\n" + "="*60)
    print("示例 4.2: 自定义 Source - 按文件大小过滤")
    print("="*60)

    import shutil
    from typing import Any

    from xparse_client.connectors.sources import Source

    class FilteredLocalSource(Source):
        """Custom Source that only yields ``.txt`` files within a size range."""

        def __init__(self, directory: str, min_size: int = 0, max_size: int = 10*1024*1024):
            # Size bounds are inclusive and expressed in bytes.
            self.directory = Path(directory)
            self.min_size = min_size
            self.max_size = max_size

        def list_files(self) -> list[str]:
            """Return paths of ``.txt`` files whose size is within the bounds."""
            files = []
            for f in self.directory.glob("**/*.txt"):
                # A directory may also match the pattern; only real files count.
                if not f.is_file():
                    continue
                if self.min_size <= f.stat().st_size <= self.max_size:
                    files.append(str(f))
            return files

        def read_file(self, file_path: str) -> tuple[bytes, dict[str, Any]]:
            """Return the file's bytes plus a metadata dict (url, size, modified)."""
            path = Path(file_path)
            content = path.read_bytes()

            # One stat() call so size and mtime come from the same snapshot.
            info = path.stat()
            data_source = {
                "url": f"file://{path.absolute()}",
                "size": info.st_size,
                "modified": info.st_mtime,
            }

            return content, data_source

    # Prepare the test directory.
    test_dir = Path("./test_custom_source")

    try:
        test_dir.mkdir(exist_ok=True)

        # Create files of different sizes. The encoding is pinned so the byte
        # sizes (and therefore the filter result) do not depend on the locale,
        # and so writing CJK text cannot fail on non-CJK code pages.
        (test_dir / "small.txt").write_text("小", encoding="utf-8")
        (test_dir / "medium.txt").write_text("中等" * 100, encoding="utf-8")
        (test_dir / "large.txt").write_text("大" * 10000, encoding="utf-8")

        print(f"\n📁 测试目录: {test_dir}")
        print(f" - small.txt: {(test_dir / 'small.txt').stat().st_size} bytes")
        print(f" - medium.txt: {(test_dir / 'medium.txt').stat().st_size} bytes")
        print(f" - large.txt: {(test_dir / 'large.txt').stat().st_size} bytes")

        # Use the custom Source (only files of 100 - 5000 bytes are kept).
        source = FilteredLocalSource(
            directory=str(test_dir),
            min_size=100,
            max_size=5000
        )

        files = source.list_files()
        print(f"\n✅ 过滤结果: {len(files)} 个文件符合条件")
        for f in files:
            print(f" - {Path(f).name}")

    finally:
        # Clean up the fixtures.
        if test_dir.exists():
            shutil.rmtree(test_dir)
205
+
206
+
207
def example_3_production_workflow():
    """Example 4.3: production-style workflow with error handling and logging.

    Creates three text fixtures in ``./test_production``, attaches a file
    handler writing to ``./test_production_logs``, runs a parse -> chunk ->
    embed workflow through ``client.local.run_workflow`` and reports the
    totals both on stdout and through the module logger.

    Raises:
        Exception: any error raised during setup or by the workflow is logged
            with its traceback and re-raised.

    The file handler is always detached and closed, and the three working
    directories are always removed, before the function returns or raises.
    """
    import shutil

    print("\n" + "="*60)
    print("示例 4.3: 生产环境工作流")
    print("="*60)

    client = XParseClient.from_env()

    # Working directories for this run.
    source_dir = Path("./test_production")
    output_dir = Path("./test_production_output")
    log_dir = Path("./test_production_logs")

    file_handler = None

    try:
        source_dir.mkdir(exist_ok=True)
        output_dir.mkdir(exist_ok=True)
        log_dir.mkdir(exist_ok=True)

        # Create the test files (UTF-8 so the CJK content is locale-independent).
        for i in range(3):
            (source_dir / f"prod_doc_{i+1}.txt").write_text(
                f"生产文档 {i+1}\n" * 50, encoding="utf-8"
            )

        # Set up the per-run log file.
        log_file = log_dir / f"workflow_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        file_handler = logging.FileHandler(log_file, encoding="utf-8")
        file_handler.setLevel(logging.INFO)
        logger.addHandler(file_handler)

        logger.info(f"开始工作流: {datetime.now()}")
        logger.info(f"源目录: {source_dir}")
        logger.info(f"输出目录: {output_dir}")

        # Progress callback: mirror every update to the log and to stdout.
        def on_progress(current, total, message):
            logger.info(f"进度: [{current}/{total}] {message}")
            print(f" 📊 [{current}/{total}] {message}")

        # Define the stages.
        stages = [
            ParseStage(config=ParseConfig(provider="textin")),
            ChunkStage(config=ChunkConfig(strategy="basic")),
            EmbedStage(config=EmbedConfig(provider="qwen"))
        ]

        # Run the workflow.
        print(f"\n⏳ 开始处理 (日志: {log_file})...")

        result = client.local.run_workflow(
            source=LocalSource(directory=str(source_dir)),
            destination=LocalDestination(output_dir=str(output_dir)),
            stages=stages,
            progress_callback=on_progress,
            on_error="continue",
            max_retries=2
        )

        # Record the outcome.
        logger.info(f"工作流完成: 成功 {result.success}, 失败 {result.failed}")
        logger.info(f"耗时: {result.duration:.2f} 秒")

        print("\n✅ 工作流完成!")
        print(f" - 总文件: {result.total}")
        print(f" - 成功: {result.success}")
        print(f" - 失败: {result.failed}")
        print(f" - 耗时: {result.duration:.2f} 秒")
        print(f" - 日志: {log_file}")

        if result.failed_files:
            logger.error(f"失败的文件: {len(result.failed_files)} 个")
            for failed in result.failed_files:
                logger.error(f" - {failed.file_path}: {failed.error}")

    except Exception as e:
        logger.error(f"工作流异常: {e}", exc_info=True)
        raise

    finally:
        # Detach and close the handler first: otherwise it leaks across calls
        # and the still-open log file blocks directory removal on Windows.
        if file_handler is not None:
            logger.removeHandler(file_handler)
            file_handler.close()

        # Clean up the working directories.
        for d in [source_dir, output_dir, log_dir]:
            if d.exists():
                shutil.rmtree(d)
290
+
291
+
292
def main():
    """Run all examples in order and report the outcome.

    Checks that ``TEXTIN_APP_ID`` and ``TEXTIN_SECRET_CODE`` are set, then runs
    the three examples. Any exception raised by an example is printed together
    with its traceback instead of propagating.

    Returns:
        int: ``0`` when every example completed, ``1`` when the credentials are
        missing or an example raised. The value is used as the process exit
        status so that shells and CI can detect a failed run.
    """
    print("\n" + "="*60)
    print("xparse-client 高级批处理工作流示例")
    print("="*60)

    # Check the required environment variables before doing any work.
    if not os.getenv("TEXTIN_APP_ID") or not os.getenv("TEXTIN_SECRET_CODE"):
        print("\n❌ 错误: 请先设置环境变量:")
        print(" export TEXTIN_APP_ID='your-app-id'")
        print(" export TEXTIN_SECRET_CODE='your-secret-code'")
        return 1

    print("\n💡 本示例展示:")
    print(" - 从 JSON 配置文件加载设置")
    print(" - 自定义 Source 实现(按文件大小过滤)")
    print(" - 生产环境级别的错误处理和日志")

    try:
        # Run the examples.
        example_1_config_file()
        example_2_custom_source()
        example_3_production_workflow()

        print("\n" + "="*60)
        print("✅ 所有示例运行完成!")
        print("="*60)

    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller.
        print(f"\n❌ 错误: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
example/README.md ADDED
@@ -0,0 +1,128 @@
1
+ # xparse-client 使用示例
2
+
3
+ 本目录包含了 xparse-client SDK 的各种使用示例。
4
+
5
+ ## 📋 示例列表
6
+
7
+ ### 1. 基础 API 使用 - `1_basic_api_usage.py`
8
+
9
+ 演示最基本的 API 调用:
10
+ - 单文件解析(Parse API)
11
+ - 文档提取(Extract API)
12
+ - Pipeline 执行(完整流程)
13
+
14
+ **适合**: 初次使用、单文件处理
15
+
16
+ ```bash
17
+ python example/1_basic_api_usage.py
18
+ ```
19
+
20
+ ---
21
+
22
+ ### 2. 服务端异步任务 - `2_async_job.py`
23
+
24
+ 演示服务端异步任务的使用:
25
+ - 创建异步解析任务
26
+ - 轮询任务状态
27
+ - 等待任务完成
28
+ - 错误处理
29
+
30
+ **适合**: 处理大文件、长时间任务
31
+
32
+ ```bash
33
+ python example/2_async_job.py
34
+ ```
35
+
36
+ ---
37
+
38
+ ### 3. 本地批量处理 - `3_local_workflow.py`
39
+
40
+ 演示本地批量处理工作流:
41
+ - 从本地目录读取文件
42
+ - 批量调用 Pipeline API
43
+ - 写入 Milvus 向量数据库
44
+ - 查看处理统计
45
+
46
+ **适合**: 批量导入文档到向量数据库
47
+
48
+ ```bash
49
+ python example/3_local_workflow.py
50
+ ```
51
+
52
+ ---
53
+
54
+ ### 4. 高级批处理 - `4_advanced_workflow.py`
55
+
56
+ 演示高级批处理功能:
57
+ - 进度回调
58
+ - 错误处理策略(stop/continue/retry)
59
+ - 自定义 Source/Destination
60
+ - 从配置文件加载
61
+
62
+ **适合**: 生产环境、复杂场景
63
+
64
+ ```bash
65
+ python example/4_advanced_workflow.py
66
+ ```
67
+
68
+ ---
69
+
70
+ ## 🔧 环境配置
71
+
72
+ 所有示例都需要配置 TextIn API 凭证。
73
+
74
+ ### 方式 1: 环境变量(推荐)
75
+
76
+ ```bash
77
+ export TEXTIN_APP_ID="your-app-id"
78
+ export TEXTIN_SECRET_CODE="your-secret-code"
79
+ ```
80
+
81
+ ### 方式 2: 修改代码
82
+
83
+ 在示例代码中直接修改:
84
+
85
+ ```python
86
+ client = XParseClient(
87
+ app_id="your-app-id",
88
+ secret_code="your-secret-code"
89
+ )
90
+ ```
91
+
92
+ ## 📝 配置文件示例
93
+
94
+ 查看 `config_example.json` 了解如何从配置文件加载设置。
95
+
96
+ ## 🚀 快速开始
97
+
98
+ 1. 安装依赖
99
+
100
+ ```bash
101
+ pip install xparse-client
102
+ # 或安装所有可选依赖
103
+ pip install "xparse-client[all]"
104
+ ```
105
+
106
+ 2. 配置凭证
107
+
108
+ ```bash
109
+ export TEXTIN_APP_ID="your-app-id"
110
+ export TEXTIN_SECRET_CODE="your-secret-code"
111
+ ```
112
+
113
+ 3. 运行示例
114
+
115
+ ```bash
116
+ # 从基础示例开始
117
+ python example/1_basic_api_usage.py
118
+
119
+ # 然后尝试批量处理
120
+ python example/3_local_workflow.py
121
+ ```
122
+
123
+ ## 📚 更多资源
124
+
125
+ - [完整文档](../README.md)
126
+ - [API 参考](../docs/)
127
+ - [迁移指南](../docs/MIGRATION_FROM_PIPELINE.md)
128
+ - [GitHub Issues](https://github.com/textin/xparse-client/issues)
@@ -0,0 +1,95 @@
1
+ {
2
+ "description": "xparse-client 工作流配置示例",
3
+ "source": {
4
+ "type": "local",
5
+ "directory": "./data/documents",
6
+ "pattern": ["*.pdf", "*.docx", "*.txt"],
7
+ "recursive": true,
8
+ "description": "从本地目录读取文档,支持多种文件类型"
9
+ },
10
+ "destination": {
11
+ "type": "local",
12
+ "output_dir": "./data/output",
13
+ "description": "输出到本地 JSON 文件"
14
+ },
15
+ "stages": [
16
+ {
17
+ "type": "parse",
18
+ "provider": "textin",
19
+ "description": "使用 TextIn 解析文档"
20
+ },
21
+ {
22
+ "type": "chunk",
23
+ "strategy": "by_title",
24
+ "max_characters": 1000,
25
+ "overlap": 50,
26
+ "description": "按标题分块,每块最多 1000 字符,重叠 50 字符"
27
+ },
28
+ {
29
+ "type": "embed",
30
+ "provider": "qwen",
31
+ "model_name": "text-embedding-v3",
32
+ "description": "使用通义千问生成向量"
33
+ }
34
+ ],
35
+ "options": {
36
+ "on_error": "continue",
37
+ "max_retries": 3,
38
+ "description": "遇到错误继续处理,失败重试 3 次"
39
+ },
40
+ "milvus_example": {
41
+ "description": "使用 Milvus 作为目的地的配置示例",
42
+ "destination": {
43
+ "type": "milvus",
44
+ "db_path": "./vectors.db",
45
+ "collection_name": "documents",
46
+ "dimension": 1024,
47
+ "description": "写入本地 Milvus 数据库"
48
+ }
49
+ },
50
+ "s3_example": {
51
+ "description": "使用 S3 作为数据源的配置示例",
52
+ "source": {
53
+ "type": "s3",
54
+ "endpoint": "https://s3.amazonaws.com",
55
+ "access_key": "your-access-key",
56
+ "secret_key": "your-secret-key",
57
+ "bucket": "your-bucket",
58
+ "prefix": "documents/",
59
+ "region": "us-east-1",
60
+ "pattern": ["*.pdf"],
61
+ "description": "从 S3 读取 PDF 文件"
62
+ }
63
+ },
64
+ "extract_example": {
65
+ "description": "使用 Extract 提取结构化数据的配置示例",
66
+ "stages": [
67
+ {
68
+ "type": "parse",
69
+ "provider": "textin"
70
+ },
71
+ {
72
+ "type": "extract",
73
+ "schema": {
74
+ "type": "object",
75
+ "properties": {
76
+ "title": {
77
+ "type": "string",
78
+ "description": "文档标题"
79
+ },
80
+ "author": {
81
+ "type": "string",
82
+ "description": "作者名称"
83
+ },
84
+ "date": {
85
+ "type": "string",
86
+ "description": "日期"
87
+ }
88
+ },
89
+ "required": ["title"]
90
+ },
91
+ "description": "提取文档的标题、作者和日期"
92
+ }
93
+ ]
94
+ }
95
+ }