xparse-client 0.2.20__py3-none-any.whl → 0.3.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +110 -20
- xparse_client/_base.py +179 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +350 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +188 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +132 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +136 -0
- xparse_client/models/pipeline.py +132 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b2.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b2.dist-info/RECORD +68 -0
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/top_level.txt +2 -0
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -163
- xparse_client/pipeline/destinations.py +0 -489
- xparse_client/pipeline/pipeline.py +0 -860
- xparse_client/pipeline/sources.py +0 -583
- xparse_client-0.2.20.dist-info/METADATA +0 -1050
- xparse_client-0.2.20.dist-info/RECORD +0 -11
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
"""示例 4: 高级批处理工作流
|
|
2
|
+
|
|
3
|
+
演示高级批处理功能和最佳实践:
|
|
4
|
+
- 从配置文件加载
|
|
5
|
+
- 自定义 Source/Destination
|
|
6
|
+
- 完整的错误处理
|
|
7
|
+
- 日志记录
|
|
8
|
+
|
|
9
|
+
运行方式:
|
|
10
|
+
export TEXTIN_APP_ID="your-app-id"
|
|
11
|
+
export TEXTIN_SECRET_CODE="your-secret-code"
|
|
12
|
+
python example/4_advanced_workflow.py
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
from datetime import datetime
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from xparse_client import XParseClient
|
|
22
|
+
from xparse_client.connectors import LocalDestination, LocalSource
|
|
23
|
+
from xparse_client.models import (
|
|
24
|
+
ChunkConfig,
|
|
25
|
+
ChunkStage,
|
|
26
|
+
EmbedConfig,
|
|
27
|
+
EmbedStage,
|
|
28
|
+
ParseConfig,
|
|
29
|
+
ParseStage,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Configure logging for the whole example script and create the module logger.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def example_1_config_file():
    """Example 4.1: build workflow stages from a JSON configuration file.

    Writes a sample configuration to ``workflow_config.json`` in the current
    working directory, reads it back, prints the source/destination settings
    and builds the matching stage objects. The temporary file is removed even
    when loading or stage construction fails.

    Raises:
        ValueError: If the configuration contains an unknown stage type.
    """
    print("\n" + "="*60)
    print("示例 4.1: 从配置文件加载配置")
    print("="*60)

    # Sample configuration that is written to disk and then loaded back.
    config = {
        "source": {
            "directory": "./data/input",
            "pattern": ["*.pdf", "*.docx", "*.txt"],
            "recursive": False
        },
        "destination": {
            "output_dir": "./data/output"
        },
        "stages": [
            {
                "type": "parse",
                "provider": "textin"
            },
            {
                "type": "chunk",
                "strategy": "by_title",
                "max_characters": 1000,
                "overlap": 50
            },
            {
                "type": "embed",
                "provider": "qwen",
                "model_name": "text-embedding-v3"
            }
        ],
        "options": {
            "on_error": "continue",
            "max_retries": 3
        }
    }

    config_file = Path("workflow_config.json")
    try:
        # Explicit UTF-8 so the round trip does not depend on the locale.
        config_file.write_text(
            json.dumps(config, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

        print(f"\n📄 配置文件已创建: {config_file}")
        print("\n配置内容:")
        print(json.dumps(config, indent=2, ensure_ascii=False))

        # Load the configuration back from disk.
        print("\n⏳ 从配置文件加载...")
        loaded_config = json.loads(config_file.read_text(encoding="utf-8"))

        # Create the client. The instance is not used further in this example.
        # NOTE(review): presumably this surfaces credential problems early;
        # confirm against XParseClient.from_env.
        XParseClient.from_env()

        # Show how source/destination would be built (without creating them).
        print("\n✅ 配置加载成功!")
        print(" - Source 配置:")
        print(f" 目录: {loaded_config['source']['directory']}")
        print(f" 模式: {loaded_config['source']['pattern']}")
        print(f" 递归: {loaded_config['source']['recursive']}")
        print(" - Destination 配置:")
        print(f" 输出目录: {loaded_config['destination']['output_dir']}")

        # Build one stage object per configured stage, preserving order.
        stages = []
        for stage_config in loaded_config["stages"]:
            stage_type = stage_config["type"]
            if stage_type == "parse":
                stages.append(ParseStage(
                    config=ParseConfig(provider=stage_config["provider"])
                ))
            elif stage_type == "chunk":
                stages.append(ChunkStage(
                    config=ChunkConfig(
                        strategy=stage_config["strategy"],
                        max_characters=stage_config.get("max_characters", 1024),
                        overlap=stage_config.get("overlap", 0)
                    )
                ))
            elif stage_type == "embed":
                stages.append(EmbedStage(
                    config=EmbedConfig(
                        provider=stage_config["provider"],
                        model_name=stage_config.get("model_name", "text-embedding-v3")
                    )
                ))
            else:
                # Fail loudly instead of silently dropping a misconfigured stage.
                raise ValueError(f"Unknown stage type: {stage_type!r}")

        print(f" - Stages: {len(stages)} 个")
        for i, stage in enumerate(stages, 1):
            print(f" {i}. {stage.type}")
    finally:
        # Always remove the temporary configuration file.
        config_file.unlink(missing_ok=True)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def example_2_custom_source():
    """Example 4.2: a custom Source that filters files by size.

    Creates three text files of different sizes in ``./test_custom_source``,
    lists them through a ``Source`` subclass that keeps only files whose size
    lies within ``[min_size, max_size]`` bytes, prints the result and removes
    the directory again, even when an error occurs.
    """
    import shutil
    from typing import Any

    from xparse_client.connectors.sources import Source

    print("\n" + "="*60)
    print("示例 4.2: 自定义 Source - 按文件大小过滤")
    print("="*60)

    class FilteredLocalSource(Source):
        """Custom Source that only handles ``.txt`` files in a size range."""

        def __init__(self, directory: str, min_size: int = 0, max_size: int = 10*1024*1024):
            # NOTE(review): the base-class initializer is not called here;
            # confirm that Source needs no initialization of its own.
            self.directory = Path(directory)
            self.min_size = min_size
            self.max_size = max_size

        def list_files(self) -> list[str]:
            """Return paths of ``.txt`` files whose size is within the limits."""
            files = []
            for f in self.directory.glob("**/*.txt"):
                # The glob also matches directories named ``*.txt``; skip them.
                if not f.is_file():
                    continue
                size = f.stat().st_size
                if self.min_size <= size <= self.max_size:
                    files.append(str(f))
            return files

        def read_file(self, file_path: str) -> tuple[bytes, dict[str, Any]]:
            """Return the file's bytes plus url/size/modified metadata."""
            path = Path(file_path)
            content = path.read_bytes()
            # One stat call so size and mtime come from the same snapshot.
            stat_result = path.stat()

            data_source = {
                "url": f"file://{path.absolute()}",
                "size": stat_result.st_size,
                "modified": stat_result.st_mtime,
            }

            return content, data_source

    test_dir = Path("./test_custom_source")

    try:
        # Fixtures are created inside the try block so that a failure here
        # still triggers the cleanup below.
        test_dir.mkdir(exist_ok=True)

        # Explicit UTF-8 keeps the byte sizes (3 / 600 / 30000) independent
        # of the platform's default encoding.
        (test_dir / "small.txt").write_text("小", encoding="utf-8")
        (test_dir / "medium.txt").write_text("中等" * 100, encoding="utf-8")
        (test_dir / "large.txt").write_text("大" * 10000, encoding="utf-8")

        print(f"\n📁 测试目录: {test_dir}")
        print(f" - small.txt: {(test_dir / 'small.txt').stat().st_size} bytes")
        print(f" - medium.txt: {(test_dir / 'medium.txt').stat().st_size} bytes")
        print(f" - large.txt: {(test_dir / 'large.txt').stat().st_size} bytes")

        # Use the custom Source (only files of 100 - 5000 bytes are kept).
        source = FilteredLocalSource(
            directory=str(test_dir),
            min_size=100,
            max_size=5000
        )

        files = source.list_files()
        print(f"\n✅ 过滤结果: {len(files)} 个文件符合条件")
        for f in files:
            print(f" - {Path(f).name}")

    finally:
        # Clean up the temporary directory.
        if test_dir.exists():
            shutil.rmtree(test_dir)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def example_3_production_workflow():
    """Example 4.3: production-style workflow with error handling and logging.

    Creates temporary source/output/log directories, attaches a file handler
    to the module logger, runs a parse -> chunk -> embed workflow over three
    generated text files and reports the result. The file handler is detached
    and closed, and all temporary directories are removed, whether or not the
    workflow succeeds.

    Raises:
        Exception: Any error raised while running the workflow is logged with
            its traceback and re-raised.
    """
    import shutil

    print("\n" + "="*60)
    print("示例 4.3: 生产环境工作流")
    print("="*60)

    client = XParseClient.from_env()

    source_dir = Path("./test_production")
    output_dir = Path("./test_production_output")
    log_dir = Path("./test_production_logs")
    temp_dirs = (source_dir, output_dir, log_dir)

    file_handler = None
    try:
        # Setup happens inside the try block so a failure here is cleaned up.
        for d in temp_dirs:
            d.mkdir(exist_ok=True)

        # Create the test documents; explicit UTF-8 because the content is
        # non-ASCII and the locale default may not be able to encode it.
        for i in range(3):
            (source_dir / f"prod_doc_{i+1}.txt").write_text(
                f"生产文档 {i+1}\n" * 50, encoding="utf-8"
            )

        # Route log records to a timestamped file as well.
        log_file = log_dir / f"workflow_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        file_handler = logging.FileHandler(log_file, encoding="utf-8")
        file_handler.setLevel(logging.INFO)
        logger.addHandler(file_handler)

        try:
            logger.info(f"开始工作流: {datetime.now()}")
            logger.info(f"源目录: {source_dir}")
            logger.info(f"输出目录: {output_dir}")

            # Progress callback: mirror progress to the log and the console.
            def on_progress(current, total, message):
                logger.info(f"进度: [{current}/{total}] {message}")
                print(f" 📊 [{current}/{total}] {message}")

            # Define the pipeline stages.
            stages = [
                ParseStage(config=ParseConfig(provider="textin")),
                ChunkStage(config=ChunkConfig(strategy="basic")),
                EmbedStage(config=EmbedConfig(provider="qwen"))
            ]

            # Run the workflow.
            print(f"\n⏳ 开始处理 (日志: {log_file})...")

            result = client.local.run_workflow(
                source=LocalSource(directory=str(source_dir)),
                destination=LocalDestination(output_dir=str(output_dir)),
                stages=stages,
                progress_callback=on_progress,
                on_error="continue",
                max_retries=2
            )

            # Record the outcome.
            logger.info(f"工作流完成: 成功 {result.success}, 失败 {result.failed}")
            logger.info(f"耗时: {result.duration:.2f} 秒")

            print("\n✅ 工作流完成!")
            print(f" - 总文件: {result.total}")
            print(f" - 成功: {result.success}")
            print(f" - 失败: {result.failed}")
            print(f" - 耗时: {result.duration:.2f} 秒")
            print(f" - 日志: {log_file}")

            if result.failed_files:
                logger.error(f"失败的文件: {len(result.failed_files)} 个")
                for failed in result.failed_files:
                    logger.error(f" - {failed.file_path}: {failed.error}")

        except Exception as e:
            logger.error(f"工作流异常: {e}", exc_info=True)
            raise

    finally:
        # Detach and close the handler first: otherwise handlers pile up on
        # repeated calls and the open log file blocks directory removal on
        # platforms that lock open files.
        if file_handler is not None:
            logger.removeHandler(file_handler)
            file_handler.close()

        # Clean up the temporary directories.
        for d in temp_dirs:
            if d.exists():
                shutil.rmtree(d)
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def main():
    """Run all advanced-workflow examples in order.

    Prints a banner, checks that both TextIn credential environment variables
    are set (returning early with a hint if not), then runs the three
    examples. Any exception raised by an example is printed together with
    its traceback instead of propagating.
    """
    import traceback

    separator = "="*60
    print("\n" + separator)
    print("xparse-client 高级批处理工作流示例")
    print(separator)

    # Both credentials must be present and non-empty.
    credentials = (os.getenv("TEXTIN_APP_ID"), os.getenv("TEXTIN_SECRET_CODE"))
    if not all(credentials):
        print("\n❌ 错误: 请先设置环境变量:")
        print(" export TEXTIN_APP_ID='your-app-id'")
        print(" export TEXTIN_SECRET_CODE='your-secret-code'")
        return

    print("\n💡 本示例展示:")
    for topic in (
        " - 从 JSON 配置文件加载设置",
        " - 自定义 Source 实现(按文件大小过滤)",
        " - 生产环境级别的错误处理和日志",
    ):
        print(topic)

    examples = (
        example_1_config_file,
        example_2_custom_source,
        example_3_production_workflow,
    )
    try:
        # Run the examples one after another; the first failure stops the run.
        for run_example in examples:
            run_example()

        print("\n" + separator)
        print("✅ 所有示例运行完成!")
        print(separator)

    except Exception as e:
        print(f"\n❌ 错误: {e}")
        traceback.print_exc()
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
if __name__ == "__main__":
|
|
327
|
+
main()
|
example/README.md
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# xparse-client 使用示例
|
|
2
|
+
|
|
3
|
+
本目录包含了 xparse-client SDK 的各种使用示例。
|
|
4
|
+
|
|
5
|
+
## 📋 示例列表
|
|
6
|
+
|
|
7
|
+
### 1. 基础 API 使用 - `1_basic_api_usage.py`
|
|
8
|
+
|
|
9
|
+
演示最基本的 API 调用:
|
|
10
|
+
- 单文件解析(Parse API)
|
|
11
|
+
- 文档提取(Extract API)
|
|
12
|
+
- Pipeline 执行(完整流程)
|
|
13
|
+
|
|
14
|
+
**适合**: 初次使用、单文件处理
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
python example/1_basic_api_usage.py
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
### 2. 服务端异步任务 - `2_async_job.py`
|
|
23
|
+
|
|
24
|
+
演示服务端异步任务的使用:
|
|
25
|
+
- 创建异步解析任务
|
|
26
|
+
- 轮询任务状态
|
|
27
|
+
- 等待任务完成
|
|
28
|
+
- 错误处理
|
|
29
|
+
|
|
30
|
+
**适合**: 处理大文件、长时间任务
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
python example/2_async_job.py
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
### 3. 本地批量处理 - `3_local_workflow.py`
|
|
39
|
+
|
|
40
|
+
演示本地批量处理工作流:
|
|
41
|
+
- 从本地目录读取文件
|
|
42
|
+
- 批量调用 Pipeline API
|
|
43
|
+
- 写入 Milvus 向量数据库
|
|
44
|
+
- 查看处理统计
|
|
45
|
+
|
|
46
|
+
**适合**: 批量导入文档到向量数据库
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
python example/3_local_workflow.py
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
### 4. 高级批处理 - `4_advanced_workflow.py`
|
|
55
|
+
|
|
56
|
+
演示高级批处理功能:
|
|
57
|
+
- 进度回调
|
|
58
|
+
- 错误处理策略(stop/continue/retry)
|
|
59
|
+
- 自定义 Source/Destination
|
|
60
|
+
- 从配置文件加载
|
|
61
|
+
|
|
62
|
+
**适合**: 生产环境、复杂场景
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
python example/4_advanced_workflow.py
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## 🔧 环境配置
|
|
71
|
+
|
|
72
|
+
所有示例都需要配置 TextIn API 凭证。
|
|
73
|
+
|
|
74
|
+
### 方式 1: 环境变量(推荐)
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
export TEXTIN_APP_ID="your-app-id"
|
|
78
|
+
export TEXTIN_SECRET_CODE="your-secret-code"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### 方式 2: 修改代码
|
|
82
|
+
|
|
83
|
+
在示例代码中直接修改:
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
client = XParseClient(
|
|
87
|
+
app_id="your-app-id",
|
|
88
|
+
secret_code="your-secret-code"
|
|
89
|
+
)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## 📝 配置文件示例
|
|
93
|
+
|
|
94
|
+
查看 `config_example.json` 了解如何从配置文件加载设置。
|
|
95
|
+
|
|
96
|
+
## 🚀 快速开始
|
|
97
|
+
|
|
98
|
+
1. 安装依赖
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install xparse-client
|
|
102
|
+
# 或安装所有可选依赖
|
|
103
|
+
pip install "xparse-client[all]"
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
2. 配置凭证
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
export TEXTIN_APP_ID="your-app-id"
|
|
110
|
+
export TEXTIN_SECRET_CODE="your-secret-code"
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
3. 运行示例
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# 从基础示例开始
|
|
117
|
+
python example/1_basic_api_usage.py
|
|
118
|
+
|
|
119
|
+
# 然后尝试批量处理
|
|
120
|
+
python example/3_local_workflow.py
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## 📚 更多资源
|
|
124
|
+
|
|
125
|
+
- [完整文档](../README.md)
|
|
126
|
+
- [API 参考](../docs/)
|
|
127
|
+
- [迁移指南](../docs/MIGRATION_FROM_PIPELINE.md)
|
|
128
|
+
- [GitHub Issues](https://github.com/textin/xparse-client/issues)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "xparse-client 工作流配置示例",
|
|
3
|
+
"source": {
|
|
4
|
+
"type": "local",
|
|
5
|
+
"directory": "./data/documents",
|
|
6
|
+
"pattern": ["*.pdf", "*.docx", "*.txt"],
|
|
7
|
+
"recursive": true,
|
|
8
|
+
"description": "从本地目录读取文档,支持多种文件类型"
|
|
9
|
+
},
|
|
10
|
+
"destination": {
|
|
11
|
+
"type": "local",
|
|
12
|
+
"output_dir": "./data/output",
|
|
13
|
+
"description": "输出到本地 JSON 文件"
|
|
14
|
+
},
|
|
15
|
+
"stages": [
|
|
16
|
+
{
|
|
17
|
+
"type": "parse",
|
|
18
|
+
"provider": "textin",
|
|
19
|
+
"description": "使用 TextIn 解析文档"
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"type": "chunk",
|
|
23
|
+
"strategy": "by_title",
|
|
24
|
+
"max_characters": 1000,
|
|
25
|
+
"overlap": 50,
|
|
26
|
+
"description": "按标题分块,每块最多 1000 字符,重叠 50 字符"
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"type": "embed",
|
|
30
|
+
"provider": "qwen",
|
|
31
|
+
"model_name": "text-embedding-v3",
|
|
32
|
+
"description": "使用通义千问生成向量"
|
|
33
|
+
}
|
|
34
|
+
],
|
|
35
|
+
"options": {
|
|
36
|
+
"on_error": "continue",
|
|
37
|
+
"max_retries": 3,
|
|
38
|
+
"description": "遇到错误继续处理,失败重试 3 次"
|
|
39
|
+
},
|
|
40
|
+
"milvus_example": {
|
|
41
|
+
"description": "使用 Milvus 作为目的地的配置示例",
|
|
42
|
+
"destination": {
|
|
43
|
+
"type": "milvus",
|
|
44
|
+
"db_path": "./vectors.db",
|
|
45
|
+
"collection_name": "documents",
|
|
46
|
+
"dimension": 1024,
|
|
47
|
+
"description": "写入本地 Milvus 数据库"
|
|
48
|
+
}
|
|
49
|
+
},
|
|
50
|
+
"s3_example": {
|
|
51
|
+
"description": "使用 S3 作为数据源的配置示例",
|
|
52
|
+
"source": {
|
|
53
|
+
"type": "s3",
|
|
54
|
+
"endpoint": "https://s3.amazonaws.com",
|
|
55
|
+
"access_key": "your-access-key",
|
|
56
|
+
"secret_key": "your-secret-key",
|
|
57
|
+
"bucket": "your-bucket",
|
|
58
|
+
"prefix": "documents/",
|
|
59
|
+
"region": "us-east-1",
|
|
60
|
+
"pattern": ["*.pdf"],
|
|
61
|
+
"description": "从 S3 读取 PDF 文件"
|
|
62
|
+
}
|
|
63
|
+
},
|
|
64
|
+
"extract_example": {
|
|
65
|
+
"description": "使用 Extract 提取结构化数据的配置示例",
|
|
66
|
+
"stages": [
|
|
67
|
+
{
|
|
68
|
+
"type": "parse",
|
|
69
|
+
"provider": "textin"
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
"type": "extract",
|
|
73
|
+
"schema": {
|
|
74
|
+
"type": "object",
|
|
75
|
+
"properties": {
|
|
76
|
+
"title": {
|
|
77
|
+
"type": "string",
|
|
78
|
+
"description": "文档标题"
|
|
79
|
+
},
|
|
80
|
+
"author": {
|
|
81
|
+
"type": "string",
|
|
82
|
+
"description": "作者名称"
|
|
83
|
+
},
|
|
84
|
+
"date": {
|
|
85
|
+
"type": "string",
|
|
86
|
+
"description": "日期"
|
|
87
|
+
}
|
|
88
|
+
},
|
|
89
|
+
"required": ["title"]
|
|
90
|
+
},
|
|
91
|
+
"description": "提取文档的标题、作者和日期"
|
|
92
|
+
}
|
|
93
|
+
]
|
|
94
|
+
}
|
|
95
|
+
}
|