xparse-client 0.2.11__py3-none-any.whl → 0.3.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +215 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b3.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +1 -0
  69. example/run_pipeline.py +0 -506
  70. example/run_pipeline_test.py +0 -458
  71. xparse_client/pipeline/__init__.py +0 -3
  72. xparse_client/pipeline/config.py +0 -129
  73. xparse_client/pipeline/destinations.py +0 -487
  74. xparse_client/pipeline/pipeline.py +0 -622
  75. xparse_client/pipeline/sources.py +0 -585
  76. xparse_client-0.2.11.dist-info/METADATA +0 -1050
  77. xparse_client-0.2.11.dist-info/RECORD +0 -13
@@ -1,622 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- encoding: utf-8 -*-
3
-
4
- import json
5
- import logging
6
- import time
7
- from datetime import datetime, timezone
8
- from pathlib import Path
9
- from typing import Dict, Any, Optional, Tuple, List, Union
10
-
11
- import requests
12
-
13
- from .config import ParseConfig, ChunkConfig, EmbedConfig, Stage, PipelineStats, PipelineConfig
14
- from .sources import Source, S3Source, LocalSource, FtpSource, SmbSource
15
- from .destinations import Destination, MilvusDestination, QdrantDestination, LocalDestination, S3Destination
16
-
17
-
18
- logger = logging.getLogger(__name__)
19
-
20
-
21
class Pipeline:
    """Data-processing pipeline.

    Reads files from a ``Source``, sends each one to the remote ``/pipeline``
    HTTP endpoint together with the configured stages, and writes the returned
    elements to a ``Destination``. Intermediate per-stage results can
    optionally be written to a second destination.
    """

    # Number of attempts made for one pipeline API call.
    MAX_RETRIES = 3
    # Pause between two attempts of the same API call, in seconds.
    RETRY_DELAY_SECONDS = 2
    # Timeout handed to ``requests.post``, in seconds.
    REQUEST_TIMEOUT_SECONDS = 120
    # Pause between two consecutive files in ``run``, in seconds.
    FILE_INTERVAL_SECONDS = 1

    def __init__(
        self,
        source: "Source",
        destination: "Destination",
        api_base_url: str = 'http://localhost:8000/api/xparse',
        api_headers: Optional[Dict[str, str]] = None,
        stages: Optional[List["Stage"]] = None,
        pipeline_config: Optional["PipelineConfig"] = None,
        intermediate_results_destination: Optional["Destination"] = None
    ):
        """Validate the configuration and print a short summary.

        Args:
            source: Where files are listed and read from.
            destination: Where the final elements are written.
            api_base_url: Base URL of the API; a trailing ``/`` is stripped.
            api_headers: Extra HTTP headers sent with every request.
            stages: Ordered stage list; required, and the first stage must be
                of type ``'parse'``.
            pipeline_config: Optional pipeline-level settings; a default
                ``PipelineConfig()`` is created when omitted.
            intermediate_results_destination: When given, takes precedence
                over ``pipeline_config`` and switches intermediate-result
                saving on (the passed ``pipeline_config`` object is updated
                in place).

        Raises:
            ValueError: If ``stages`` is missing, empty or does not start with
                a parse stage, or if intermediate results are enabled without
                a valid ``Destination``.
        """
        self.source = source
        self.destination = destination
        self.api_base_url = api_base_url.rstrip('/')
        self.api_headers = api_headers or {}
        self.pipeline_config = pipeline_config or PipelineConfig()

        # An explicitly passed intermediate destination wins and implicitly
        # enables intermediate-result saving; otherwise fall back to whatever
        # pipeline_config already carries.
        if intermediate_results_destination is not None:
            self.pipeline_config.include_intermediate_results = True
            self.pipeline_config.intermediate_results_destination = intermediate_results_destination
        elif self.pipeline_config.include_intermediate_results:
            if not self.pipeline_config.intermediate_results_destination:
                raise ValueError("当 include_intermediate_results 为 True 时,必须设置 intermediate_results_destination")

        if stages is None:
            raise ValueError("必须提供 stages 参数")

        self.stages = stages

        if not self.stages or self.stages[0].type != 'parse':
            raise ValueError("stages 必须包含且第一个必须是 'parse' 类型")

        # Let embed configurations validate themselves up front.
        for stage in self.stages:
            if stage.type == 'embed' and isinstance(stage.config, EmbedConfig):
                stage.config.validate()

        # Always define the attribute so reading it never raises
        # AttributeError when intermediate results are disabled.
        self.intermediate_results_destination = None
        if self.pipeline_config.include_intermediate_results:
            if not isinstance(self.pipeline_config.intermediate_results_destination, Destination):
                raise ValueError("intermediate_results_destination 必须是 Destination 类型")
            self.intermediate_results_destination = self.pipeline_config.intermediate_results_destination

        print("=" * 60)
        print("Pipeline 初始化完成")
        print(f" Stages: {[s.type for s in self.stages]}")
        for stage in self.stages:
            print(f" - {stage.type}: {stage.config}")
        if self.pipeline_config.include_intermediate_results:
            print(" Pipeline Config: 中间结果保存已启用")
        print("=" * 60)

    def get_config(self) -> Dict[str, Any]:
        """Return the pipeline configuration as a plain dictionary.

        The layout mirrors the input of ``create_pipeline_from_config``.
        Credentials (access keys, passwords, API keys, tokens) are not
        included, and S3 regions are reported as the default ``'us-east-1'``,
        because this method does not read them back from the connector
        objects.
        """
        config: Dict[str, Any] = {}

        # --- Source ---
        source = self.source
        source_type = type(source).__name__.replace('Source', '').lower()
        config['source'] = {'type': source_type}

        if isinstance(source, S3Source):
            config['source'].update({
                'endpoint': source.endpoint,
                'bucket': source.bucket,
                'prefix': source.prefix,
                'pattern': source.pattern,
                'recursive': source.recursive
            })
            # Region is not read back from the source; report the default.
            config['source']['region'] = 'us-east-1'
        elif isinstance(source, LocalSource):
            config['source'].update({
                'directory': str(source.directory),
                'pattern': source.pattern,
                'recursive': source.recursive
            })
        elif isinstance(source, FtpSource):
            config['source'].update({
                'host': source.host,
                'port': source.port,
                'username': source.username,
                'pattern': source.pattern,
                'recursive': source.recursive
            })
        elif isinstance(source, SmbSource):
            config['source'].update({
                'host': source.host,
                'share_name': source.share_name,
                'username': source.username,
                'domain': source.domain,
                'port': source.port,
                'path': source.path,
                'pattern': source.pattern,
                'recursive': source.recursive
            })

        # --- Destination ---
        destination = self.destination
        dest_type = type(destination).__name__.replace('Destination', '').lower()
        if dest_type == 'milvus':
            # The same class serves local Milvus and Zilliz; an http(s)
            # db_path is reported as Zilliz.
            dest_type = 'zilliz' if destination.db_path.startswith('http') else 'milvus'

        config['destination'] = {'type': dest_type}

        if isinstance(destination, MilvusDestination):
            config['destination'].update({
                'db_path': destination.db_path,
                'collection_name': destination.collection_name,
                'dimension': destination.dimension
            })
        elif isinstance(destination, QdrantDestination):
            config['destination'].update({
                'url': destination.url,
                'collection_name': destination.collection_name,
                'dimension': destination.dimension,
                'prefer_grpc': getattr(destination, 'prefer_grpc', False)
            })
        elif isinstance(destination, LocalDestination):
            config['destination'].update({
                'output_dir': str(destination.output_dir)
            })
        elif isinstance(destination, S3Destination):
            config['destination'].update({
                'endpoint': destination.endpoint,
                'bucket': destination.bucket,
                'prefix': destination.prefix
            })
            # Region is not read back from the destination; report the default.
            config['destination']['region'] = 'us-east-1'

        # --- API ---
        config['api_base_url'] = self.api_base_url
        # Shallow copy so callers cannot mutate the pipeline's own headers.
        config['api_headers'] = dict(self.api_headers)

        # --- Stages ---
        config['stages'] = []
        for stage in self.stages:
            if isinstance(stage.config, (ParseConfig, ChunkConfig, EmbedConfig)):
                stage_config = stage.config.to_dict()
            elif isinstance(stage.config, dict):
                stage_config = stage.config
            else:
                # Unknown config object: fall back to its string form.
                stage_config = str(stage.config)
            config['stages'].append({'type': stage.type, 'config': stage_config})

        # --- Pipeline-level config (only emitted when enabled) ---
        if self.pipeline_config.include_intermediate_results:
            inter_config: Dict[str, Any] = {}
            config['pipeline_config'] = {
                'include_intermediate_results': True,
                'intermediate_results_destination': inter_config
            }

            inter_dest = self.pipeline_config.intermediate_results_destination
            if inter_dest:
                inter_config['type'] = type(inter_dest).__name__.replace('Destination', '').lower()

                if isinstance(inter_dest, LocalDestination):
                    inter_config['output_dir'] = str(inter_dest.output_dir)
                elif isinstance(inter_dest, S3Destination):
                    inter_config.update({
                        'endpoint': inter_dest.endpoint,
                        'bucket': inter_dest.bucket,
                        'prefix': inter_dest.prefix
                    })
                    # Region is not read back; report the default.
                    inter_config['region'] = 'us-east-1'

        return config

    def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """POST one file to ``<api_base_url>/pipeline`` and return its ``data``.

        Non-200 HTTP responses and raised exceptions are retried up to
        ``MAX_RETRIES`` times with ``RETRY_DELAY_SECONDS`` between attempts.
        An HTTP 200 response whose business ``code`` is not 200 is treated as
        final and is not retried.

        Returns:
            The ``data`` member of the response body, or ``None`` on failure.
        """
        url = f"{self.api_base_url}/pipeline"
        max_retries = self.MAX_RETRIES

        for try_count in range(max_retries):
            try:
                files = {'file': (filename or 'file', file_bytes)}
                form_data = {
                    'stages': json.dumps([stage.to_dict() for stage in self.stages]),
                    'data_source': json.dumps(data_source, ensure_ascii=False),
                }

                # The pipeline-level config is sent whenever a config object
                # exists, not only when intermediate results are enabled.
                if self.pipeline_config:
                    form_data['config'] = json.dumps(self.pipeline_config.to_dict(), ensure_ascii=False)

                response = requests.post(
                    url,
                    files=files,
                    data=form_data,
                    headers=self.api_headers,
                    timeout=self.REQUEST_TIMEOUT_SECONDS
                )

                if response.status_code == 200:
                    result = response.json()
                    x_request_id = result.get('x_request_id', '')
                    print(f" ✓ Pipeline 接口返回 x_request_id: {x_request_id}")
                    if result.get('code') == 200 and 'data' in result:
                        return result.get('data')
                    # Business-level failure: report it and give up.
                    error_msg = result.get('message', result.get('msg', '未知错误'))
                    detail = f"Pipeline 接口返回错误: code={result.get('code')}, message={error_msg}, x_request_id={x_request_id}"
                    print(f" ✗ {detail}")
                    logger.error(detail)
                    return None

                # HTTP-level failure: try to pull details out of the body.
                x_request_id = ''
                error_msg = ''
                try:
                    result = response.json()
                    x_request_id = result.get('x_request_id', '')
                    error_msg = result.get('message', result.get('msg', response.text[:200]))
                except (ValueError, AttributeError):
                    # Body is not JSON (ValueError) or not a JSON object
                    # (AttributeError on .get); fall back to the raw text.
                    error_msg = response.text[:200] if response.text else f'HTTP {response.status_code}'

                detail = f"API 错误 {response.status_code}: {error_msg}, x_request_id={x_request_id}, 重试 {try_count + 1}/{max_retries}"
                print(f" ✗ {detail}")
                logger.warning(detail)

            except Exception as e:
                # requests exceptions may carry a response; use it, when
                # present, to enrich the error report.
                x_request_id = ''
                error_msg = str(e)
                err_response = getattr(e, 'response', None)
                if err_response is not None:
                    try:
                        result = err_response.json()
                        x_request_id = result.get('x_request_id', '')
                        error_msg = result.get('message', result.get('msg', error_msg))
                    except Exception:
                        # Best effort only: keep the plain exception text.
                        pass

                print(f" ✗ 请求异常: {error_msg}, x_request_id={x_request_id}, 重试 {try_count + 1}/{max_retries}")
                logger.error(f"API 请求异常 pipeline: {error_msg}, x_request_id={x_request_id}")

            # No pause after the final attempt.
            if try_count < max_retries - 1:
                time.sleep(self.RETRY_DELAY_SECONDS)

        return None

    def process_with_pipeline(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Tuple[List[Dict[str, Any]], "PipelineStats"]]:
        """Run one file through the pipeline API.

        Returns:
            ``(elements, stats)`` when the response contains both
            ``elements`` and ``stats``; ``None`` otherwise.
        """
        print(f" → 调用 Pipeline 接口: {filename}")
        result = self._call_pipeline_api(file_bytes, filename, data_source)

        if not (result and 'elements' in result and 'stats' in result):
            print(" ✗ Pipeline 失败")
            logger.error(f"Pipeline 失败: {filename}")
            return None

        elements = result['elements']
        stats_data = result['stats']

        stats = PipelineStats(
            original_elements=stats_data.get('original_elements', 0),
            chunked_elements=stats_data.get('chunked_elements', 0),
            embedded_elements=stats_data.get('embedded_elements', 0),
            stages=self.stages,  # the stages that were actually submitted
            record_id=stats_data.get('record_id')  # taken from the API response
        )

        # Persist per-stage results when enabled and returned by the API.
        if self.pipeline_config.include_intermediate_results and 'intermediate_results' in result:
            self._save_intermediate_results(result['intermediate_results'], filename, data_source)

        print(" ✓ Pipeline 完成:")
        print(f" - 原始元素: {stats.original_elements}")
        print(f" - 分块后: {stats.chunked_elements}")
        print(f" - 向量化: {stats.embedded_elements}")
        logger.info(f"Pipeline 完成: {filename}, {stats.embedded_elements} 个向量")

        return elements, stats

    def _save_intermediate_results(self, intermediate_results: List[Dict[str, Any]], filename: str, data_source: Dict[str, Any]) -> None:
        """Write per-stage intermediate results to the configured destination.

        Failures are reported and logged but never raised, so a broken
        intermediate destination does not fail the file itself. The first
        failure stops the remaining items from being written.

        Args:
            intermediate_results: List of items, each expected to carry a
                ``stage`` name and an ``elements`` list.
            filename: Name of the processed file.
            data_source: Data-source information attached to the metadata.
        """
        try:
            for result_item in intermediate_results:
                if 'stage' not in result_item or 'elements' not in result_item:
                    logger.warning(f"中间结果项缺少 stage 或 elements 字段: {result_item}")
                    continue

                stage = result_item['stage']
                elements = result_item['elements']

                metadata = {
                    'filename': filename,
                    'stage': stage,
                    'total_elements': len(elements),
                    # Naive local time, unlike the UTC millisecond stamps
                    # produced by process_file.
                    'processed_at': datetime.now().isoformat(),
                    'data_source': data_source
                }

                self.pipeline_config.intermediate_results_destination.write(elements, metadata)
                print(f" ✓ 保存 {stage.upper()} 中间结果: {len(elements)} 个元素")
                logger.info(f"保存 {stage.upper()} 中间结果成功: {filename}")

        except Exception as e:
            print(f" ✗ 保存中间结果失败: {e}")
            logger.error(f"保存中间结果失败: {filename}, {e}")

    def process_file(self, file_path: str) -> bool:
        """Read, process and store a single file.

        Returns:
            ``True`` when the destination write reports success; ``False`` on
            any failure, including exceptions, which are caught and logged.
        """
        print(f"\n{'=' * 60}")
        print(f"处理文件: {file_path}")
        logger.info(f"开始处理文件: {file_path}")

        try:
            print(" → 读取文件...")
            file_bytes, data_source = self.source.read_file(file_path)
            data_source = data_source or {}
            # Processing time as a UTC millisecond timestamp string.
            timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
            data_source['date_processed'] = str(timestamp_ms)
            print(f" ✓ 文件读取完成: {len(file_bytes)} bytes")

            result = self.process_with_pipeline(file_bytes, file_path, data_source)
            if not result:
                return False

            embedded_data, stats = result

            print(" → 写入目的地...")
            metadata = {
                'filename': file_path,
                'processed_at': str(timestamp_ms),
            }

            # Forward the record id reported by the API, if any.
            if stats.record_id:
                metadata['record_id'] = stats.record_id

            success = self.destination.write(embedded_data, metadata)

            if success:
                print(f"\n✓✓✓ 文件处理成功: {file_path}")
                logger.info(f"文件处理成功: {file_path}")
            else:
                print(f"\n✗✗✗ 文件处理失败: {file_path}")
                logger.error(f"文件处理失败: {file_path}")

            return success

        except Exception as e:
            print(f"\n✗✗✗ 处理异常: {e}")
            logger.error(f"处理文件异常 {file_path}: {e}")
            return False

    def run(self):
        """Process every file listed by the source and print a summary.

        Files are handled one after another with a pause of
        ``FILE_INTERVAL_SECONDS`` between them. A failing file is counted and
        does not stop the run.
        """
        start_time = time.time()

        print("\n" + "=" * 60)
        print("开始执行 Pipeline")
        print("=" * 60)
        logger.info("=" * 60)
        logger.info("开始执行 Pipeline")

        print("\n→ 列出文件...")
        files = self.source.list_files()

        if not files:
            print("\n✗ 没有找到文件")
            logger.info("没有找到文件")
            return

        total = len(files)
        success_count = 0
        fail_count = 0

        for idx, file_path in enumerate(files, 1):
            print(f"\n进度: [{idx}/{total}]")
            logger.info(f"进度: [{idx}/{total}] - {file_path}")

            try:
                if self.process_file(file_path):
                    success_count += 1
                else:
                    fail_count += 1
            except Exception as e:
                print(f"\n✗✗✗ 文件处理异常: {e}")
                logger.error(f"文件处理异常 {file_path}: {e}")
                fail_count += 1

            # No pause after the last file.
            if idx < total:
                time.sleep(self.FILE_INTERVAL_SECONDS)

        elapsed = time.time() - start_time
        print("\n" + "=" * 60)
        print("Pipeline 执行完成!")
        print("=" * 60)
        print(f"总文件数: {total}")
        print(f"成功: {success_count}")
        print(f"失败: {fail_count}")
        print(f"总耗时: {elapsed:.2f} 秒")
        print("=" * 60)

        logger.info("=" * 60)
        logger.info(f"Pipeline 完成 - 总数:{total}, 成功:{success_count}, 失败:{fail_count}, 耗时:{elapsed:.2f}秒")
        logger.info("=" * 60)
460
-
461
-
462
def _build_s3_destination(cfg: Dict[str, Any]) -> "S3Destination":
    """Create an ``S3Destination`` from its configuration mapping."""
    return S3Destination(
        endpoint=cfg['endpoint'],
        access_key=cfg['access_key'],
        secret_key=cfg['secret_key'],
        bucket=cfg['bucket'],
        prefix=cfg.get('prefix', ''),
        region=cfg.get('region', 'us-east-1')
    )


def _build_source(source_config: Dict[str, Any]) -> "Source":
    """Create the source connector selected by ``source_config['type']``."""
    source_type = source_config['type']
    if source_type == 's3':
        return S3Source(
            endpoint=source_config['endpoint'],
            access_key=source_config['access_key'],
            secret_key=source_config['secret_key'],
            bucket=source_config['bucket'],
            prefix=source_config.get('prefix', ''),
            region=source_config.get('region', 'us-east-1'),
            pattern=source_config.get('pattern', '*'),
            recursive=source_config.get('recursive', False)
        )
    if source_type == 'local':
        return LocalSource(
            directory=source_config['directory'],
            pattern=source_config.get('pattern', '*'),
            recursive=source_config.get('recursive', False)
        )
    if source_type == 'ftp':
        return FtpSource(
            host=source_config['host'],
            port=source_config['port'],
            username=source_config['username'],
            password=source_config['password'],
            pattern=source_config.get('pattern', '*'),
            recursive=source_config.get('recursive', False)
        )
    if source_type == 'smb':
        return SmbSource(
            host=source_config['host'],
            share_name=source_config['share_name'],
            username=source_config['username'],
            password=source_config['password'],
            domain=source_config.get('domain', ''),
            port=source_config.get('port', 445),
            path=source_config.get('path', ''),
            pattern=source_config.get('pattern', '*'),
            recursive=source_config.get('recursive', False)
        )
    raise ValueError(f"未知的 source 类型: {source_type}")


def _build_destination(dest_config: Dict[str, Any]) -> "Destination":
    """Create the destination connector selected by ``dest_config['type']``."""
    dest_type = dest_config['type']
    if dest_type in ('milvus', 'zilliz'):
        return MilvusDestination(
            db_path=dest_config['db_path'],
            collection_name=dest_config['collection_name'],
            dimension=dest_config['dimension'],
            api_key=dest_config.get('api_key'),
            token=dest_config.get('token')
        )
    if dest_type == 'qdrant':
        return QdrantDestination(
            url=dest_config['url'],
            collection_name=dest_config['collection_name'],
            dimension=dest_config['dimension'],
            api_key=dest_config.get('api_key'),
            prefer_grpc=dest_config.get('prefer_grpc', False)
        )
    if dest_type == 'local':
        return LocalDestination(output_dir=dest_config['output_dir'])
    if dest_type == 's3':
        return _build_s3_destination(dest_config)
    raise ValueError(f"未知的 destination 类型: {dest_type}")


def _build_stage(stage_cfg: Dict[str, Any]) -> "Stage":
    """Create one ``Stage`` (parse, chunk or embed) from its mapping."""
    stage_type = stage_cfg.get('type')
    # ``or {}`` also covers an explicit ``"config": null`` entry.
    options = stage_cfg.get('config') or {}

    if stage_type == 'parse':
        parse_options = dict(options)
        provider = parse_options.pop('provider', 'textin')
        stage_config = ParseConfig(provider=provider, **parse_options)
    elif stage_type == 'chunk':
        stage_config = ChunkConfig(
            strategy=options.get('strategy', 'basic'),
            include_orig_elements=options.get('include_orig_elements', False),
            new_after_n_chars=options.get('new_after_n_chars', 512),
            max_characters=options.get('max_characters', 1024),
            overlap=options.get('overlap', 0),
            overlap_all=options.get('overlap_all', False)
        )
    elif stage_type == 'embed':
        stage_config = EmbedConfig(
            provider=options.get('provider', 'qwen'),
            model_name=options.get('model_name', 'text-embedding-v3')
        )
    else:
        raise ValueError(f"未知的 stage 类型: {stage_type}")

    return Stage(type=stage_type, config=stage_config)


def _build_pipeline_config(pipeline_cfg: Dict[str, Any]) -> "PipelineConfig":
    """Create the ``PipelineConfig``, including its intermediate destination."""
    include_intermediate_results = pipeline_cfg.get('include_intermediate_results', False)
    intermediate_results_destination = None

    if include_intermediate_results:
        dest_cfg = pipeline_cfg.get('intermediate_results_destination')
        # A missing key and an explicit null are both "not configured".
        if dest_cfg is None:
            raise ValueError("当 include_intermediate_results 为 True 时,必须设置 intermediate_results_destination")

        dest_type = dest_cfg.get('type')
        if dest_type == 'local':
            intermediate_results_destination = LocalDestination(
                output_dir=dest_cfg['output_dir']
            )
        elif dest_type == 's3':
            intermediate_results_destination = _build_s3_destination(dest_cfg)
        else:
            raise ValueError(f"不支持的 intermediate_results_destination 类型: '{dest_type}',支持的类型: 'local', 's3'")

    return PipelineConfig(
        include_intermediate_results=include_intermediate_results,
        intermediate_results_destination=intermediate_results_destination
    )


def create_pipeline_from_config(config: Dict[str, Any]) -> "Pipeline":
    """Build a ``Pipeline`` from a plain configuration dictionary.

    Args:
        config: Mapping with the required keys ``source``, ``destination``
            and ``stages``, and the optional keys ``api_base_url``,
            ``api_headers`` and ``pipeline_config``.

    Returns:
        The configured ``Pipeline``.

    Raises:
        KeyError: If a required key is missing from a connector section.
        ValueError: If a source, destination, stage or intermediate
            destination type is unknown, if ``stages`` is missing or empty,
            or if intermediate results are enabled without a destination.
    """
    # Connectors are created first, in the same order as before, so any error
    # they raise still takes precedence over stage validation.
    source = _build_source(config['source'])
    destination = _build_destination(config['destination'])

    if 'stages' not in config or not config['stages']:
        raise ValueError("配置中必须包含 'stages' 字段")

    stages = [_build_stage(stage_cfg) for stage_cfg in config['stages']]

    pipeline_cfg = config.get('pipeline_config')
    pipeline_config = _build_pipeline_config(pipeline_cfg) if pipeline_cfg else None

    return Pipeline(
        source=source,
        destination=destination,
        api_base_url=config.get('api_base_url', 'http://localhost:8000/api/xparse'),
        api_headers=config.get('api_headers', {}),
        stages=stages,
        pipeline_config=pipeline_config
    )
616
-
617
-
618
- __all__ = [
619
- 'Pipeline',
620
- 'create_pipeline_from_config',
621
- ]
622
-