xparse-client 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
example/run_pipeline.py CHANGED
@@ -5,6 +5,7 @@ Pipeline 运行脚本
5
5
  快速启动和运行 Pipeline 的示例
6
6
  '''
7
7
 
8
+ import json
8
9
  from datetime import datetime, timezone
9
10
  from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
10
11
 
@@ -127,14 +128,14 @@ def run_with_manual_setup():
127
128
  # prefix='',
128
129
  # region='cn-east-3'
129
130
  # )
130
- source = S3Source(
131
- endpoint='https://s3.us-east-1.amazonaws.com',
132
- access_key='AKIA6QUE3TVZADUWA4PO',
133
- secret_key='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',
134
- bucket='textin-xparse',
135
- prefix='',
136
- region='us-east-1'
137
- )
131
+ # source = S3Source(
132
+ # endpoint='https://s3.us-east-1.amazonaws.com',
133
+ # access_key='AKIA6QUE3TVZADUWA4PO',
134
+ # secret_key='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',
135
+ # bucket='textin-xparse',
136
+ # prefix='',
137
+ # region='us-east-1'
138
+ # )
138
139
  # source = S3Source(
139
140
  # endpoint='http://127.0.0.1:9000',
140
141
  # access_key='',
@@ -157,10 +158,11 @@ def run_with_manual_setup():
157
158
  # username='', # 用户名,按照实际填写
158
159
  # password='' # 密码,按照实际填写
159
160
  # )
160
- # source = LocalSource(
161
- # directory='/Users/ke_wang/Documents/doc',
162
- # pattern='*.pdf' # 支持通配符: *.pdf, *.docx, **/*.txt
163
- # )
161
+ source = LocalSource(
162
+ directory='/Users/ke_wang/Documents/doc',
163
+ recursive=False,
164
+ pattern=['*'] # 支持通配符: *.pdf, *.docx, **/*.txt
165
+ )
164
166
 
165
167
  # 创建 Milvus 目的地
166
168
  # destination = MilvusDestination(
@@ -175,7 +177,7 @@ def run_with_manual_setup():
175
177
 
176
178
  destination = MilvusDestination(
177
179
  db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
178
- collection_name='textin_test_3', # 数据库collection名称
180
+ collection_name='textin_test_3_copy', # 数据库collection名称
179
181
  dimension=1024, # 向量维度,需与 embed API 返回一致
180
182
  api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46' # Zilliz Cloud API Key
181
183
  )
@@ -193,7 +195,7 @@ def run_with_manual_setup():
193
195
  stages = [
194
196
  Stage(
195
197
  type='parse',
196
- config=ParseConfig(provider='textin-lite')
198
+ config=ParseConfig(provider='textin')
197
199
  ),
198
200
  Stage(
199
201
  type='chunk',
@@ -235,6 +237,7 @@ def run_with_manual_setup():
235
237
  )
236
238
 
237
239
  # 运行
240
+ # config = pipeline.get_config()
238
241
  pipeline.run()
239
242
 
240
243
 
@@ -86,14 +86,14 @@ def run_with_manual_setup():
86
86
  from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
87
87
 
88
88
  # 创建 S3 数据源
89
- # source = S3Source(
90
- # endpoint='https://textin-minio-api.ai.intsig.net',
91
- # access_key='IEQspf8C7fVcgmp3AZWl',
92
- # secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
93
- # bucket='textin-test',
94
- # prefix='',
95
- # region='us-east-1'
96
- # )
89
+ source = S3Source(
90
+ endpoint='https://textin-minio-api.ai.intsig.net',
91
+ access_key='IEQspf8C7fVcgmp3AZWl',
92
+ secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
93
+ bucket='textin-test',
94
+ prefix='',
95
+ region='us-east-1'
96
+ )
97
97
  source = S3Source(
98
98
  endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
99
99
  access_key='LTAI5tBgsaVfkbh9rbPyuB17',
@@ -79,6 +79,139 @@ class Pipeline:
79
79
  print(f" Pipeline Config: 中间结果保存已启用")
80
80
  print("=" * 60)
81
81
 
82
def get_config(self) -> Dict[str, Any]:
    """Return this pipeline's configuration as a plain dictionary.

    The layout follows the input format of ``create_pipeline_from_config``.
    Source/destination credentials (access keys, secret keys, passwords,
    API keys/tokens) are deliberately left out of the result, so they have
    to be filled in again before the dictionary can rebuild a pipeline.

    NOTE(review): ``api_headers`` is copied verbatim; if the headers carry
    a secret (e.g. an app secret code) it will be present in the returned
    dictionary and in any file it is dumped to -- confirm this is intended.

    Returns:
        A new dictionary with the keys ``source``, ``destination``,
        ``api_base_url``, ``api_headers``, ``stages`` and, only when
        intermediate results are enabled, ``pipeline_config``. Mutating the
        top-level containers does not affect the pipeline.
    """
    default_region = 'us-east-1'

    # --- Source -----------------------------------------------------------
    source = self.source
    source_cfg: Dict[str, Any] = {
        'type': type(source).__name__.replace('Source', '').lower()
    }
    if isinstance(source, S3Source):
        source_cfg.update({
            'endpoint': source.endpoint,
            'bucket': source.bucket,
            'prefix': source.prefix,
            'pattern': source.pattern,
            'recursive': source.recursive
        })
        # Use the object's region when it exposes one; otherwise fall back
        # to the default the previous implementation always reported.
        source_cfg['region'] = getattr(source, 'region', default_region)
    elif isinstance(source, LocalSource):
        source_cfg.update({
            'directory': str(source.directory),
            'pattern': source.pattern,
            'recursive': source.recursive
        })
    elif isinstance(source, FtpSource):
        # The password is intentionally not exported.
        source_cfg.update({
            'host': source.host,
            'port': source.port,
            'username': source.username,
            'pattern': source.pattern,
            'recursive': source.recursive
        })
    elif isinstance(source, SmbSource):
        # The password is intentionally not exported.
        source_cfg.update({
            'host': source.host,
            'share_name': source.share_name,
            'username': source.username,
            'domain': source.domain,
            'port': source.port,
            'path': source.path,
            'pattern': source.pattern,
            'recursive': source.recursive
        })

    # --- Destination ------------------------------------------------------
    destination = self.destination
    dest_type = type(destination).__name__.replace('Destination', '').lower()
    # A MilvusDestination pointing at an http(s) URL is reported as
    # 'zilliz'; any other db_path stays 'milvus'.
    if dest_type == 'milvus' and destination.db_path.startswith('http'):
        dest_type = 'zilliz'

    dest_cfg: Dict[str, Any] = {'type': dest_type}
    if isinstance(destination, MilvusDestination):
        # api_key / token are intentionally not exported.
        dest_cfg.update({
            'db_path': destination.db_path,
            'collection_name': destination.collection_name,
            'dimension': destination.dimension
        })
    elif isinstance(destination, LocalDestination):
        dest_cfg['output_dir'] = str(destination.output_dir)
    elif isinstance(destination, S3Destination):
        # access_key / secret_key are intentionally not exported.
        dest_cfg.update({
            'endpoint': destination.endpoint,
            'bucket': destination.bucket,
            'prefix': destination.prefix
        })
        dest_cfg['region'] = getattr(destination, 'region', default_region)

    # --- Stages -----------------------------------------------------------
    stage_cfgs = []
    for stage in self.stages:
        stage_config = stage.config
        if isinstance(stage_config, (ParseConfig, ChunkConfig, EmbedConfig)):
            serialized = stage_config.to_dict()
        elif isinstance(stage_config, dict):
            # Copy so that edits to the returned config cannot leak back
            # into the live pipeline.
            serialized = dict(stage_config)
        else:
            serialized = str(stage_config)
        stage_cfgs.append({'type': stage.type, 'config': serialized})

    config: Dict[str, Any] = {
        'source': source_cfg,
        'destination': dest_cfg,
        'api_base_url': self.api_base_url,
        # Tolerate missing headers instead of failing on None.
        'api_headers': dict(self.api_headers or {}),
        'stages': stage_cfgs,
    }

    # --- Pipeline config (only emitted when intermediate results are on) --
    if self.pipeline_config.include_intermediate_results:
        inter_cfg: Dict[str, Any] = {}
        inter_dest = self.pipeline_config.intermediate_results_destination
        if inter_dest:
            inter_cfg['type'] = type(inter_dest).__name__.replace('Destination', '').lower()
            if isinstance(inter_dest, LocalDestination):
                inter_cfg['output_dir'] = str(inter_dest.output_dir)
            elif isinstance(inter_dest, S3Destination):
                # access_key / secret_key are intentionally not exported.
                inter_cfg.update({
                    'endpoint': inter_dest.endpoint,
                    'bucket': inter_dest.bucket,
                    'prefix': inter_dest.prefix
                })
                inter_cfg['region'] = getattr(inter_dest, 'region', default_region)

        config['pipeline_config'] = {
            'include_intermediate_results': True,
            'intermediate_results_destination': inter_cfg
        }

    return config
214
+
82
215
  def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
83
216
  url = f"{self.api_base_url}/pipeline"
84
217
  max_retries = 3
@@ -11,7 +11,7 @@ from datetime import datetime, timezone
11
11
  from email.utils import parsedate_to_datetime
12
12
  from fnmatch import fnmatch
13
13
  from pathlib import Path
14
- from typing import List, Dict, Any, Tuple
14
+ from typing import List, Dict, Any, Tuple, Optional
15
15
 
16
16
  from smb.SMBConnection import SMBConnection
17
17
  from botocore.config import Config
@@ -20,6 +20,56 @@ from botocore.config import Config
20
20
  logger = logging.getLogger(__name__)
21
21
 
22
22
 
23
+ def _normalize_wildcard_patterns(pattern: Optional[List[str]]) -> Optional[List[str]]:
24
+ """规范化通配符模式列表
25
+
26
+ Args:
27
+ pattern: 通配符模式列表,如果为 None 或空列表则返回 None(表示匹配所有文件)
28
+
29
+ Returns:
30
+ 通配符模式列表,如果 pattern 是 None、空列表或包含 "*" 则返回 None(表示匹配所有文件)
31
+ """
32
+ if pattern is None or not pattern:
33
+ return None # None 表示匹配所有文件
34
+
35
+ if not isinstance(pattern, list):
36
+ raise ValueError(f"pattern 类型错误: {type(pattern)}")
37
+
38
+ # 过滤空字符串并去除空格
39
+ normalized = [p.strip() for p in pattern if p and p.strip()]
40
+
41
+ if not normalized:
42
+ return None
43
+
44
+ # 如果包含 "*",直接返回 None(匹配所有文件,减少后续开销)
45
+ if '*' in normalized:
46
+ return None
47
+
48
+ return normalized
49
+
50
+
51
+ def _match_file_extension(file_path: str, wildcard_patterns: Optional[List[str]]) -> bool:
52
+ """检查文件路径是否匹配通配符模式
53
+
54
+ Args:
55
+ file_path: 文件路径
56
+ wildcard_patterns: 已规范化的通配符模式列表(如 ['*.pdf', '*.docx'])
57
+
58
+ Returns:
59
+ 如果匹配返回 True,否则返回 False
60
+ """
61
+ # 如果 wildcard_patterns 是 None 或空列表,匹配所有文件
62
+ if wildcard_patterns is None:
63
+ return True
64
+
65
+ # 检查是否匹配任何一个通配符模式
66
+ for wildcard_pattern in wildcard_patterns:
67
+ if fnmatch(file_path, wildcard_pattern):
68
+ return True
69
+
70
+ return False
71
+
72
+
23
73
  def _to_millis_timestamp_string(timestamp):
24
74
  """将时间戳转换为毫秒时间戳字符串
25
75
 
@@ -62,11 +112,11 @@ class S3Source(Source):
62
112
  """S3/MinIO 数据源"""
63
113
 
64
114
  def __init__(self, endpoint: str, access_key: str, secret_key: str,
65
- bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: str = '*', recursive: bool = False):
115
+ bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: Optional[List[str]] = None, recursive: bool = False):
66
116
  self.endpoint = endpoint
67
117
  self.bucket = bucket
68
118
  self.prefix = prefix
69
- self.pattern = pattern or '*'
119
+ self.pattern = _normalize_wildcard_patterns(pattern) # 在初始化时规范化
70
120
  self.recursive = recursive
71
121
 
72
122
  if self.endpoint == 'https://textin-minio-api.ai.intsig.net':
@@ -108,7 +158,7 @@ class S3Source(Source):
108
158
  key = obj['Key']
109
159
  if key.endswith('/') or key.endswith('empty.tmp'):
110
160
  continue
111
- if fnmatch(key, self.pattern):
161
+ if _match_file_extension(key, self.pattern):
112
162
  files.append(key)
113
163
 
114
164
  # 非递归模式下,CommonPrefixes 包含子目录,我们忽略它们
@@ -156,9 +206,9 @@ class S3Source(Source):
156
206
  class LocalSource(Source):
157
207
  """本地文件系统数据源"""
158
208
 
159
- def __init__(self, directory: str, pattern: str = '*', recursive: bool = False):
209
+ def __init__(self, directory: str, pattern: Optional[List[str]] = None, recursive: bool = False):
160
210
  self.directory = Path(directory)
161
- self.pattern = pattern or '*'
211
+ self.pattern = _normalize_wildcard_patterns(pattern) # 在初始化时规范化
162
212
  self.recursive = recursive
163
213
 
164
214
  if not self.directory.exists():
@@ -168,20 +218,29 @@ class LocalSource(Source):
168
218
  logger.info(f"本地目录: {self.directory}")
169
219
 
170
220
  def list_files(self) -> List[str]:
221
+ all_files = []
222
+ # 匹配所有文件
171
223
  if self.recursive:
172
- # 递归模式:使用 rglob
173
- files = [
224
+ all_files.extend([
174
225
  str(f.relative_to(self.directory))
175
- for f in self.directory.rglob(self.pattern)
226
+ for f in self.directory.rglob('*')
176
227
  if f.is_file()
177
- ]
228
+ ])
178
229
  else:
179
- # 非递归模式:只列出根目录下的文件,使用 glob
180
- files = [
230
+ all_files.extend([
181
231
  str(f.relative_to(self.directory))
182
- for f in self.directory.glob(self.pattern)
232
+ for f in self.directory.glob('*')
183
233
  if f.is_file()
184
- ]
234
+ ])
235
+
236
+ files = []
237
+ if self.pattern is not None:
238
+ for file in all_files:
239
+ if _match_file_extension(file, self.pattern):
240
+ files.append(file)
241
+ else:
242
+ files.extend(all_files)
243
+
185
244
  print(f"✓ 本地找到 {len(files)} 个文件")
186
245
  return files
187
246
 
@@ -217,12 +276,12 @@ class LocalSource(Source):
217
276
  class FtpSource(Source):
218
277
  """FTP 数据源"""
219
278
 
220
- def __init__(self, host: str, port: int, username: str, password: str, pattern: str = '*', recursive: bool = False):
279
+ def __init__(self, host: str, port: int, username: str, password: str, pattern: Optional[List[str]] = None, recursive: bool = False):
221
280
  self.host = host
222
281
  self.port = port
223
282
  self.username = username
224
283
  self.password = password
225
- self.pattern = pattern or '*'
284
+ self.pattern = _normalize_wildcard_patterns(pattern) # 在初始化时规范化
226
285
  self.recursive = recursive
227
286
 
228
287
  self.client = ftplib.FTP()
@@ -288,12 +347,12 @@ class FtpSource(Source):
288
347
  except:
289
348
  # 不是目录,是文件
290
349
  relative_path = full_path.lstrip('/')
291
- if fnmatch(relative_path, self.pattern):
350
+ if _match_file_extension(relative_path, self.pattern):
292
351
  files.append(relative_path)
293
352
  else:
294
353
  # 是文件
295
354
  relative_path = full_path.lstrip('/')
296
- if fnmatch(relative_path, self.pattern):
355
+ if _match_file_extension(relative_path, self.pattern):
297
356
  files.append(relative_path)
298
357
 
299
358
  # 恢复原始目录
@@ -325,7 +384,7 @@ class FtpSource(Source):
325
384
  item_type = item_info.get('type', 'unknown')
326
385
  # 只添加文件,排除目录
327
386
  if item_type == 'file' or (item_type == 'unknown' and not item_info.get('type', '').startswith('dir')):
328
- if fnmatch(item_name, self.pattern):
387
+ if _match_file_extension(item_name, self.pattern):
329
388
  files.append(item_name)
330
389
  except:
331
390
  # 如果不支持 MLSD,使用 LIST 命令
@@ -341,7 +400,7 @@ class FtpSource(Source):
341
400
  continue
342
401
  is_dir = parts[0].startswith('d')
343
402
  # 只添加文件,排除目录
344
- if not is_dir and fnmatch(item_name, self.pattern):
403
+ if not is_dir and _match_file_extension(item_name, self.pattern):
345
404
  files.append(item_name)
346
405
  except:
347
406
  # 最后回退到 nlst,通过尝试切换目录来判断是否为目录
@@ -357,7 +416,7 @@ class FtpSource(Source):
357
416
  continue
358
417
  except:
359
418
  # 不能切换,说明是文件
360
- if fnmatch(item_name, self.pattern):
419
+ if _match_file_extension(item_name, self.pattern):
361
420
  files.append(item_name)
362
421
 
363
422
  # 确保回到原始目录
@@ -405,7 +464,7 @@ class SmbSource(Source):
405
464
  """SMB/CIFS 数据源"""
406
465
 
407
466
  def __init__(self, host: str, share_name: str, username: str, password: str,
408
- domain: str = '', port: int = 445, path: str = '', pattern: str = '*', recursive: bool = False):
467
+ domain: str = '', port: int = 445, path: str = '', pattern: Optional[List[str]] = None, recursive: bool = False):
409
468
  self.host = host
410
469
  self.share_name = share_name
411
470
  self.username = username
@@ -413,7 +472,7 @@ class SmbSource(Source):
413
472
  self.domain = domain
414
473
  self.port = port
415
474
  self.path = path.strip('/').strip('\\') if path else ''
416
- self.pattern = pattern or '*'
475
+ self.pattern = _normalize_wildcard_patterns(pattern) # 在初始化时规范化
417
476
  self.recursive = recursive
418
477
 
419
478
  self.conn = SMBConnection(
@@ -451,7 +510,7 @@ class SmbSource(Source):
451
510
  _list_recursive(conn, share, item_path)
452
511
  # 非递归模式:忽略子目录
453
512
  else:
454
- if fnmatch(relative_path, self.pattern):
513
+ if _match_file_extension(relative_path, self.pattern):
455
514
  files.append(relative_path)
456
515
  except Exception as e:
457
516
  logger.warning(f"列出路径失败 {current_path}: {str(e)}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -173,7 +173,7 @@ source = S3Source(
173
173
  bucket='textin',
174
174
  prefix='',
175
175
  region='us-east-1',
176
- pattern='*.pdf' # 可选,使用 Shell 通配符过滤对象
176
+ pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
177
177
  )
178
178
  ```
179
179
  请确保配置的访问凭证至少包括以下几项权限:
@@ -193,7 +193,7 @@ source = S3Source(
193
193
  bucket='textin',
194
194
  prefix='',
195
195
  region='cn-shanghai',
196
- pattern='*.pdf' # 可选,使用 Shell 通配符过滤对象
196
+ pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
197
197
  )
198
198
  ```
199
199
  请确保配置的访问凭证至少包括以下几项权限:
@@ -214,7 +214,7 @@ source = S3Source(
214
214
  bucket='textin',
215
215
  prefix='',
216
216
  region='ap-shanghai',
217
- pattern='*.pdf' # 可选,使用 Shell 通配符过滤对象
217
+ pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
218
218
  )
219
219
  ```
220
220
 
@@ -235,7 +235,7 @@ source = S3Source(
235
235
  bucket='textin',
236
236
  prefix='',
237
237
  region='cn-shanghai',
238
- pattern='*.pdf' # 可选,使用 Shell 通配符过滤对象
238
+ pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
239
239
  )
240
240
  ```
241
241
 
@@ -257,7 +257,7 @@ source = S3Source(
257
257
  bucket='textin',
258
258
  prefix='',
259
259
  region='cn-east-3',
260
- pattern='*.pdf' # 可选,使用 Shell 通配符过滤对象
260
+ pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
261
261
  )
262
262
  ```
263
263
 
@@ -279,7 +279,7 @@ source = S3Source(
279
279
  bucket='textin-xparse',
280
280
  prefix='',
281
281
  region='us-east-1',
282
- pattern='*.pdf' # 可选,使用 Shell 通配符过滤对象
282
+ pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
283
283
  )
284
284
  ```
285
285
  请确保配置的访问凭证至少包括以下几项权限:
@@ -294,7 +294,7 @@ s3:GetObject
294
294
  ```python
295
295
  source = LocalSource(
296
296
  directory='./input',
297
- pattern='*.pdf' # 支持通配符: *.pdf, *.docx, **/*.txt
297
+ pattern=['*.pdf', '*.docx'] # 通配符模式列表,可同时指定多个模式
298
298
  )
299
299
  ```
300
300
 
@@ -306,7 +306,7 @@ source = FtpSource(
306
306
  port=21,
307
307
  username='', # 用户名,按照实际填写
308
308
  password='', # 密码,按照实际填写
309
- pattern='*.pdf' # 可选,过滤指定类型文件
309
+ pattern=['*.pdf'] # 可选,通配符模式列表,过滤指定类型文件
310
310
  )
311
311
  ```
312
312
 
@@ -319,11 +319,11 @@ source = SmbSource(
319
319
  username='', # 用户名,按照实际填写
320
320
  password='', # 密码,按照实际填写
321
321
  domain='your-smb-domain',
322
- pattern='**/*.pdf' # 可选,支持多级匹配
322
+ pattern=['**/*.pdf'] # 可选,通配符模式列表,支持多级匹配
323
323
  )
324
324
  ```
325
325
 
326
- > 注 1:所有 Source 均支持 `pattern` 参数,使用 Shell 通配符(`*.pdf`、`**/*.txt` 等)来过滤需要处理的文件;默认为 `*`,即处理全部文件。
326
+ > 注 1:所有 Source 均支持 `pattern` 参数,使用通配符模式列表(如 `['*.pdf', '*.docx']`)来过滤需要处理的文件。支持多个通配符模式,如果列表中包含 `'*'` 则匹配所有文件。默认为 `None`,即处理全部文件。
327
327
 
328
328
  > 注 2:所有 Source 均支持 `recursive` 参数,表示是否递归遍历,默认为 `False`。
329
329
 
@@ -517,298 +517,357 @@ Parse 参数中有必填项`Provider`,表示文档解析服务的供应商,
517
517
 
518
518
  ## 💡 使用示例
519
519
 
520
- ### 示例 1: 使用 config 字典配置(推荐)
520
+ ### 示例 1: 手动创建 Pipeline(推荐)
521
521
 
522
522
  ```python
523
- from xparse_client import create_pipeline_from_config
523
+ from xparse_client import (
524
+ Pipeline, S3Source, MilvusDestination,
525
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
526
+ )
524
527
 
525
- # 完整的配置示例
526
- config = {
527
- # S3 数据源配置
528
- 'source': {
529
- 'type': 's3',
530
- 'endpoint': 'https://your-minio.com',
531
- 'access_key': 'your-access-key',
532
- 'secret_key': 'your-secret-key',
533
- 'bucket': 'documents',
534
- 'prefix': 'pdfs/',
535
- 'region': 'us-east-1',
536
- 'pattern': '*.pdf' # 仅处理匹配的文件
537
- },
538
-
539
- # Milvus 目的地配置
540
- 'destination': {
541
- 'type': 'milvus',
542
- 'db_path': './vectors.db',
543
- 'collection_name': 'documents',
544
- 'dimension': 1024
545
- },
546
-
547
- # API 配置
548
- 'api_base_url': 'https://api.textin.com/api/xparse',
549
- 'api_headers': {
528
+ # 创建数据源
529
+ source = S3Source(
530
+ endpoint='https://your-minio.com',
531
+ access_key='your-access-key',
532
+ secret_key='your-secret-key',
533
+ bucket='documents',
534
+ prefix='pdfs/',
535
+ region='us-east-1',
536
+ pattern=['*.pdf'], # 仅处理匹配的文件
537
+ recursive=False # 不递归子目录
538
+ )
539
+
540
+ # 创建目的地
541
+ destination = MilvusDestination(
542
+ db_path='./vectors.db',
543
+ collection_name='documents',
544
+ dimension=1024
545
+ )
546
+
547
+ # 配置处理阶段
548
+ stages = [
549
+ Stage(
550
+ type='parse',
551
+ config=ParseConfig(provider='textin')
552
+ ),
553
+ Stage(
554
+ type='chunk',
555
+ config=ChunkConfig(
556
+ strategy='by_title', # 按标题分块
557
+ include_orig_elements=False,
558
+ new_after_n_chars=512,
559
+ max_characters=1024,
560
+ overlap=50 # 块之间重叠 50 字符
561
+ )
562
+ ),
563
+ Stage(
564
+ type='embed',
565
+ config=EmbedConfig(
566
+ provider='qwen',
567
+ model_name='text-embedding-v3'
568
+ )
569
+ )
570
+ ]
571
+
572
+ # 创建并运行 Pipeline
573
+ pipeline = Pipeline(
574
+ source=source,
575
+ destination=destination,
576
+ api_base_url='https://api.textin.com/api/xparse',
577
+ api_headers={
550
578
  'x-ti-app-id': 'your-app-id',
551
579
  'x-ti-secret-code': 'your-secret-code'
552
580
  },
553
-
554
- # Stages 配置
555
- 'stages': [
556
- {
557
- 'type': 'parse',
558
- 'config': {
559
- 'provider': 'textin'
560
- }
561
- },
562
- {
563
- 'type': 'chunk',
564
- 'config': {
565
- 'strategy': 'by_title', # 按标题分块
566
- 'include_orig_elements': False,
567
- 'new_after_n_chars': 512,
568
- 'max_characters': 1024,
569
- 'overlap': 50 # 块之间重叠 50 字符
570
- }
571
- },
572
- {
573
- 'type': 'embed',
574
- 'config': {
575
- 'provider': 'qwen',
576
- 'model_name': 'text-embedding-v3'
577
- }
578
- }
579
- ]
580
- }
581
+ stages=stages
582
+ )
581
583
 
582
- # 使用配置创建并运行 pipeline
583
- pipeline = create_pipeline_from_config(config)
584
584
  pipeline.run()
585
585
  ```
586
586
 
587
- ### 示例 2: 本地到本地(测试)
587
+ ### 示例 1.1: 输出配置字典
588
+
589
+ 手动创建 Pipeline 后,可以使用 `get_config()` 方法获取配置字典:
588
590
 
589
591
  ```python
590
- from datetime import datetime, timezone
591
- from xparse_client import create_pipeline_from_config
592
+ from xparse_client import (
593
+ Pipeline, LocalSource, LocalDestination,
594
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
595
+ )
592
596
 
593
- config = {
594
- 'source': {
595
- 'type': 'local',
596
- 'directory': './test_files',
597
- 'pattern': '*.pdf'
598
- },
599
- 'destination': {
600
- 'type': 'local',
601
- 'output_dir': './test_output'
597
+ # 手动创建 Pipeline
598
+ source = LocalSource(
599
+ directory='./test_files',
600
+ pattern=['*.pdf'],
601
+ recursive=False
602
+ )
603
+
604
+ destination = LocalDestination(output_dir='./test_output')
605
+
606
+ stages = [
607
+ Stage(type='parse', config=ParseConfig(provider='textin')),
608
+ Stage(type='chunk', config=ChunkConfig(strategy='basic', max_characters=1024)),
609
+ Stage(type='embed', config=EmbedConfig(provider='qwen', model_name='text-embedding-v3'))
610
+ ]
611
+
612
+ pipeline = Pipeline(
613
+ source=source,
614
+ destination=destination,
615
+ api_base_url='https://api.textin.com/api/xparse',
616
+ api_headers={
617
+ 'x-ti-app-id': 'your-app-id',
618
+ 'x-ti-secret-code': 'your-secret-code'
602
619
  },
603
- 'api_base_url': 'https://api.textin.com/api/xparse',
604
- 'api_headers': {
620
+ stages=stages
621
+ )
622
+
623
+ # 获取配置字典(格式与 create_pipeline_from_config 的入参一致)
624
+ config_dict = pipeline.get_config()
625
+
626
+ # 可以保存为 JSON 文件
627
+ import json
628
+ with open('pipeline_config.json', 'w', encoding='utf-8') as f:
629
+ json.dump(config_dict, f, indent=2, ensure_ascii=False)
630
+
631
+ # 或者用于创建新的 Pipeline(需要补充敏感信息如 access_key, secret_key 等)
632
+ # from xparse_client import create_pipeline_from_config
633
+ # new_pipeline = create_pipeline_from_config(config_dict)
634
+ ```
635
+
636
+ ### 示例 2: 本地到本地(测试)
637
+
638
+ ```python
639
+ from xparse_client import (
640
+ Pipeline, LocalSource, LocalDestination,
641
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
642
+ )
643
+
644
+ # 创建本地数据源
645
+ source = LocalSource(
646
+ directory='./test_files',
647
+ pattern=['*.pdf'],
648
+ recursive=False
649
+ )
650
+
651
+ # 创建本地输出目的地
652
+ destination = LocalDestination(output_dir='./test_output')
653
+
654
+ # 配置处理阶段
655
+ stages = [
656
+ Stage(
657
+ type='parse',
658
+ config=ParseConfig(provider='textin')
659
+ ),
660
+ Stage(
661
+ type='chunk',
662
+ config=ChunkConfig(
663
+ strategy='basic',
664
+ max_characters=1024
665
+ )
666
+ ),
667
+ Stage(
668
+ type='embed',
669
+ config=EmbedConfig(
670
+ provider='qwen',
671
+ model_name='text-embedding-v3'
672
+ )
673
+ )
674
+ ]
675
+
676
+ # 创建并运行 Pipeline
677
+ pipeline = Pipeline(
678
+ source=source,
679
+ destination=destination,
680
+ api_base_url='https://api.textin.com/api/xparse',
681
+ api_headers={
605
682
  'x-ti-app-id': 'your-app-id',
606
683
  'x-ti-secret-code': 'your-secret-code'
607
684
  },
608
- # Stages 配置
609
- 'stages': [
610
- {
611
- 'type': 'parse',
612
- 'config': {
613
- 'provider': 'textin'
614
- }
615
- },
616
- {
617
- 'type': 'chunk',
618
- 'config': {
619
- 'strategy': 'basic',
620
- 'max_characters': 1024
621
- }
622
- },
623
- {
624
- 'type': 'embed',
625
- 'config': {
626
- 'provider': 'qwen',
627
- 'model_name': 'text-embedding-v3'
628
- }
629
- }
630
- ]
631
- }
685
+ stages=stages
686
+ )
632
687
 
633
- pipeline = create_pipeline_from_config(config)
634
688
  pipeline.run()
635
689
  ```
636
690
 
637
691
  ### 示例 3: 不同分块策略的配置
638
692
 
639
693
  ```python
640
- from xparse_client import create_pipeline_from_config
694
+ from xparse_client import (
695
+ Pipeline, S3Source, MilvusDestination,
696
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
697
+ )
698
+
699
+ # 创建数据源和目的地
700
+ source = S3Source(...)
701
+ destination = MilvusDestination(...)
641
702
 
642
703
  # 配置 1:按页面分块(适合 PDF 文档)
643
- config_by_page = {
644
- 'source': {...},
645
- 'destination': {...},
646
- 'api_base_url': 'https://api.textin.com/api/xparse',
647
- 'api_headers': {...},
648
- 'stages': [
649
- {
650
- 'type': 'parse',
651
- 'config': {
652
- 'provider': 'textin'
653
- }
654
- },
655
- {
656
- 'type': 'chunk',
657
- 'config': {
658
- 'strategy': 'by_page', # 按页面分块
659
- 'max_characters': 2048, # 增大块大小
660
- 'overlap': 100 # 页面间重叠 100 字符
661
- }
662
- },
663
- {
664
- 'type': 'embed',
665
- 'config': {
666
- 'provider': 'qwen',
667
- 'model_name': 'text-embedding-v4' # 使用更高精度的模型
668
- }
669
- }
670
- ]
671
- }
704
+ stages_by_page = [
705
+ Stage(
706
+ type='parse',
707
+ config=ParseConfig(provider='textin')
708
+ ),
709
+ Stage(
710
+ type='chunk',
711
+ config=ChunkConfig(
712
+ strategy='by_page', # 按页面分块
713
+ max_characters=2048, # 增大块大小
714
+ overlap=100 # 页面间重叠 100 字符
715
+ )
716
+ ),
717
+ Stage(
718
+ type='embed',
719
+ config=EmbedConfig(
720
+ provider='qwen',
721
+ model_name='text-embedding-v4' # 使用更高精度的模型
722
+ )
723
+ )
724
+ ]
672
725
 
673
726
  # 配置 2:按标题分块(适合结构化文档)
674
- config_by_title = {
675
- 'source': {...},
676
- 'destination': {...},
677
- 'api_base_url': 'https://api.textin.com/api/xparse',
678
- 'api_headers': {...},
679
- 'stages': [
680
- {
681
- 'type': 'parse',
682
- 'config': {
683
- 'provider': 'textin'
684
- }
685
- },
686
- {
687
- 'type': 'chunk',
688
- 'config': {
689
- 'strategy': 'by_title', # 按标题分块
690
- 'include_orig_elements': True, # 保留原始元素信息
691
- 'max_characters': 1536
692
- }
693
- },
694
- {
695
- 'type': 'embed',
696
- 'config': {
697
- 'provider': 'qwen',
698
- 'model_name': 'text-embedding-v3'
699
- }
700
- }
701
- ]
702
- }
727
+ stages_by_title = [
728
+ Stage(
729
+ type='parse',
730
+ config=ParseConfig(provider='textin')
731
+ ),
732
+ Stage(
733
+ type='chunk',
734
+ config=ChunkConfig(
735
+ strategy='by_title', # 按标题分块
736
+ include_orig_elements=True, # 保留原始元素信息
737
+ max_characters=1536
738
+ )
739
+ ),
740
+ Stage(
741
+ type='embed',
742
+ config=EmbedConfig(
743
+ provider='qwen',
744
+ model_name='text-embedding-v3'
745
+ )
746
+ )
747
+ ]
703
748
 
704
749
  # 根据文档类型选择配置
705
- pipeline = create_pipeline_from_config(config_by_page)
750
+ pipeline = Pipeline(
751
+ source=source,
752
+ destination=destination,
753
+ api_base_url='https://api.textin.com/api/xparse',
754
+ api_headers={...},
755
+ stages=stages_by_page # 或 stages_by_title
756
+ )
706
757
  pipeline.run()
707
758
  ```
708
759
 
709
760
  ### 示例 4: FTP 数据源配置
710
761
 
711
762
  ```python
712
- from xparse_client import create_pipeline_from_config
763
+ from xparse_client import (
764
+ Pipeline, FtpSource, MilvusDestination,
765
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
766
+ )
713
767
 
714
- config = {
715
- # FTP 数据源
716
- 'source': {
717
- 'type': 'ftp',
718
- 'host': 'ftp.example.com',
719
- 'port': 21,
720
- 'username': 'user',
721
- 'password': 'pass'
722
- },
723
-
724
- # Milvus 目的地
725
- 'destination': {
726
- 'type': 'milvus',
727
- 'db_path': './vectors.db',
728
- 'collection_name': 'ftp_docs',
729
- 'dimension': 1024
730
- },
731
-
732
- 'api_base_url': 'https://api.textin.com/api/xparse',
733
- 'api_headers': {
768
+ # 创建 FTP 数据源
769
+ source = FtpSource(
770
+ host='ftp.example.com',
771
+ port=21,
772
+ username='user',
773
+ password='pass',
774
+ pattern=['*.pdf'],
775
+ recursive=False
776
+ )
777
+
778
+ # 创建 Milvus 目的地
779
+ destination = MilvusDestination(
780
+ db_path='./vectors.db',
781
+ collection_name='ftp_docs',
782
+ dimension=1024
783
+ )
784
+
785
+ # 配置处理阶段
786
+ stages = [
787
+ Stage(
788
+ type='parse',
789
+ config=ParseConfig(provider='textin')
790
+ ),
791
+ Stage(
792
+ type='chunk',
793
+ config=ChunkConfig(
794
+ strategy='basic',
795
+ max_characters=1024
796
+ )
797
+ ),
798
+ Stage(
799
+ type='embed',
800
+ config=EmbedConfig(
801
+ provider='qwen',
802
+ model_name='text-embedding-v3'
803
+ )
804
+ )
805
+ ]
806
+
807
+ # 创建并运行 Pipeline
808
+ pipeline = Pipeline(
809
+ source=source,
810
+ destination=destination,
811
+ api_base_url='https://api.textin.com/api/xparse',
812
+ api_headers={
734
813
  'x-ti-app-id': 'app-id',
735
814
  'x-ti-secret-code': 'secret'
736
815
  },
737
-
738
- # Stages 配置
739
- 'stages': [
740
- {
741
- 'type': 'parse',
742
- 'config': {
743
- 'provider': 'textin'
744
- }
745
- },
746
- {
747
- 'type': 'chunk',
748
- 'config': {
749
- 'strategy': 'basic',
750
- 'max_characters': 1024
751
- }
752
- },
753
- {
754
- 'type': 'embed',
755
- 'config': {
756
- 'provider': 'qwen',
757
- 'model_name': 'text-embedding-v3'
758
- }
759
- }
760
- ]
761
- }
816
+ stages=stages
817
+ )
762
818
 
763
- pipeline = create_pipeline_from_config(config)
764
819
  pipeline.run()
765
820
  ```
766
821
 
767
822
  ### 示例 5: 获取处理统计信息
768
823
 
769
824
  ```python
770
- from xparse_client import create_pipeline_from_config
825
+ from datetime import datetime, timezone
826
+ from xparse_client import (
827
+ Pipeline, LocalSource, LocalDestination,
828
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
829
+ )
771
830
 
772
- config = {
773
- 'source': {
774
- 'type': 'local',
775
- 'directory': './docs',
776
- 'pattern': '*.pdf'
777
- },
778
- 'destination': {
779
- 'type': 'local',
780
- 'output_dir': './output'
781
- },
782
- 'api_base_url': 'https://api.textin.com/api/xparse',
783
- 'api_headers': {
831
+ # 创建 Pipeline
832
+ source = LocalSource(
833
+ directory='./docs',
834
+ pattern=['*.pdf'],
835
+ recursive=False
836
+ )
837
+
838
+ destination = LocalDestination(output_dir='./output')
839
+
840
+ stages = [
841
+ Stage(
842
+ type='parse',
843
+ config=ParseConfig(provider='textin')
844
+ ),
845
+ Stage(
846
+ type='chunk',
847
+ config=ChunkConfig(
848
+ strategy='basic',
849
+ max_characters=1024
850
+ )
851
+ ),
852
+ Stage(
853
+ type='embed',
854
+ config=EmbedConfig(
855
+ provider='qwen',
856
+ model_name='text-embedding-v3'
857
+ )
858
+ )
859
+ ]
860
+
861
+ pipeline = Pipeline(
862
+ source=source,
863
+ destination=destination,
864
+ api_base_url='https://api.textin.com/api/xparse',
865
+ api_headers={
784
866
  'x-ti-app-id': 'your-app-id',
785
867
  'x-ti-secret-code': 'your-secret-code'
786
868
  },
787
- 'stages': [
788
- {
789
- 'type': 'parse',
790
- 'config': {
791
- 'provider': 'textin'
792
- }
793
- },
794
- {
795
- 'type': 'chunk',
796
- 'config': {
797
- 'strategy': 'basic',
798
- 'max_characters': 1024
799
- }
800
- },
801
- {
802
- 'type': 'embed',
803
- 'config': {
804
- 'provider': 'qwen',
805
- 'model_name': 'text-embedding-v3'
806
- }
807
- }
808
- ]
809
- }
810
-
811
- pipeline = create_pipeline_from_config(config)
869
+ stages=stages
870
+ )
812
871
 
813
872
  # 处理单个文件并获取统计信息
814
873
  file_bytes, data_source = pipeline.source.read_file('document.pdf')
@@ -0,0 +1,13 @@
1
+ example/run_pipeline.py,sha256=xZ8TLofrK7naEwBe-tiuotcQ8yKWUES_k9iCQcIOIYo,15446
2
+ example/run_pipeline_test.py,sha256=pxsNiq_LmP6M4R7tTuja0u-Lu7fW-wIBU1uBf0-agQI,14845
3
+ xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
+ xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
+ xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
6
+ xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
7
+ xparse_client/pipeline/pipeline.py,sha256=IRTxN4YUJi9Wrm1G1ysGvcwsPsGh0inbquBH3nWYmAA,26477
8
+ xparse_client/pipeline/sources.py,sha256=D-kLrSQ-qsFFFq7JC4sL3Y3Q3Q87Wcpv9R5K85YkDjE,22144
9
+ xparse_client-0.2.9.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
+ xparse_client-0.2.9.dist-info/METADATA,sha256=Faj3fvt9Fc-EW9yFDewhpkqGVo_qSvL5N-tq1aIkkyk,28086
11
+ xparse_client-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ xparse_client-0.2.9.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
+ xparse_client-0.2.9.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- example/run_pipeline.py,sha256=ijws5q_vMmV0-bMHuFtOUMrEnxnL1LvOBCtcCD2c8zc,15366
2
- example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
- xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
- xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
- xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
6
- xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
7
- xparse_client/pipeline/pipeline.py,sha256=pHw32eo-bRegzDvkuVUu0CjMXMejJ64dDXH7esGMXjg,20379
8
- xparse_client/pipeline/sources.py,sha256=UeVbWv6n0wQkIZIBBhrFCiyydQX7cvwmkoMgcf12p9g,19940
9
- xparse_client-0.2.7.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
- xparse_client-0.2.7.dist-info/METADATA,sha256=qMHiAq2qdH4vfW5zktrYrh9Kj72JxBDERVht8KYerl0,26805
11
- xparse_client-0.2.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
- xparse_client-0.2.7.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
- xparse_client-0.2.7.dist-info/RECORD,,