xparse-client 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
example/run_pipeline.py CHANGED
@@ -84,7 +84,7 @@ def run_with_config():
 
 def run_with_manual_setup():
     """Manually create the Source, Destination, and Pipeline"""
-    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage, PipelineConfig, LocalDestination
+    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage, PipelineConfig, LocalDestination, QdrantDestination
 
     # Create the S3 data source
     # source = S3Source(
@@ -132,7 +132,7 @@ def run_with_manual_setup():
         endpoint='https://s3.us-east-1.amazonaws.com',
         access_key='AKIA6QUE3TVZADUWA4PO',
         secret_key='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',
-        bucket='textin-xparse',
+        bucket='textin-test',
         prefix='',
         region='us-east-1'
     )
@@ -160,7 +160,8 @@ def run_with_manual_setup():
     # )
     # source = LocalSource(
     #     directory='/Users/ke_wang/Documents/doc',
-    #     pattern='*.pdf'  # Supports wildcards: *.pdf, *.docx, **/*.txt
+    #     recursive=True,
+    #     pattern=['**/*.png']  # Supports wildcards: *.pdf, *.docx, **/*.txt
     # )
 
     # Create the Milvus destination
@@ -174,12 +175,12 @@ def run_with_manual_setup():
    #     output_dir='./result'
    # )
 
-    destination = MilvusDestination(
-        db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn',  # Zilliz connection URL
-        collection_name='textin_test_3_copy',  # collection name in the database
-        dimension=1024,  # vector dimension; must match what the embed API returns
-        api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46'  # Zilliz Cloud API Key
-    )
+    # destination = MilvusDestination(
+    #     db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn',  # Zilliz connection URL
+    #     collection_name='textin_test_3_copy',  # collection name in the database
+    #     dimension=1024,  # vector dimension; must match what the embed API returns
+    #     api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46'  # Zilliz Cloud API Key
+    # )
 
    # destination = S3Destination(
    #     endpoint='https://cos.ap-shanghai.myqcloud.com',
@@ -189,12 +190,19 @@ def run_with_manual_setup():
    #     prefix='result',
    #     region='ap-shanghai'
    # )
+
+    destination = QdrantDestination(
+        url='https://1325db22-7dd8-4fc9-930b-f969d4963b3d.us-east-1-1.aws.cloud.qdrant.io:6333',
+        collection_name='textin1',
+        dimension=1024,
+        api_key='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.TGnFB1pAD7c7IqSOvTpgCPpHXSnnoKhWEQ5pQ8DrBnI',
+    )
 
    # Create the config using the new stages format
    stages = [
        Stage(
            type='parse',
-            config=ParseConfig(provider='textin-lite')
+            config=ParseConfig(provider='paddle')
        ),
        Stage(
            type='chunk',
@@ -236,8 +244,8 @@ def run_with_manual_setup():
    )
 
    # Run
-    config = pipeline.get_config()
-    print(json.dumps(config, ensure_ascii=False, indent=2))
+    # config = pipeline.get_config()
+    pipeline.run()
 
 
 # ============================================================================
example/run_pipeline_test.py CHANGED
@@ -86,14 +86,14 @@ def run_with_manual_setup():
     from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
 
     # Create the S3 data source
-    # source = S3Source(
-    #     endpoint='https://textin-minio-api.ai.intsig.net',
-    #     access_key='IEQspf8C7fVcgmp3AZWl',
-    #     secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
-    #     bucket='textin-test',
-    #     prefix='',
-    #     region='us-east-1'
-    # )
+    source = S3Source(
+        endpoint='https://textin-minio-api.ai.intsig.net',
+        access_key='IEQspf8C7fVcgmp3AZWl',
+        secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
+        bucket='textin-test',
+        prefix='',
+        region='us-east-1'
+    )
     source = S3Source(
         endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
         access_key='LTAI5tBgsaVfkbh9rbPyuB17',
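
Put together, the examples' new flow looks roughly like the sketch below. This is illustrative only: the credentials and URLs are placeholders, and the `Pipeline(source=..., destination=..., stages=...)` constructor form and the bare `ChunkConfig()`/`EmbedConfig()` calls are assumptions based on names visible in this diff, not a verbatim copy of the example files.

```python
# Minimal sketch of the updated example flow: S3 in, Qdrant out.
# All endpoints and credentials are placeholders; the Pipeline constructor
# arguments are assumed from the names used elsewhere in this diff.
from xparse_client import (
    ChunkConfig, EmbedConfig, ParseConfig, Stage,
    Pipeline, S3Source, QdrantDestination,
)

source = S3Source(
    endpoint='https://s3.us-east-1.amazonaws.com',
    access_key='YOUR_ACCESS_KEY',
    secret_key='YOUR_SECRET_KEY',
    bucket='your-bucket',
    prefix='',
    region='us-east-1',
    pattern=['*.pdf'],        # new list-based pattern filter
)

destination = QdrantDestination(
    url='https://your-cluster.cloud.qdrant.io:6333',
    collection_name='your_collection',
    dimension=1024,           # must match the embed stage's output
    api_key='YOUR_QDRANT_API_KEY',
)

stages = [
    Stage(type='parse', config=ParseConfig(provider='paddle')),
    Stage(type='chunk', config=ChunkConfig()),
    Stage(type='embed', config=EmbedConfig()),
]

pipeline = Pipeline(source=source, destination=destination, stages=stages)
pipeline.run()
```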
xparse_client/__init__.py CHANGED
@@ -10,7 +10,7 @@ logging.basicConfig(
 
 from .pipeline.config import ParseConfig, ChunkConfig, EmbedConfig, Stage, PipelineStats, PipelineConfig
 from .pipeline.sources import Source, S3Source, LocalSource, FtpSource, SmbSource
-from .pipeline.destinations import Destination, MilvusDestination, LocalDestination, S3Destination
+from .pipeline.destinations import Destination, MilvusDestination, QdrantDestination, LocalDestination, S3Destination
 from .pipeline.pipeline import Pipeline, create_pipeline_from_config
 
 __all__ = [
@@ -27,6 +27,7 @@ __all__ = [
     'SmbSource',
     'Destination',
     'MilvusDestination',
+    'QdrantDestination',
     'LocalDestination',
     'S3Destination',
     'Pipeline',
xparse_client/pipeline/destinations.py CHANGED
@@ -13,7 +13,8 @@ from typing import List, Dict, Any
 
 from botocore.config import Config
 from pymilvus import MilvusClient
-
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, PointStruct, PayloadSchemaType
 
 logger = logging.getLogger(__name__)
 
@@ -127,7 +128,7 @@ class MilvusDestination(Destination):
                     print(f"  ✓ Deleted existing records: record_id={record_id}, {deleted_count} removed")
                     logger.info(f"Deleted existing Milvus records: record_id={record_id}, {deleted_count} removed")
                 else:
-                    print(f"  → No existing records found: record_id={record_id}")
+                    print(f"  → Ready to write records: record_id={record_id}")
             except Exception as e:
                 print(f"  ! Failed to delete existing records: {str(e)}")
                 logger.warning(f"Failed to delete existing Milvus records: record_id={record_id}, {str(e)}")
@@ -296,9 +297,190 @@ class S3Destination(Destination):
             return False
 
 
+class QdrantDestination(Destination):
+    """Qdrant vector database destination"""
+
+    def __init__(self, url: str, collection_name: str, dimension: int, api_key: str = None, prefer_grpc: bool = False):
+        """Initialize the Qdrant destination
+
+        Args:
+            url: Qdrant server address (e.g. 'http://localhost:6333' or 'https://xxx.qdrant.io')
+            collection_name: Collection name
+            dimension: Vector dimension
+            api_key: API key (optional; used for Qdrant Cloud)
+            prefer_grpc: Whether to prefer gRPC (default False, i.e. HTTP)
+        """
+
+        self.url = url
+        self.collection_name = collection_name
+        self.dimension = dimension
+
+        client_kwargs = {'url': url}
+        if api_key:
+            client_kwargs['api_key'] = api_key
+        if prefer_grpc:
+            client_kwargs['prefer_grpc'] = True
+
+        self.client = QdrantClient(**client_kwargs)
+
+        # Check for the collection, creating it if needed
+        try:
+            collections = self.client.get_collections()
+            collection_exists = any(col.name == collection_name for col in collections.collections)
+
+            if not collection_exists:
+                self.client.create_collection(
+                    collection_name=collection_name,
+                    vectors_config=VectorParams(
+                        size=dimension,
+                        distance=Distance.COSINE
+                    )
+                )
+                # Index record_id so it can be used in filtered queries
+                try:
+                    self.client.create_payload_index(
+                        collection_name=collection_name,
+                        field_name="record_id",
+                        field_schema=PayloadSchemaType.KEYWORD
+                    )
+                    print(f"✓ Qdrant collection created: {collection_name} (dimension: {dimension})")
+                except Exception as e:
+                    logger.warning(f"Failed to create record_id index (it may already exist): {str(e)}")
+                    print(f"✓ Qdrant collection created: {collection_name} (dimension: {dimension})")
+            else:
+                print(f"✓ Qdrant collection exists: {collection_name}")
+                # Make sure the record_id index exists (create it if missing)
+                try:
+                    self.client.create_payload_index(
+                        collection_name=collection_name,
+                        field_name="record_id",
+                        field_schema=PayloadSchemaType.KEYWORD
+                    )
+                except Exception as e:
+                    # The index may already exist; ignore the error
+                    logger.debug(f"record_id index may already exist: {str(e)}")
+
+            logger.info(f"Qdrant connected: {url}/{collection_name}")
+        except Exception as e:
+            print(f"✗ Qdrant connection failed: {str(e)}")
+            logger.error(f"Qdrant connection failed: {str(e)}")
+            raise
+
+    def write(self, data: List[Dict[str, Any]], metadata: Dict[str, Any]) -> bool:
+        try:
+            # If metadata carries a record_id, delete existing records with the same record_id first
+            record_id = metadata.get('record_id')
+            if record_id:
+                try:
+                    # Look up and delete every record with the same record_id
+                    # Use a dict-style filter (better compatibility)
+                    scroll_result = self.client.scroll(
+                        collection_name=self.collection_name,
+                        scroll_filter={
+                            "must": [
+                                {
+                                    "key": "record_id",
+                                    "match": {"value": record_id}
+                                }
+                            ]
+                        },
+                        limit=10000  # assume at most 10000 records are deleted per call
+                    )
+
+                    if scroll_result[0]:  # records found
+                        point_ids = [point.id for point in scroll_result[0]]
+                        self.client.delete(
+                            collection_name=self.collection_name,
+                            points_selector=point_ids
+                        )
+                        print(f"  ✓ Deleted existing records: record_id={record_id}, {len(point_ids)} removed")
+                        logger.info(f"Deleted existing Qdrant records: record_id={record_id}, {len(point_ids)} removed")
+                    else:
+                        print(f"  → Ready to write records: record_id={record_id}")
+                except Exception as e:
+                    print(f"  ! Failed to delete existing records: {str(e)}")
+                    logger.warning(f"Failed to delete existing Qdrant records: record_id={record_id}, {str(e)}")
+                    # Keep writing; a failed delete should not abort the write
+            else:
+                print(f"  → Missing record_id")
+                logger.warning(f"Missing record_id")
+                return False
+
+            points = []
+            for item in data:
+                # Element-level metadata
+                element_metadata = item.get('metadata', {})
+
+                if 'embeddings' in item and item['embeddings']:
+                    element_id = item.get('element_id') or item.get('id') or str(uuid.uuid4())
+
+                    # Build the payload (metadata)
+                    payload = {
+                        'text': item.get('text', ''),
+                        'record_id': record_id,
+                    }
+
+                    # Merge file-level and element-level metadata
+                    # File-level metadata takes precedence
+                    merged_metadata = {**element_metadata, **metadata}
+
+                    # Copy metadata fields into the payload
+                    # Skip the fixed fields to avoid collisions
+                    fixed_fields = {'embeddings', 'text', 'element_id', 'record_id', 'created_at', 'metadata'}
+                    for key, value in merged_metadata.items():
+                        if key not in fixed_fields:
+                            # Special case for data_source: recursively flatten dicts
+                            if key == 'data_source' and isinstance(value, dict):
+                                # Recursively flatten the data_source dict, including nested dicts
+                                flattened = _flatten_dict(value, 'data_source', fixed_fields)
+                                payload.update(flattened)
+                            elif key == 'coordinates' and isinstance(value, list):
+                                payload[key] = value
+                            elif isinstance(value, (dict, list)):
+                                # Qdrant supports JSON payloads
+                                payload[key] = value
+                            else:
+                                payload[key] = value
+
+                    # Create the Point (an id is required)
+                    # Qdrant point ids can be integers or UUID strings
+                    # If element_id is already a UUID, use it; otherwise derive a stable UUID5 from element_id
+                    try:
+                        # Try to parse element_id as a UUID
+                        point_id = str(uuid.UUID(element_id))
+                    except (ValueError, TypeError):
+                        # Not a valid UUID: derive a stable UUID5 from element_id
+                        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, str(element_id)))
+
+                    point = PointStruct(
+                        id=point_id,
+                        vector=item['embeddings'],
+                        payload=payload
+                    )
+                    points.append(point)
+
+            if not points:
+                print(f"  ! Warning: no valid vector data")
+                return False
+
+            # Bulk insert
+            self.client.upsert(
+                collection_name=self.collection_name,
+                points=points
+            )
+            print(f"  ✓ Wrote {len(points)} points to Qdrant")
+            logger.info(f"Wrote {len(points)} points to Qdrant")
+            return True
+        except Exception as e:
+            print(f"  ✗ Failed to write to Qdrant: {str(e)}")
+            logger.error(f"Failed to write to Qdrant: {str(e)}")
+            return False
+
+
 __all__ = [
     'Destination',
     'MilvusDestination',
+    'QdrantDestination',
     'LocalDestination',
     'S3Destination',
 ]
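
The write path above follows a delete-then-upsert pattern: points carrying the same `record_id` are found with `scroll`, removed with `delete`, and replacements are upserted under stable UUID5 ids. Below is a minimal standalone sketch of the same qdrant-client calls, assuming a local Qdrant at `localhost:6333`; the collection name and vectors are illustrative.

```python
# Minimal sketch of the delete-then-upsert pattern QdrantDestination.write uses.
# Assumes a local Qdrant instance; collection name and vectors are illustrative.
import uuid
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

client = QdrantClient(url='http://localhost:6333')
client.create_collection(
    collection_name='demo',
    vectors_config=VectorParams(size=4, distance=Distance.COSINE),
)

record_id = 'doc-001'

# 1. Remove any stale points for this record_id (same dict-style filter as write()).
stale, _ = client.scroll(
    collection_name='demo',
    scroll_filter={"must": [{"key": "record_id", "match": {"value": record_id}}]},
    limit=10000,
)
if stale:
    client.delete(collection_name='demo', points_selector=[p.id for p in stale])

# 2. Upsert fresh points under stable UUID5 ids derived from the element id,
#    so re-running the same document overwrites rather than duplicates.
element_id = 'elem-1'
point = PointStruct(
    id=str(uuid.uuid5(uuid.NAMESPACE_URL, element_id)),
    vector=[0.1, 0.2, 0.3, 0.4],
    payload={'text': 'hello', 'record_id': record_id},
)
client.upsert(collection_name='demo', points=[point])
```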
xparse_client/pipeline/pipeline.py CHANGED
@@ -12,7 +12,7 @@ import requests
 
 from .config import ParseConfig, ChunkConfig, EmbedConfig, Stage, PipelineStats, PipelineConfig
 from .sources import Source, S3Source, LocalSource, FtpSource, SmbSource
-from .destinations import Destination, MilvusDestination, LocalDestination, S3Destination
+from .destinations import Destination, MilvusDestination, QdrantDestination, LocalDestination, S3Destination
 
 
 logger = logging.getLogger(__name__)
@@ -145,6 +145,14 @@ class Pipeline:
                 'dimension': self.destination.dimension
             })
             # api_key and token are not stored on the object and cannot be recovered
+        elif isinstance(self.destination, QdrantDestination):
+            config['destination'].update({
+                'url': self.destination.url,
+                'collection_name': self.destination.collection_name,
+                'dimension': self.destination.dimension,
+                'prefer_grpc': getattr(self.destination, 'prefer_grpc', False)
+            })
+            # api_key is not stored on the object and cannot be recovered
         elif isinstance(self.destination, LocalDestination):
             config['destination'].update({
                 'output_dir': str(self.destination.output_dir)
@@ -503,6 +511,14 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
             api_key=dest_config.get('api_key'),
             token=dest_config.get('token')
         )
+    elif dest_config['type'] == 'qdrant':
+        destination = QdrantDestination(
+            url=dest_config['url'],
+            collection_name=dest_config['collection_name'],
+            dimension=dest_config['dimension'],
+            api_key=dest_config.get('api_key'),
+            prefer_grpc=dest_config.get('prefer_grpc', False)
+        )
    elif dest_config['type'] == 'local':
        destination = LocalDestination(
            output_dir=dest_config['output_dir']
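
With this branch in place, `create_pipeline_from_config` accepts a destination block of `type: 'qdrant'`. A hypothetical config dict is sketched below; only the keys read in this hunk (`url`, `collection_name`, `dimension`, `api_key`, `prefer_grpc`) are confirmed by the diff, while the overall dict shape is inferred from the `get_config` branch above.

```python
# Hypothetical config for create_pipeline_from_config; the 'destination'
# block's keys come from this hunk, the surrounding shape is an inference.
from xparse_client import create_pipeline_from_config

config = {
    # ... source and stages sections as produced by Pipeline.get_config() ...
    'destination': {
        'type': 'qdrant',
        'url': 'http://localhost:6333',
        'collection_name': 'my_collection',
        'dimension': 1024,
        'api_key': 'your-api-key',   # optional; note get_config() does not persist it
        'prefer_grpc': False,
    },
}

pipeline = create_pipeline_from_config(config)
pipeline.run()
```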
xparse_client/pipeline/sources.py CHANGED
@@ -11,7 +11,7 @@ from datetime import datetime, timezone
 from email.utils import parsedate_to_datetime
 from fnmatch import fnmatch
 from pathlib import Path
-from typing import List, Dict, Any, Tuple
+from typing import List, Dict, Any, Tuple, Optional
 
 from smb.SMBConnection import SMBConnection
 from botocore.config import Config
@@ -20,6 +20,56 @@ from botocore.config import Config
 logger = logging.getLogger(__name__)
 
 
+def _normalize_wildcard_patterns(pattern: Optional[List[str]]) -> Optional[List[str]]:
+    """Normalize a list of wildcard patterns
+
+    Args:
+        pattern: List of wildcard patterns; None or an empty list means match all files
+
+    Returns:
+        The normalized pattern list, or None if pattern is None, empty, or contains "*" (meaning match all files)
+    """
+    if pattern is None or not pattern:
+        return None  # None means match all files
+
+    if not isinstance(pattern, list):
+        raise ValueError(f"Invalid pattern type: {type(pattern)}")
+
+    # Drop empty strings and surrounding whitespace
+    normalized = [p.strip() for p in pattern if p and p.strip()]
+
+    if not normalized:
+        return None
+
+    # If "*" is present, return None outright (match everything, saving work later)
+    if '*' in normalized:
+        return None
+
+    return normalized
+
+
+def _match_file_extension(file_path: str, wildcard_patterns: Optional[List[str]]) -> bool:
+    """Check whether a file path matches the wildcard patterns
+
+    Args:
+        file_path: File path
+        wildcard_patterns: Normalized list of wildcard patterns (e.g. ['*.pdf', '*.docx'])
+
+    Returns:
+        True if the path matches, otherwise False
+    """
+    # None (or an empty list) matches all files
+    if wildcard_patterns is None:
+        return True
+
+    # Check the path against each wildcard pattern in turn
+    for wildcard_pattern in wildcard_patterns:
+        if fnmatch(file_path, wildcard_pattern):
+            return True
+
+    return False
+
+
 def _to_millis_timestamp_string(timestamp):
     """Convert a timestamp to a millisecond timestamp string
 
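
The net effect of these two helpers is easiest to see with a few concrete calls, derived directly from the definitions above. Note that both functions are module-private, so importing them as below is for illustration only.

```python
# Behavior of the new pattern helpers, per the definitions above.
from xparse_client.pipeline.sources import (
    _match_file_extension, _normalize_wildcard_patterns,
)

assert _normalize_wildcard_patterns(None) is None             # match everything
assert _normalize_wildcard_patterns(['*']) is None            # '*' collapses to match-all
assert _normalize_wildcard_patterns([' *.pdf ', '']) == ['*.pdf']  # trimmed, empties dropped

patterns = _normalize_wildcard_patterns(['*.pdf', '**/*.txt'])
assert _match_file_extension('report.pdf', patterns) is True
assert _match_file_extension('notes/readme.md', patterns) is False
assert _match_file_extension('anything.bin', None) is True    # None matches all files
```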
@@ -62,11 +112,11 @@ class S3Source(Source):
     """S3/MinIO data source"""
 
     def __init__(self, endpoint: str, access_key: str, secret_key: str,
-                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: str = '*', recursive: bool = False):
+                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: Optional[List[str]] = None, recursive: bool = False):
         self.endpoint = endpoint
         self.bucket = bucket
         self.prefix = prefix
-        self.pattern = pattern or '*'
+        self.pattern = _normalize_wildcard_patterns(pattern)  # normalized at init time
         self.recursive = recursive
 
         if self.endpoint == 'https://textin-minio-api.ai.intsig.net':
@@ -108,7 +158,7 @@ class S3Source(Source):
                 key = obj['Key']
                 if key.endswith('/') or key.endswith('empty.tmp'):
                     continue
-                if fnmatch(key, self.pattern):
+                if _match_file_extension(key, self.pattern):
                     files.append(key)
 
             # In non-recursive mode, CommonPrefixes holds subdirectories; we ignore them
@@ -156,9 +206,9 @@ class S3Source(Source):
 class LocalSource(Source):
     """Local filesystem data source"""
 
-    def __init__(self, directory: str, pattern: str = '*', recursive: bool = False):
+    def __init__(self, directory: str, pattern: Optional[List[str]] = None, recursive: bool = False):
         self.directory = Path(directory)
-        self.pattern = pattern or '*'
+        self.pattern = _normalize_wildcard_patterns(pattern)  # normalized at init time
         self.recursive = recursive
 
         if not self.directory.exists():
@@ -168,20 +218,29 @@ class LocalSource(Source):
         logger.info(f"Local directory: {self.directory}")
 
     def list_files(self) -> List[str]:
+        all_files = []
+        # Collect every file first
         if self.recursive:
-            # Recursive mode: use rglob
-            files = [
+            all_files.extend([
                 str(f.relative_to(self.directory))
-                for f in self.directory.rglob(self.pattern)
+                for f in self.directory.rglob('*')
                 if f.is_file()
-            ]
+            ])
         else:
-            # Non-recursive mode: list only the root directory, using glob
-            files = [
+            all_files.extend([
                 str(f.relative_to(self.directory))
-                for f in self.directory.glob(self.pattern)
+                for f in self.directory.glob('*')
                 if f.is_file()
-            ]
+            ])
+
+        files = []
+        if self.pattern is not None:
+            for file in all_files:
+                if _match_file_extension(file, self.pattern):
+                    files.append(file)
+        else:
+            files.extend(all_files)
+
         print(f"✓ Found {len(files)} local files")
         return files
 
@@ -217,12 +276,12 @@ class LocalSource(Source):
 class FtpSource(Source):
     """FTP data source"""
 
-    def __init__(self, host: str, port: int, username: str, password: str, pattern: str = '*', recursive: bool = False):
+    def __init__(self, host: str, port: int, username: str, password: str, pattern: Optional[List[str]] = None, recursive: bool = False):
         self.host = host
         self.port = port
         self.username = username
         self.password = password
-        self.pattern = pattern or '*'
+        self.pattern = _normalize_wildcard_patterns(pattern)  # normalized at init time
         self.recursive = recursive
 
         self.client = ftplib.FTP()
@@ -288,12 +347,12 @@ class FtpSource(Source):
                     except:
                         # Not a directory, so it's a file
                         relative_path = full_path.lstrip('/')
-                        if fnmatch(relative_path, self.pattern):
+                        if _match_file_extension(relative_path, self.pattern):
                             files.append(relative_path)
                 else:
                     # It's a file
                     relative_path = full_path.lstrip('/')
-                    if fnmatch(relative_path, self.pattern):
+                    if _match_file_extension(relative_path, self.pattern):
                         files.append(relative_path)
 
             # Restore the original directory
@@ -325,7 +384,7 @@ class FtpSource(Source):
                     item_type = item_info.get('type', 'unknown')
                     # Only add files; skip directories
                     if item_type == 'file' or (item_type == 'unknown' and not item_info.get('type', '').startswith('dir')):
-                        if fnmatch(item_name, self.pattern):
+                        if _match_file_extension(item_name, self.pattern):
                             files.append(item_name)
             except:
                 # Fall back to the LIST command if MLSD is not supported
@@ -341,7 +400,7 @@ class FtpSource(Source):
                             continue
                         is_dir = parts[0].startswith('d')
                         # Only add files; skip directories
-                        if not is_dir and fnmatch(item_name, self.pattern):
+                        if not is_dir and _match_file_extension(item_name, self.pattern):
                             files.append(item_name)
             except:
                 # Finally fall back to nlst, detecting directories by trying to cd into them
@@ -357,7 +416,7 @@ class FtpSource(Source):
                             continue
                     except:
                         # Cannot cd in, so it's a file
-                        if fnmatch(item_name, self.pattern):
+                        if _match_file_extension(item_name, self.pattern):
                             files.append(item_name)
 
             # Make sure we return to the original directory
@@ -405,7 +464,7 @@ class SmbSource(Source):
     """SMB/CIFS data source"""
 
     def __init__(self, host: str, share_name: str, username: str, password: str,
-                 domain: str = '', port: int = 445, path: str = '', pattern: str = '*', recursive: bool = False):
+                 domain: str = '', port: int = 445, path: str = '', pattern: Optional[List[str]] = None, recursive: bool = False):
         self.host = host
         self.share_name = share_name
         self.username = username
@@ -413,7 +472,7 @@ class SmbSource(Source):
         self.domain = domain
         self.port = port
         self.path = path.strip('/').strip('\\') if path else ''
-        self.pattern = pattern or '*'
+        self.pattern = _normalize_wildcard_patterns(pattern)  # normalized at init time
         self.recursive = recursive
 
         self.conn = SMBConnection(
@@ -451,7 +510,7 @@ class SmbSource(Source):
                         _list_recursive(conn, share, item_path)
                     # Non-recursive mode: skip subdirectories
                     else:
-                        if fnmatch(relative_path, self.pattern):
+                        if _match_file_extension(relative_path, self.pattern):
                             files.append(relative_path)
         except Exception as e:
            logger.warning(f"Failed to list path {current_path}: {str(e)}")
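
Across all four Source classes the change is the same: `pattern` is now `Optional[List[str]]` and is normalized once at construction. A sketch of the updated constructors, with placeholder hosts and credentials:

```python
# The pattern argument is now Optional[List[str]] on every Source.
# Hosts, credentials, and paths below are placeholders.
from xparse_client import S3Source, LocalSource, FtpSource, SmbSource

local = LocalSource(directory='./input', pattern=['*.pdf', '*.docx'])
s3 = S3Source(
    endpoint='https://s3.us-east-1.amazonaws.com',
    access_key='YOUR_KEY', secret_key='YOUR_SECRET',
    bucket='your-bucket', pattern=['*.pdf'], recursive=True,
)
ftp = FtpSource(host='ftp.example.com', port=21,
                username='user', password='pass', pattern=None)  # None = all files
smb = SmbSource(host='smb.example.com', share_name='docs',
                username='user', password='pass', pattern=['**/*.pdf'])
```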
xparse_client-{0.2.8 → 0.2.10}.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.2.8
+Version: 0.2.10
 Summary: Next-generation document-processing AI infra for Agents and RAG
 License-Expression: MIT
 Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -13,6 +13,7 @@ Requires-Dist: boto3
 Requires-Dist: pymilvus[milvus_lite]
 Requires-Dist: requests
 Requires-Dist: pysmb
+Requires-Dist: qdrant-client
 Dynamic: license-file
 
 # xParse
@@ -24,7 +25,7 @@ xParse's synchronous pipeline implementation, supporting multiple data sources and outputs.
 ## 🌟 Features
 
 - **Flexible data sources**: S3-compatible object storage, the local filesystem, and FTP/SMB filesystems
-- **Flexible outputs**: Milvus/Zilliz vector databases, S3-compatible object storage, and the local filesystem
+- **Flexible outputs**: Milvus/Zilliz/Qdrant vector databases, S3-compatible object storage, and the local filesystem
 - **Unified Pipeline API**: run the full parse → chunk → embed flow in one call to `/api/xparse/pipeline`
 - **Configurable processing**: flexible parse, chunk, and embed parameters
 - **Detailed statistics**: processing stats returned for every stage
@@ -51,7 +52,7 @@ xParse's synchronous pipeline implementation, supporting multiple data sources and outputs.
        │ [embeddings + stats]
 
 ┌──────────────┐
-│ Destination  │ destination (Milvus/Zilliz/local)
+│ Destination  │ destination (Milvus/Zilliz/Qdrant/local)
 └──────────────┘
 ```
 
@@ -69,7 +70,7 @@ pip install --upgrade xparse-client
 
 #### Configuration in code
 ```python
-from xparse_client import ParseConfig, ChunkConfig, EmbedConfig, Stage, Pipeline, S3Source, MilvusDestination
+from xparse_client import ParseConfig, ChunkConfig, EmbedConfig, Stage, Pipeline, S3Source, MilvusDestination, QdrantDestination
 
 # Create the config using the new stages format
 stages = [
@@ -173,7 +174,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='us-east-1',
-    pattern='*.pdf'  # Optional; filter objects with shell wildcards
+    pattern=['*.pdf']  # Optional; list of wildcard patterns, multiple extensions supported
 )
 ```
 Make sure the configured credentials grant at least the following permissions:
@@ -193,7 +194,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='cn-shanghai',
-    pattern='*.pdf'  # Optional; filter objects with shell wildcards
+    pattern=['*.pdf']  # Optional; list of wildcard patterns, multiple extensions supported
 )
 ```
 Make sure the configured credentials grant at least the following permissions:
@@ -214,7 +215,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='ap-shanghai',
-    pattern='*.pdf'  # Optional; filter objects with shell wildcards
+    pattern=['*.pdf']  # Optional; list of wildcard patterns, multiple extensions supported
 )
 ```
 
@@ -235,7 +236,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='cn-shanghai',
-    pattern='*.pdf'  # Optional; filter objects with shell wildcards
+    pattern=['*.pdf']  # Optional; list of wildcard patterns, multiple extensions supported
 )
 ```
 
@@ -257,7 +258,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='cn-east-3',
-    pattern='*.pdf'  # Optional; filter objects with shell wildcards
+    pattern=['*.pdf']  # Optional; list of wildcard patterns, multiple extensions supported
 )
 ```
 
@@ -279,7 +280,7 @@ source = S3Source(
     bucket='textin-xparse',
     prefix='',
     region='us-east-1',
-    pattern='*.pdf'  # Optional; filter objects with shell wildcards
+    pattern=['*.pdf']  # Optional; list of wildcard patterns, multiple extensions supported
 )
 ```
 Make sure the configured credentials grant at least the following permissions:
@@ -294,7 +295,7 @@ s3:GetObject
 ```python
 source = LocalSource(
     directory='./input',
-    pattern='*.pdf'  # Supports wildcards: *.pdf, *.docx, **/*.txt
+    pattern=['*.pdf', '*.docx']  # Supports a list of multiple wildcard patterns
 )
 ```
 
@@ -306,7 +307,7 @@ source = FtpSource(
     port=21,
     username='',  # username; fill in your actual value
     password='',  # password; fill in your actual value
-    pattern='*.pdf'  # Optional; filter by file type
+    pattern=['*.pdf']  # Optional; list of wildcard patterns for filtering by file type
 )
 ```
 
@@ -319,11 +320,11 @@ source = SmbSource(
     username='',  # username; fill in your actual value
     password='',  # password; fill in your actual value
     domain='your-smb-domain',
-    pattern='**/*.pdf'  # Optional; supports multi-level matching
+    pattern=['**/*.pdf']  # Optional; list of wildcard patterns, supports multi-level matching
 )
 ```
 
-> Note 1: every Source accepts a `pattern` parameter with shell wildcards (`*.pdf`, `**/*.txt`, etc.) to filter the files to process; the default is `*`, i.e. all files are processed.
+> Note 1: every Source accepts a `pattern` parameter, a list of wildcard patterns (e.g. `['*.pdf', '*.docx']`) that filters the files to process. Multiple patterns are supported, and a list containing `'*'` matches all files. The default is `None`, i.e. all files are processed.
 
 > Note 2: every Source accepts a `recursive` parameter that controls recursive traversal; the default is `False`.
 
@@ -354,6 +355,28 @@ destination = MilvusDestination(
 )
 ```
 
+#### Qdrant vector store
+
+```python
+destination = QdrantDestination(
+    url='http://localhost:6333',      # Qdrant server address (local or cloud)
+    collection_name='my_collection',  # collection name
+    dimension=1024,                   # vector dimension; must match what the embed API returns
+    api_key='your-api-key',           # optional; Qdrant Cloud API key
+    prefer_grpc=False                 # optional; whether to prefer gRPC (default False)
+)
+```
+
+**Qdrant Cloud example:**
+```python
+destination = QdrantDestination(
+    url='https://xxxxxxx.us-east-1-0.aws.cloud.qdrant.io',
+    collection_name='my_collection',
+    dimension=1024,
+    api_key='your-api-key'
+)
+```
+
 #### Local filesystem destination
 
 Writes `json` files to the configured local path.
@@ -533,7 +556,7 @@ source = S3Source(
     bucket='documents',
     prefix='pdfs/',
     region='us-east-1',
-    pattern='*.pdf',  # only process matching files
+    pattern=['*.pdf'],  # only process matching files
     recursive=False  # do not recurse into subdirectories
 )
 
@@ -597,7 +620,7 @@ from xparse_client import (
 # Create the Pipeline manually
 source = LocalSource(
     directory='./test_files',
-    pattern='*.pdf',
+    pattern=['*.pdf'],
     recursive=False
 )
 
@@ -644,7 +667,7 @@ from xparse_client import (
 # Create the local data source
 source = LocalSource(
     directory='./test_files',
-    pattern='*.pdf',
+    pattern=['*.pdf'],
     recursive=False
 )
 
@@ -771,7 +794,7 @@ source = FtpSource(
     port=21,
     username='user',
     password='pass',
-    pattern='*.pdf',
+    pattern=['*.pdf'],
     recursive=False
 )
 
@@ -831,7 +854,7 @@ from xparse_client import (
 # Create the Pipeline
 source = LocalSource(
     directory='./docs',
-    pattern='*.pdf',
+    pattern=['*.pdf'],
     recursive=False
 )
 
xparse_client-0.2.10.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+example/run_pipeline.py,sha256=d4pPDqjiC9dPNh6nmArPOF7fPMY0a-jcvdgtNuV-_kM,15795
+example/run_pipeline_test.py,sha256=pxsNiq_LmP6M4R7tTuja0u-Lu7fW-wIBU1uBf0-agQI,14845
+xparse_client/__init__.py,sha256=C2XLxkCoONl6_B1FmDhWRw84TqOL4pZF20br-K26SSY,1721
+xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
+xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
+xparse_client/pipeline/destinations.py,sha256=9UyZ8Ygjoe4yAq6-VZNZBoNYRbb3mahify3c1AdOHMY,20775
+xparse_client/pipeline/pipeline.py,sha256=ZspagUjiL5wnzGJq6A7riOU8qGXJMtg1fqPm9H09mkk,27272
+xparse_client/pipeline/sources.py,sha256=D-kLrSQ-qsFFFq7JC4sL3Y3Q3Q87Wcpv9R5K85YkDjE,22144
+xparse_client-0.2.10.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
+xparse_client-0.2.10.dist-info/METADATA,sha256=gIY_PxB1pTxSlKJZjU7z1Iua6ZMtAfMfHFeztWp2zIw,28785
+xparse_client-0.2.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+xparse_client-0.2.10.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
+xparse_client-0.2.10.dist-info/RECORD,,
xparse_client-0.2.8.dist-info/RECORD REMOVED
@@ -1,13 +0,0 @@
-example/run_pipeline.py,sha256=ybAWBPXcQClRk1HOMySLi9IUPIs1Qn-S5HXNLbNJHjs,15459
-example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
-xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
-xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
-xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
-xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
-xparse_client/pipeline/pipeline.py,sha256=IRTxN4YUJi9Wrm1G1ysGvcwsPsGh0inbquBH3nWYmAA,26477
-xparse_client/pipeline/sources.py,sha256=UeVbWv6n0wQkIZIBBhrFCiyydQX7cvwmkoMgcf12p9g,19940
-xparse_client-0.2.8.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
-xparse_client-0.2.8.dist-info/METADATA,sha256=LX8TfLSbFZerGPhh16x5QK1lwrPh55CYKNLhr2kdBcY,27850
-xparse_client-0.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-xparse_client-0.2.8.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
-xparse_client-0.2.8.dist-info/RECORD,,
- xparse_client-0.2.8.dist-info/RECORD,,