xparse-client 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -231,12 +231,17 @@ class Pipeline:
231
231
 
232
232
  # 将 stages 转换为 API 格式
233
233
  stages_data = [stage.to_dict() for stage in self.stages]
234
- form_data['stages'] = json.dumps(stages_data)
235
- form_data['data_source'] = json.dumps(data_source, ensure_ascii=False)
236
-
237
- # 如果启用了中间结果保存,在请求中添加参数
238
- if self.pipeline_config:
239
- form_data['config'] = json.dumps(self.pipeline_config.to_dict(), ensure_ascii=False)
234
+ try:
235
+ form_data['stages'] = json.dumps(stages_data)
236
+ form_data['data_source'] = json.dumps(data_source, ensure_ascii=False)
237
+
238
+ # 如果启用了中间结果保存,在请求中添加参数
239
+ if self.pipeline_config:
240
+ form_data['config'] = json.dumps(self.pipeline_config.to_dict(), ensure_ascii=False)
241
+ except Exception as e:
242
+ print(f" ✗ 入参处理失败,请检查配置: {e}")
243
+ logger.error(f"入参处理失败,请检查配置: {e}")
244
+ return None
240
245
 
241
246
  response = requests.post(
242
247
  url,
@@ -469,13 +474,13 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
469
474
  bucket=source_config['bucket'],
470
475
  prefix=source_config.get('prefix', ''),
471
476
  region=source_config.get('region', 'us-east-1'),
472
- pattern=source_config.get('pattern', '*'),
477
+ pattern=source_config.get('pattern', None),
473
478
  recursive=source_config.get('recursive', False)
474
479
  )
475
480
  elif source_config['type'] == 'local':
476
481
  source = LocalSource(
477
482
  directory=source_config['directory'],
478
- pattern=source_config.get('pattern', '*'),
483
+ pattern=source_config.get('pattern', None),
479
484
  recursive=source_config.get('recursive', False)
480
485
  )
481
486
  elif source_config['type'] == 'ftp':
@@ -484,7 +489,7 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
484
489
  port=source_config['port'],
485
490
  username=source_config['username'],
486
491
  password=source_config['password'],
487
- pattern=source_config.get('pattern', '*'),
492
+ pattern=source_config.get('pattern', None),
488
493
  recursive=source_config.get('recursive', False)
489
494
  )
490
495
  elif source_config['type'] == 'smb':
@@ -496,7 +501,7 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
496
501
  domain=source_config.get('domain', ''),
497
502
  port=source_config.get('port', 445),
498
503
  path=source_config.get('path', ''),
499
- pattern=source_config.get('pattern', '*'),
504
+ pattern=source_config.get('pattern', None),
500
505
  recursive=source_config.get('recursive', False)
501
506
  )
502
507
  else:
@@ -121,8 +121,6 @@ class S3Source(Source):
121
121
 
122
122
  if self.endpoint == 'https://textin-minio-api.ai.intsig.net':
123
123
  config = Config(signature_version='s3v4')
124
- elif self.endpoint.endswith('aliyuncs.com'):
125
- config = Config(signature_version='s3', s3={'addressing_style': 'virtual'})
126
124
  else:
127
125
  config = Config(signature_version='s3v4', s3={'addressing_style': 'virtual'})
128
126
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.14
3
+ Version: 0.2.16
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -0,0 +1,11 @@
1
+ xparse_client/__init__.py,sha256=C2XLxkCoONl6_B1FmDhWRw84TqOL4pZF20br-K26SSY,1721
2
+ xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
3
+ xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
4
+ xparse_client/pipeline/destinations.py,sha256=QKlNGcpXIqkZS3rlBlhLDoRqIWA21Jgn3GiGhhfE8Rc,20921
5
+ xparse_client/pipeline/pipeline.py,sha256=CGT7K7ynwn550w4b-cSN0wkmUAVFAvouvP8CEyF0qy4,27549
6
+ xparse_client/pipeline/sources.py,sha256=D-kLrSQ-qsFFFq7JC4sL3Y3Q3Q87Wcpv9R5K85YkDjE,22144
7
+ xparse_client-0.2.16.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
8
+ xparse_client-0.2.16.dist-info/METADATA,sha256=FFI7ggmbNMFUX-I_xD0Y8vnGbFOhpQ3yWtOP5tRM1ao,28824
9
+ xparse_client-0.2.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
10
+ xparse_client-0.2.16.dist-info/top_level.txt,sha256=W5PeQwOyfo_Od3d26-gcOtan7rHYk1q3SP1phYedat4,14
11
+ xparse_client-0.2.16.dist-info/RECORD,,
example/run_pipeline.py DELETED
@@ -1,515 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- encoding: utf-8 -*-
3
- '''
4
- Pipeline 运行脚本
5
- 快速启动和运行 Pipeline 的示例
6
- '''
7
-
8
- import json
9
- from datetime import datetime, timezone
10
- from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
11
-
12
-
13
- # ============================================================================
14
- # 常量配置
15
- # ============================================================================
16
-
17
- # API 请求头配置
18
- API_HEADERS = {
19
- 'x-ti-app-id': '4c0032d9e4d93b0ad674cac0d75256e7',
20
- 'x-ti-secret-code': '7104f599ad02b8468fc619f7605d2d8d'
21
- }
22
-
23
-
24
- # ============================================================================
25
- # 方式 1: 使用配置字典
26
- # ============================================================================
27
-
28
- def run_with_config():
29
- """使用配置字典运行 pipeline"""
30
-
31
- config = {
32
- 'source': {
33
- 'type': 's3',
34
- 'endpoint': 'https://textin-minio-api.ai.intsig.net',
35
- 'access_key': 'IEQspf8C7fVcgmp3AZWl',
36
- 'secret_key': 'kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
37
- 'bucket': 'textin-test',
38
- 'prefix': '', # 留空处理所有文件,或指定如 'milvus/'
39
- 'region': 'us-east-1'
40
- },
41
- 'destination': {
42
- 'type': 'milvus',
43
- 'db_path': './milvus_pipeline.db',
44
- 'collection_name': 'pipeline_collection',
45
- 'dimension': 1024
46
- },
47
- 'api_base_url': 'https://api.textin.com/api/xparse',
48
- 'api_headers': API_HEADERS,
49
- # Stages 配置
50
- 'stages': [
51
- {
52
- 'type': 'parse',
53
- 'config': {
54
- 'provider': 'textin'
55
- }
56
- },
57
- {
58
- 'type': 'chunk',
59
- 'config': {
60
- 'strategy': 'basic', # 分块策略: 'basic' | 'by_title' | 'by_page'
61
- 'include_orig_elements': False, # 是否包含原始元素
62
- 'new_after_n_chars': 512, # 多少字符后创建新块
63
- 'max_characters': 1024, # 最大字符数
64
- 'overlap': 0 # 重叠字符数
65
- }
66
- },
67
- {
68
- 'type': 'embed',
69
- 'config': {
70
- 'provider': 'qwen', # 向量化供应商: 'qwen'
71
- 'model_name': 'text-embedding-v3' # 模型名称: 'text-embedding-v3' | 'text-embedding-v4'
72
- }
73
- }
74
- ]
75
- }
76
-
77
- pipeline = create_pipeline_from_config(config)
78
- pipeline.run()
79
-
80
-
81
- # ============================================================================
82
- # 方式 2: 手动创建组件
83
- # ============================================================================
84
-
85
- def run_with_manual_setup():
86
- """手动创建 Source、Destination 和 Pipeline"""
87
- from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage, PipelineConfig, LocalDestination, QdrantDestination
88
-
89
- # 创建 S3 数据源
90
- # source = S3Source(
91
- # endpoint='https://textin-minio-api.ai.intsig.net',
92
- # access_key='IEQspf8C7fVcgmp3AZWl',
93
- # secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
94
- # bucket='textin-test',
95
- # prefix='',
96
- # region='us-east-1'
97
- # )
98
- # source = S3Source(
99
- # endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
100
- # access_key='LTAI5tBgsaVfkbh9rbPyuB17',
101
- # secret_key='JFIIaTGiXelv7DgBYNIBSStofF0S98',
102
- # bucket='textin',
103
- # prefix='',
104
- # region='cn-shanghai'
105
- # )
106
- # source=S3Source(
107
- # endpoint='https://S3.oss-cn-shanghai.aliyuncs.com',
108
- # access_key='LTAI5t6ZnqTra8oLmJEfvcr7',
109
- # secret_key='SEbz4oJ4KNJIOTMfphuVGOWmRpGGUG',
110
- # bucket='textin-test-aliyun',
111
- # prefix='',
112
- # region='cn-shanghai'
113
- # )
114
- # source = S3Source(
115
- # endpoint='https://cos.ap-shanghai.myqcloud.com',
116
- # access_key='',
117
- # secret_key='',
118
- # bucket='textin-1300705866',
119
- # prefix='',
120
- # region='ap-shanghai'
121
- # )
122
- # source = S3Source(
123
- # endpoint='https://tos-s3-cn-shanghai.volces.com',
124
- # access_key='AKLTMzNkZjk1OGM3MzBjNGQ1ZjhkMGQ4MThlNjBjYjZjYzA',
125
- # secret_key='TnpWaE0yRTVaamRqTmpSbU5EY3pObUZrTTJVNE5qUm1NR0ppWkRrMFlqVQ==',
126
- # bucket='textin',
127
- # prefix='',
128
- # region='cn-shanghai'
129
- # )
130
- # source = S3Source(
131
- # endpoint='https://obs.cn-east-3.myhuaweicloud.com',
132
- # access_key='',
133
- # secret_key='',
134
- # bucket='textin',
135
- # prefix='',
136
- # region='cn-east-3'
137
- # )
138
- # source = S3Source(
139
- # endpoint='https://s3.us-east-1.amazonaws.com',
140
- # access_key='AKIA6QUE3TVZADUWA4PO',
141
- # secret_key='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',
142
- # bucket='textin-test',
143
- # prefix='',
144
- # region='us-east-1'
145
- # )
146
- # source = S3Source(
147
- # endpoint='http://127.0.0.1:9000',
148
- # access_key='',
149
- # secret_key='',
150
- # bucket='textin',
151
- # prefix='',
152
- # region='us-east-1'
153
- # )
154
- # source = SmbSource(
155
- # host='internal-storage.intsig.net',
156
- # share_name='ke_wang',
157
- # username='ke_wang',
158
- # password='',
159
- # domain='INTSIG.COM'
160
- # )
161
- # source = FtpSource(
162
- # host='127.0.0.1',
163
- # port=21,
164
- # # recursive=True,
165
- # username='', # 用户名,按照实际填写
166
- # password='' # 密码,按照实际填写
167
- # )
168
- source = LocalSource(
169
- directory='/Users/ke_wang/Documents/doc',
170
- pattern=['*.pdf'],
171
- recursive=True,
172
- )
173
-
174
- # source=S3Source(
175
- # endpoint='https://obs.cn-north-4.myhuaweicloud.com',
176
- # access_key='HPUAFT3D1Q6O6UUN1RWQ',
177
- # secret_key='4zIk8x37nZiDS9P585BTFCWsOSo5G7ok1yRWtEA1',
178
- # bucket='textin-test-ywj',
179
- # prefix='',
180
- # region='cn-north-4'
181
- # )# 华为云
182
-
183
- # 创建 Milvus 目的地
184
- # destination = MilvusDestination(
185
- # db_path='./milvus_pipeline1.db',
186
- # collection_name='pipeline_collection',
187
- # dimension=1024
188
- # )
189
-
190
- # destination = LocalDestination(
191
- # output_dir='./result'
192
- # )
193
-
194
- # destination = MilvusDestination(
195
- # db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
196
- # collection_name='textin_test_3_copy', # 数据库collection名称
197
- # dimension=1024, # 向量维度,需与 embed API 返回一致
198
- # api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46' # Zilliz Cloud API Key
199
- # )
200
-
201
- # destination = S3Destination(
202
- # endpoint='https://cos.ap-shanghai.myqcloud.com',
203
- # access_key='',
204
- # secret_key='',
205
- # bucket='textin-1300705866',
206
- # prefix='result',
207
- # region='ap-shanghai'
208
- # )
209
-
210
- # destination = QdrantDestination(
211
- # url='https://1325db22-7dd8-4fc9-930b-f969d4963b3d.us-east-1-1.aws.cloud.qdrant.io:6333',
212
- # collection_name='textin1',
213
- # dimension=1024,
214
- # api_key='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.TGnFB1pAD7c7IqSOvTpgCPpHXSnnoKhWEQ5pQ8DrBnI',
215
- # )
216
-
217
- destination = S3Destination(
218
- endpoint='https://obs.cn-east-3.myhuaweicloud.com',
219
- access_key='HPUAFT3D1Q6O6UUN1RWQ',
220
- secret_key='4zIk8x37nZiDS9P585BTFCWsOSo5G7ok1yRWtEA1',
221
- bucket='xparse',
222
- prefix='json/',
223
- region='cn-east-3'
224
- ) # 华为云
225
-
226
- # 使用新的 stages 格式创建配置
227
- stages = [
228
- Stage(
229
- type='parse',
230
- config=ParseConfig(provider='textin', page_ranges='3')
231
- ),
232
- Stage(
233
- type='chunk',
234
- config=ChunkConfig(
235
- strategy='by_title', # 按标题分块
236
- include_orig_elements=False,
237
- new_after_n_chars=512,
238
- max_characters=1024,
239
- overlap=50 # 块之间重叠 50 字符
240
- )
241
- ),
242
- Stage(
243
- type='embed',
244
- config=EmbedConfig(
245
- provider='qwen',
246
- model_name='text-embedding-v3'
247
- )
248
- )
249
- ]
250
-
251
- # 配置中间结果保存
252
- intermediate_results_destination = LocalDestination(
253
- output_dir='./intermediate_results'
254
- )
255
-
256
- pipeline_config = PipelineConfig(
257
- include_intermediate_results=True,
258
- intermediate_results_destination=intermediate_results_destination
259
- )
260
-
261
- # 创建 Pipeline
262
- pipeline = Pipeline(
263
- source=source,
264
- destination=destination,
265
- api_base_url='https://textin-api-go-pre.ai.intsig.net/api/xparse',
266
- api_headers=API_HEADERS,
267
- stages=stages,
268
- pipeline_config=pipeline_config
269
- )
270
-
271
- # 运行
272
- # config = pipeline.get_config()
273
- pipeline.run()
274
-
275
-
276
- # ============================================================================
277
- # 方式 3: 本地测试(本地文件 -> 本地输出)
278
- # ============================================================================
279
-
280
- def run_local_test():
281
- """使用本地文件进行测试"""
282
-
283
- config = {
284
- 'source': {
285
- 'type': 'local',
286
- 'directory': '/Users/ke_wang/Documents/doc',
287
- 'pattern': '*.pdf'
288
- },
289
- 'destination': {
290
- 'type': 's3',
291
- 'endpoint': 'https://textin-minio-api.ai.intsig.net',
292
- 'access_key': '',
293
- 'secret_key': '',
294
- 'bucket': 'textin-test',
295
- 'prefix': '',
296
- 'region': 'us-east-1'
297
- },
298
- 'api_base_url': 'https://api.textin.com/api/xparse',
299
- 'api_headers': API_HEADERS,
300
- # Stages 配置
301
- 'stages': [
302
- {
303
- 'type': 'parse',
304
- 'config': {
305
- 'provider': 'textin'
306
- }
307
- },
308
- {
309
- 'type': 'embed',
310
- 'config': {
311
- 'provider': 'qwen',
312
- 'model_name': 'text-embedding-v3'
313
- }
314
- }
315
- ]
316
- }
317
-
318
- pipeline = create_pipeline_from_config(config)
319
- pipeline.run()
320
-
321
-
322
- # ============================================================================
323
- # 方式 4: 处理单个文件
324
- # ============================================================================
325
-
326
- def run_single_file():
327
- """只处理单个文件"""
328
- from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
329
-
330
- # 创建 pipeline
331
- source = LocalSource(directory='/Users/ke_wang/Documents/doc', pattern='*.pdf')
332
- destination = LocalDestination(output_dir='./output')
333
-
334
- # 使用新的 stages 格式创建配置
335
- stages = [
336
- Stage(
337
- type='parse',
338
- config=ParseConfig(provider='textin')
339
- ),
340
- Stage(
341
- type='chunk',
342
- config=ChunkConfig(
343
- strategy='by_page', # 按页面分块
344
- max_characters=2048, # 增大块大小
345
- overlap=100
346
- )
347
- ),
348
- Stage(
349
- type='embed',
350
- config=EmbedConfig(
351
- provider='qwen',
352
- model_name='text-embedding-v4' # 使用更高精度的模型
353
- )
354
- )
355
- ]
356
-
357
- pipeline = Pipeline(
358
- source=source,
359
- destination=destination,
360
- api_base_url='https://api.textin.com/api/xparse',
361
- api_headers=API_HEADERS,
362
- stages=stages
363
- )
364
-
365
- # 只处理指定文件
366
- file_path = '4e3250f00210431fb29ca0c808.pdf' # 相对于 source directory 的路径
367
- success = pipeline.process_file(file_path)
368
-
369
- if success:
370
- print(f"\n✅ 文件 {file_path} 处理成功!")
371
- else:
372
- print(f"\n❌ 文件 {file_path} 处理失败!")
373
-
374
-
375
- # ============================================================================
376
- # 方式 5: 自定义处理流程
377
- # ============================================================================
378
-
379
- def run_custom_flow():
380
- """自定义处理流程,手动控制文件处理"""
381
- from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
382
-
383
- # 创建组件
384
- source = S3Source(
385
- endpoint='https://textin-minio-api.ai.intsig.net',
386
- access_key='',
387
- secret_key='',
388
- bucket='textin-test',
389
- prefix='',
390
- region='us-east-1',
391
- pattern='*.pdf'
392
- )
393
-
394
- destination = MilvusDestination(
395
- db_path='./milvus_custom.db',
396
- collection_name='custom_collection',
397
- dimension=1024
398
- )
399
-
400
- # 使用新的 stages 格式创建配置
401
- stages = [
402
- Stage(
403
- type='parse',
404
- config=ParseConfig(provider='textin')
405
- ),
406
- Stage(
407
- type='chunk',
408
- config=ChunkConfig(
409
- strategy='by_title',
410
- include_orig_elements=True,
411
- max_characters=1536,
412
- overlap=80
413
- )
414
- ),
415
- Stage(
416
- type='embed',
417
- config=EmbedConfig(
418
- provider='qwen',
419
- model_name='text-embedding-v4'
420
- )
421
- )
422
- ]
423
-
424
- pipeline = Pipeline(
425
- source=source,
426
- destination=destination,
427
- api_base_url='https://api.textin.com/api/xparse',
428
- api_headers=API_HEADERS,
429
- stages=stages
430
- )
431
-
432
- # 手动控制文件处理
433
- files = source.list_files()
434
-
435
- for file_path in files[:2]: # 只处理前2个文件
436
- print(f"\n处理: {file_path}")
437
- file_bytes, data_source = source.read_file(file_path)
438
- data_source['date_processed'] = datetime.now(timezone.utc).timestamp()
439
-
440
- # 使用 pipeline 接口处理
441
- result = pipeline.process_with_pipeline(file_bytes, file_path, data_source)
442
-
443
- if result:
444
- embedded, stats = result
445
- print(f" - 原始元素: {stats.original_elements}")
446
- print(f" - 分块后: {stats.chunked_elements}")
447
- print(f" - 向量化: {stats.embedded_elements}")
448
-
449
- # 写入
450
- metadata = {
451
- 'file_name': file_path,
452
- 'data_source': data_source,
453
- 'stats': {
454
- 'original_elements': stats.original_elements,
455
- 'chunked_elements': stats.chunked_elements,
456
- 'embedded_elements': stats.embedded_elements
457
- }
458
- }
459
- destination.write(embedded, metadata)
460
- print(f"✓ 完成: {file_path}")
461
- else:
462
- print(f"✗ 失败: {file_path}")
463
-
464
-
465
- # ============================================================================
466
- # 主函数
467
- # ============================================================================
468
-
469
- def main():
470
- """主函数 - 选择运行方式"""
471
-
472
- print("=" * 60)
473
- print("Pipeline 运行脚本")
474
- print("=" * 60)
475
- print("\n请选择运行方式:")
476
- print("1. 使用配置字典 (S3 -> Milvus) [基础配置]")
477
- print("2. 手动创建组件 (S3 -> Milvus) [按标题分块 + 自定义配置]")
478
- print("3. 本地测试 (本地文件 -> 本地输出) [基础配置]")
479
- print("4. 处理单个文件 [按页面分块 + V4模型]")
480
- print("5. 自定义处理流程 [手动控制 + 统计信息]")
481
- print()
482
-
483
- try:
484
- choice = input("请输入选项 (1-5) [默认: 1]: ").strip() or '1'
485
-
486
- if choice == '1':
487
- print("\n使用配置字典运行...")
488
- run_with_config()
489
- elif choice == '2':
490
- print("\n手动创建组件运行...")
491
- run_with_manual_setup()
492
- elif choice == '3':
493
- print("\n本地测试模式...")
494
- run_local_test()
495
- elif choice == '4':
496
- print("\n处理单个文件...")
497
- run_single_file()
498
- elif choice == '5':
499
- print("\n自定义处理流程...")
500
- run_custom_flow()
501
- else:
502
- print("无效的选项,使用默认方式运行...")
503
- run_with_config()
504
-
505
- except KeyboardInterrupt:
506
- print("\n\n用户中断执行")
507
- except Exception as e:
508
- print(f"\n程序异常: {str(e)}")
509
- import traceback
510
- traceback.print_exc()
511
-
512
-
513
- if __name__ == '__main__':
514
- main()
515
-
@@ -1,458 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- encoding: utf-8 -*-
3
- '''
4
- Pipeline 运行脚本
5
- 快速启动和运行 Pipeline 的示例
6
- '''
7
-
8
- from datetime import datetime, timezone
9
- from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
10
-
11
-
12
- # ============================================================================
13
- # 常量配置
14
- # ============================================================================
15
-
16
- # API 请求头配置
17
- API_HEADERS = {
18
- 'x-ti-app-id': '4c0032d9e4d93b0ad674cac0d75256e7',
19
- 'x-ti-secret-code': '7104f599ad02b8468fc619f7605d2d8d'
20
- }
21
-
22
-
23
- # ============================================================================
24
- # 方式 1: 使用配置字典
25
- # ============================================================================
26
-
27
- def run_with_config():
28
- """使用配置字典运行 pipeline"""
29
-
30
- config = {
31
- 'source': {
32
- 'type': 's3',
33
- 'endpoint': 'https://textin-minio-api.ai.intsig.net',
34
- 'access_key': 'IEQspf8C7fVcgmp3AZWl',
35
- 'secret_key': 'kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
36
- 'bucket': 'textin-test',
37
- 'prefix': '', # 留空处理所有文件,或指定如 'milvus/'
38
- 'region': 'us-east-1'
39
- },
40
- 'destination': {
41
- 'type': 'milvus',
42
- 'db_path': './milvus_pipeline.db',
43
- 'collection_name': 'pipeline_collection',
44
- 'dimension': 1024
45
- },
46
- 'api_base_url': 'https://api.textin.com/api/xparse',
47
- 'api_headers': API_HEADERS,
48
- # Stages 配置
49
- 'stages': [
50
- {
51
- 'type': 'parse',
52
- 'config': {
53
- 'provider': 'textin'
54
- }
55
- },
56
- {
57
- 'type': 'chunk',
58
- 'config': {
59
- 'strategy': 'basic', # 分块策略: 'basic' | 'by_title' | 'by_page'
60
- 'include_orig_elements': False, # 是否包含原始元素
61
- 'new_after_n_chars': 512, # 多少字符后创建新块
62
- 'max_characters': 1024, # 最大字符数
63
- 'overlap': 0 # 重叠字符数
64
- }
65
- },
66
- {
67
- 'type': 'embed',
68
- 'config': {
69
- 'provider': 'qwen', # 向量化供应商: 'qwen'
70
- 'model_name': 'text-embedding-v3' # 模型名称: 'text-embedding-v3' | 'text-embedding-v4'
71
- }
72
- }
73
- ]
74
- }
75
-
76
- pipeline = create_pipeline_from_config(config)
77
- pipeline.run()
78
-
79
-
80
- # ============================================================================
81
- # 方式 2: 手动创建组件
82
- # ============================================================================
83
-
84
- def run_with_manual_setup():
85
- """手动创建 Source、Destination 和 Pipeline"""
86
- from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
87
-
88
- # 创建 S3 数据源
89
- source = S3Source(
90
- endpoint='https://textin-minio-api.ai.intsig.net',
91
- access_key='IEQspf8C7fVcgmp3AZWl',
92
- secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
93
- bucket='textin-test',
94
- prefix='',
95
- region='us-east-1'
96
- )
97
- source = S3Source(
98
- endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
99
- access_key='LTAI5tBgsaVfkbh9rbPyuB17',
100
- secret_key='JFIIaTGiXelv7DgBYNIBSStofF0S98',
101
- bucket='textin',
102
- prefix='',
103
- region='cn-shanghai',
104
- pattern='*.png'
105
- )
106
- # source = S3Source(
107
- # endpoint='https://cos.ap-shanghai.myqcloud.com',
108
- # access_key='AKIDRnwsa4JLAl8GBspcAVcU9anlUzHLAmAJ',
109
- # secret_key='we7KJ4buxlLhogJm0zkFUUKxWu3yeDZi',
110
- # bucket='textin-1300705866',
111
- # prefix='',
112
- # region='ap-shanghai'
113
- # )
114
- # source = S3Source(
115
- # endpoint='https://tos-s3-cn-shanghai.volces.com',
116
- # access_key='AKLTMzNkZjk1OGM3MzBjNGQ1ZjhkMGQ4MThlNjBjYjZjYzA',
117
- # secret_key='TnpWaE0yRTVaamRqTmpSbU5EY3pObUZrTTJVNE5qUm1NR0ppWkRrMFlqVQ==',
118
- # bucket='textin',
119
- # prefix='',
120
- # region='cn-shanghai'
121
- # )
122
- # source = S3Source(
123
- # endpoint='https://obs.cn-east-3.myhuaweicloud.com',
124
- # access_key='HPUAL646UCQ1YAT7JMWY',
125
- # secret_key='z9cm95UXCw0R4J3AEig9siqGpZNbwDYz8PVoBGDI',
126
- # bucket='textin',
127
- # prefix='',
128
- # region='cn-east-3'
129
- # )
130
- # source = S3Source(
131
- # endpoint='https://s3.us-east-1.amazonaws.com',
132
- # access_key='AKIA6QUE3TVZADUWA4PO',
133
- # secret_key='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',
134
- # bucket='textin-xparse',
135
- # prefix='',
136
- # region='us-east-1'
137
- # )
138
- # source = S3Source(
139
- # endpoint='http://127.0.0.1:9000',
140
- # access_key='ldvOkKVZrsHW8ruqhwVG',
141
- # secret_key='sH665Q2DKgQxyLOpObXTb088SD2hvP0Rtg1dGTiT',
142
- # bucket='textin',
143
- # prefix='',
144
- # region='us-east-1'
145
- # )
146
- # source = SmbSource(
147
- # host='internal-storage.intsig.net',
148
- # share_name='ke_wang',
149
- # username='ke_wang',
150
- # password='Hhxxblj!4',
151
- # domain='INTSIG.COM'
152
- # )
153
- # source = FtpSource(
154
- # host='127.0.0.1',
155
- # port=21,
156
- # username='', # 用户名,按照实际填写
157
- # password='' # 密码,按照实际填写
158
- # )
159
- # source = LocalSource(
160
- # directory='/Users/ke_wang/Documents/doc',
161
- # pattern='*.pdf' # 支持通配符: *.pdf, *.docx, **/*.txt
162
- # )
163
-
164
- # 创建 Milvus 目的地
165
- destination = MilvusDestination(
166
- db_path='./milvus_pipeline1.db',
167
- collection_name='pipeline_collection',
168
- dimension=1024
169
- )
170
-
171
- # destination = S3Destination(
172
- # endpoint='https://cos.ap-shanghai.myqcloud.com',
173
- # access_key='AKIDRnwsa4JLAl8GBspcAVcU9anlUzHLAmAJ',
174
- # secret_key='we7KJ4buxlLhogJm0zkFUUKxWu3yeDZi',
175
- # bucket='textin-1300705866',
176
- # prefix='result',
177
- # region='ap-shanghai'
178
- # )
179
-
180
- # 使用新的 stages 格式创建配置
181
- stages = [
182
- Stage(
183
- type='parse',
184
- config=ParseConfig(provider='textin')
185
- ),
186
- Stage(
187
- type='chunk',
188
- config=ChunkConfig(
189
- strategy='by_title', # 按标题分块
190
- include_orig_elements=False,
191
- new_after_n_chars=512,
192
- max_characters=1024,
193
- overlap=50 # 块之间重叠 50 字符
194
- )
195
- )
196
- # 如果需要 embed,取消下面的注释
197
- # Stage(
198
- # type='embed',
199
- # config=EmbedConfig(
200
- # provider='qwen',
201
- # model_name='text-embedding-v3'
202
- # )
203
- # )
204
- ]
205
-
206
- # 创建 Pipeline
207
- pipeline = Pipeline(
208
- source=source,
209
- destination=destination,
210
- api_base_url='https://textin-api-go-pre.ai.intsig.net/api/xparse',
211
- api_headers=API_HEADERS,
212
- stages=stages
213
- )
214
-
215
- # 运行
216
- pipeline.run()
217
-
218
-
219
- # ============================================================================
220
- # 方式 3: 本地测试(本地文件 -> 本地输出)
221
- # ============================================================================
222
-
223
- def run_local_test():
224
- """使用本地文件进行测试"""
225
-
226
- config = {
227
- 'source': {
228
- 'type': 'local',
229
- 'directory': '/Users/ke_wang/Documents/doc',
230
- 'pattern': '*.pdf'
231
- },
232
- 'destination': {
233
- 'type': 's3',
234
- 'endpoint': 'https://textin-minio-api.ai.intsig.net',
235
- 'access_key': 'IEQspf8C7fVcgmp3AZWl',
236
- 'secret_key': 'kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
237
- 'bucket': 'textin-test',
238
- 'prefix': '',
239
- 'region': 'us-east-1'
240
- },
241
- 'api_base_url': 'https://api.textin.com/api/xparse',
242
- 'api_headers': API_HEADERS,
243
- # Stages 配置
244
- 'stages': [
245
- {
246
- 'type': 'parse',
247
- 'config': {
248
- 'provider': 'textin'
249
- }
250
- },
251
- {
252
- 'type': 'embed',
253
- 'config': {
254
- 'provider': 'qwen',
255
- 'model_name': 'text-embedding-v3'
256
- }
257
- }
258
- ]
259
- }
260
-
261
- pipeline = create_pipeline_from_config(config)
262
- pipeline.run()
263
-
264
-
265
- # ============================================================================
266
- # 方式 4: 处理单个文件
267
- # ============================================================================
268
-
269
- def run_single_file():
270
- """只处理单个文件"""
271
- from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
272
-
273
- # 创建 pipeline
274
- source = LocalSource(directory='/Users/ke_wang/Documents/doc', pattern='*.pdf')
275
- destination = LocalDestination(output_dir='./output')
276
-
277
- # 使用新的 stages 格式创建配置
278
- stages = [
279
- Stage(
280
- type='parse',
281
- config=ParseConfig(provider='textin')
282
- ),
283
- Stage(
284
- type='chunk',
285
- config=ChunkConfig(
286
- strategy='by_page', # 按页面分块
287
- max_characters=2048, # 增大块大小
288
- overlap=100
289
- )
290
- ),
291
- Stage(
292
- type='embed',
293
- config=EmbedConfig(
294
- provider='qwen',
295
- model_name='text-embedding-v4' # 使用更高精度的模型
296
- )
297
- )
298
- ]
299
-
300
- pipeline = Pipeline(
301
- source=source,
302
- destination=destination,
303
- api_base_url='https://api.textin.com/api/xparse',
304
- api_headers=API_HEADERS,
305
- stages=stages
306
- )
307
-
308
- # 只处理指定文件
309
- file_path = '4e3250f00210431fb29ca0c808.pdf' # 相对于 source directory 的路径
310
- success = pipeline.process_file(file_path)
311
-
312
- if success:
313
- print(f"\n✅ 文件 {file_path} 处理成功!")
314
- else:
315
- print(f"\n❌ 文件 {file_path} 处理失败!")
316
-
317
-
318
- # ============================================================================
319
- # 方式 5: 自定义处理流程
320
- # ============================================================================
321
-
322
- def run_custom_flow():
323
- """自定义处理流程,手动控制文件处理"""
324
- from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
325
-
326
- # 创建组件
327
- source = S3Source(
328
- endpoint='https://textin-minio-api.ai.intsig.net',
329
- access_key='IEQspf8C7fVcgmp3AZWl',
330
- secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
331
- bucket='textin-test',
332
- prefix='',
333
- region='us-east-1',
334
- pattern='*.pdf'
335
- )
336
-
337
- destination = MilvusDestination(
338
- db_path='./milvus_custom.db',
339
- collection_name='custom_collection',
340
- dimension=1024
341
- )
342
-
343
- # 使用新的 stages 格式创建配置
344
- stages = [
345
- Stage(
346
- type='parse',
347
- config=ParseConfig(provider='textin')
348
- ),
349
- Stage(
350
- type='chunk',
351
- config=ChunkConfig(
352
- strategy='by_title',
353
- include_orig_elements=True,
354
- max_characters=1536,
355
- overlap=80
356
- )
357
- ),
358
- Stage(
359
- type='embed',
360
- config=EmbedConfig(
361
- provider='qwen',
362
- model_name='text-embedding-v4'
363
- )
364
- )
365
- ]
366
-
367
- pipeline = Pipeline(
368
- source=source,
369
- destination=destination,
370
- api_base_url='https://api.textin.com/api/xparse',
371
- api_headers=API_HEADERS,
372
- stages=stages
373
- )
374
-
375
- # 手动控制文件处理
376
- files = source.list_files()
377
-
378
- for file_path in files[:2]: # 只处理前2个文件
379
- print(f"\n处理: {file_path}")
380
- file_bytes, data_source = source.read_file(file_path)
381
- data_source['date_processed'] = datetime.now(timezone.utc).timestamp()
382
-
383
- # 使用 pipeline 接口处理
384
- result = pipeline.process_with_pipeline(file_bytes, file_path, data_source)
385
-
386
- if result:
387
- embedded, stats = result
388
- print(f" - 原始元素: {stats.original_elements}")
389
- print(f" - 分块后: {stats.chunked_elements}")
390
- print(f" - 向量化: {stats.embedded_elements}")
391
-
392
- # 写入
393
- metadata = {
394
- 'file_name': file_path,
395
- 'data_source': data_source,
396
- 'stats': {
397
- 'original_elements': stats.original_elements,
398
- 'chunked_elements': stats.chunked_elements,
399
- 'embedded_elements': stats.embedded_elements
400
- }
401
- }
402
- destination.write(embedded, metadata)
403
- print(f"✓ 完成: {file_path}")
404
- else:
405
- print(f"✗ 失败: {file_path}")
406
-
407
-
408
- # ============================================================================
409
- # 主函数
410
- # ============================================================================
411
-
412
- def main():
413
- """主函数 - 选择运行方式"""
414
-
415
- print("=" * 60)
416
- print("Pipeline 运行脚本")
417
- print("=" * 60)
418
- print("\n请选择运行方式:")
419
- print("1. 使用配置字典 (S3 -> Milvus) [基础配置]")
420
- print("2. 手动创建组件 (S3 -> Milvus) [按标题分块 + 自定义配置]")
421
- print("3. 本地测试 (本地文件 -> 本地输出) [基础配置]")
422
- print("4. 处理单个文件 [按页面分块 + V4模型]")
423
- print("5. 自定义处理流程 [手动控制 + 统计信息]")
424
- print()
425
-
426
- try:
427
- choice = input("请输入选项 (1-5) [默认: 1]: ").strip() or '1'
428
-
429
- if choice == '1':
430
- print("\n使用配置字典运行...")
431
- run_with_config()
432
- elif choice == '2':
433
- print("\n手动创建组件运行...")
434
- run_with_manual_setup()
435
- elif choice == '3':
436
- print("\n本地测试模式...")
437
- run_local_test()
438
- elif choice == '4':
439
- print("\n处理单个文件...")
440
- run_single_file()
441
- elif choice == '5':
442
- print("\n自定义处理流程...")
443
- run_custom_flow()
444
- else:
445
- print("无效的选项,使用默认方式运行...")
446
- run_with_config()
447
-
448
- except KeyboardInterrupt:
449
- print("\n\n用户中断执行")
450
- except Exception as e:
451
- print(f"\n程序异常: {str(e)}")
452
- import traceback
453
- traceback.print_exc()
454
-
455
-
456
- if __name__ == '__main__':
457
- main()
458
-
@@ -1,13 +0,0 @@
1
- example/run_pipeline.py,sha256=Hfc27wsGoif7FzIFiWHdcnxIjtGCq2L5VeNF4X_d0Q0,16643
2
- example/run_pipeline_test.py,sha256=pxsNiq_LmP6M4R7tTuja0u-Lu7fW-wIBU1uBf0-agQI,14845
3
- xparse_client/__init__.py,sha256=C2XLxkCoONl6_B1FmDhWRw84TqOL4pZF20br-K26SSY,1721
4
- xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
- xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
6
- xparse_client/pipeline/destinations.py,sha256=QKlNGcpXIqkZS3rlBlhLDoRqIWA21Jgn3GiGhhfE8Rc,20921
7
- xparse_client/pipeline/pipeline.py,sha256=ZspagUjiL5wnzGJq6A7riOU8qGXJMtg1fqPm9H09mkk,27272
8
- xparse_client/pipeline/sources.py,sha256=pzJ5FjP-kZi-6Cphhm9rOPXETmHw5Qpf7EaxrQPgSxs,22285
9
- xparse_client-0.2.14.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
- xparse_client-0.2.14.dist-info/METADATA,sha256=DWUEbC_Ez38V_2al6TQEuAkEEtOG70kg3ye3YL8UJ8o,28824
11
- xparse_client-0.2.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
- xparse_client-0.2.14.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
- xparse_client-0.2.14.dist-info/RECORD,,