xparse-client 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/run_pipeline.py +4 -2
- xparse_client/pipeline/pipeline.py +133 -1
- xparse_client/pipeline/sources.py +0 -1
- {xparse_client-0.2.6.dist-info → xparse_client-0.2.8.dist-info}/METADATA +301 -242
- xparse_client-0.2.8.dist-info/RECORD +13 -0
- xparse_client-0.2.6.dist-info/RECORD +0 -13
- {xparse_client-0.2.6.dist-info → xparse_client-0.2.8.dist-info}/WHEEL +0 -0
- {xparse_client-0.2.6.dist-info → xparse_client-0.2.8.dist-info}/licenses/LICENSE +0 -0
- {xparse_client-0.2.6.dist-info → xparse_client-0.2.8.dist-info}/top_level.txt +0 -0
example/run_pipeline.py
CHANGED
|
@@ -5,6 +5,7 @@ Pipeline 运行脚本
|
|
|
5
5
|
快速启动和运行 Pipeline 的示例
|
|
6
6
|
'''
|
|
7
7
|
|
|
8
|
+
import json
|
|
8
9
|
from datetime import datetime, timezone
|
|
9
10
|
from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
|
|
10
11
|
|
|
@@ -175,7 +176,7 @@ def run_with_manual_setup():
|
|
|
175
176
|
|
|
176
177
|
destination = MilvusDestination(
|
|
177
178
|
db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
|
|
178
|
-
collection_name='
|
|
179
|
+
collection_name='textin_test_3_copy', # 数据库collection名称
|
|
179
180
|
dimension=1024, # 向量维度,需与 embed API 返回一致
|
|
180
181
|
api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46' # Zilliz Cloud API Key
|
|
181
182
|
)
|
|
@@ -235,7 +236,8 @@ def run_with_manual_setup():
|
|
|
235
236
|
)
|
|
236
237
|
|
|
237
238
|
# 运行
|
|
238
|
-
pipeline.
|
|
239
|
+
config = pipeline.get_config()
|
|
240
|
+
print(json.dumps(config, ensure_ascii=False, indent=2))
|
|
239
241
|
|
|
240
242
|
|
|
241
243
|
# ============================================================================
|
|
@@ -79,6 +79,139 @@ class Pipeline:
|
|
|
79
79
|
print(f" Pipeline Config: 中间结果保存已启用")
|
|
80
80
|
print("=" * 60)
|
|
81
81
|
|
|
82
|
+
def get_config(self) -> Dict[str, Any]:
|
|
83
|
+
"""获取 Pipeline 的完整配置信息,返回字典格式(与 create_pipeline_from_config 的入参格式一致)"""
|
|
84
|
+
config = {}
|
|
85
|
+
|
|
86
|
+
# Source 配置
|
|
87
|
+
source_type = type(self.source).__name__.replace('Source', '').lower()
|
|
88
|
+
config['source'] = {'type': source_type}
|
|
89
|
+
|
|
90
|
+
if isinstance(self.source, S3Source):
|
|
91
|
+
config['source'].update({
|
|
92
|
+
'endpoint': self.source.endpoint,
|
|
93
|
+
'bucket': self.source.bucket,
|
|
94
|
+
'prefix': self.source.prefix,
|
|
95
|
+
'pattern': self.source.pattern,
|
|
96
|
+
'recursive': self.source.recursive
|
|
97
|
+
})
|
|
98
|
+
# access_key 和 secret_key 不在对象中保存,无法恢复
|
|
99
|
+
# region 也不在对象中保存,使用默认值
|
|
100
|
+
config['source']['region'] = 'us-east-1' # 默认值
|
|
101
|
+
elif isinstance(self.source, LocalSource):
|
|
102
|
+
config['source'].update({
|
|
103
|
+
'directory': str(self.source.directory),
|
|
104
|
+
'pattern': self.source.pattern,
|
|
105
|
+
'recursive': self.source.recursive
|
|
106
|
+
})
|
|
107
|
+
elif isinstance(self.source, FtpSource):
|
|
108
|
+
config['source'].update({
|
|
109
|
+
'host': self.source.host,
|
|
110
|
+
'port': self.source.port,
|
|
111
|
+
'username': self.source.username,
|
|
112
|
+
'pattern': self.source.pattern,
|
|
113
|
+
'recursive': self.source.recursive
|
|
114
|
+
})
|
|
115
|
+
# password 不在对象中保存,无法恢复
|
|
116
|
+
elif isinstance(self.source, SmbSource):
|
|
117
|
+
config['source'].update({
|
|
118
|
+
'host': self.source.host,
|
|
119
|
+
'share_name': self.source.share_name,
|
|
120
|
+
'username': self.source.username,
|
|
121
|
+
'domain': self.source.domain,
|
|
122
|
+
'port': self.source.port,
|
|
123
|
+
'path': self.source.path,
|
|
124
|
+
'pattern': self.source.pattern,
|
|
125
|
+
'recursive': self.source.recursive
|
|
126
|
+
})
|
|
127
|
+
# password 不在对象中保存,无法恢复
|
|
128
|
+
|
|
129
|
+
# Destination 配置
|
|
130
|
+
dest_type = type(self.destination).__name__.replace('Destination', '').lower()
|
|
131
|
+
# MilvusDestination 和 Zilliz 都使用 'milvus' 或 'zilliz' 类型
|
|
132
|
+
if dest_type == 'milvus':
|
|
133
|
+
# 判断是本地 Milvus 还是 Zilliz(通过 db_path 判断)
|
|
134
|
+
if self.destination.db_path.startswith('http'):
|
|
135
|
+
dest_type = 'zilliz'
|
|
136
|
+
else:
|
|
137
|
+
dest_type = 'milvus'
|
|
138
|
+
|
|
139
|
+
config['destination'] = {'type': dest_type}
|
|
140
|
+
|
|
141
|
+
if isinstance(self.destination, MilvusDestination):
|
|
142
|
+
config['destination'].update({
|
|
143
|
+
'db_path': self.destination.db_path,
|
|
144
|
+
'collection_name': self.destination.collection_name,
|
|
145
|
+
'dimension': self.destination.dimension
|
|
146
|
+
})
|
|
147
|
+
# api_key 和 token 不在对象中保存,无法恢复
|
|
148
|
+
elif isinstance(self.destination, LocalDestination):
|
|
149
|
+
config['destination'].update({
|
|
150
|
+
'output_dir': str(self.destination.output_dir)
|
|
151
|
+
})
|
|
152
|
+
elif isinstance(self.destination, S3Destination):
|
|
153
|
+
config['destination'].update({
|
|
154
|
+
'endpoint': self.destination.endpoint,
|
|
155
|
+
'bucket': self.destination.bucket,
|
|
156
|
+
'prefix': self.destination.prefix
|
|
157
|
+
})
|
|
158
|
+
# access_key, secret_key, region 不在对象中保存,无法恢复
|
|
159
|
+
config['destination']['region'] = 'us-east-1' # 默认值
|
|
160
|
+
|
|
161
|
+
# API 配置
|
|
162
|
+
config['api_base_url'] = self.api_base_url
|
|
163
|
+
config['api_headers'] = {}
|
|
164
|
+
for key, value in self.api_headers.items():
|
|
165
|
+
config['api_headers'][key] = value
|
|
166
|
+
|
|
167
|
+
# Stages 配置
|
|
168
|
+
config['stages'] = []
|
|
169
|
+
for stage in self.stages:
|
|
170
|
+
stage_dict = {
|
|
171
|
+
'type': stage.type,
|
|
172
|
+
'config': {}
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
if isinstance(stage.config, ParseConfig):
|
|
176
|
+
stage_dict['config'] = stage.config.to_dict()
|
|
177
|
+
elif isinstance(stage.config, ChunkConfig):
|
|
178
|
+
stage_dict['config'] = stage.config.to_dict()
|
|
179
|
+
elif isinstance(stage.config, EmbedConfig):
|
|
180
|
+
stage_dict['config'] = stage.config.to_dict()
|
|
181
|
+
else:
|
|
182
|
+
# 如果 config 是字典或其他类型,尝试转换
|
|
183
|
+
if isinstance(stage.config, dict):
|
|
184
|
+
stage_dict['config'] = stage.config
|
|
185
|
+
else:
|
|
186
|
+
stage_dict['config'] = str(stage.config)
|
|
187
|
+
|
|
188
|
+
config['stages'].append(stage_dict)
|
|
189
|
+
|
|
190
|
+
# Pipeline Config
|
|
191
|
+
if self.pipeline_config.include_intermediate_results:
|
|
192
|
+
config['pipeline_config'] = {
|
|
193
|
+
'include_intermediate_results': True,
|
|
194
|
+
'intermediate_results_destination': {}
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
inter_dest = self.pipeline_config.intermediate_results_destination
|
|
198
|
+
if inter_dest:
|
|
199
|
+
inter_dest_type = type(inter_dest).__name__.replace('Destination', '').lower()
|
|
200
|
+
config['pipeline_config']['intermediate_results_destination']['type'] = inter_dest_type
|
|
201
|
+
|
|
202
|
+
if isinstance(inter_dest, LocalDestination):
|
|
203
|
+
config['pipeline_config']['intermediate_results_destination']['output_dir'] = str(inter_dest.output_dir)
|
|
204
|
+
elif isinstance(inter_dest, S3Destination):
|
|
205
|
+
config['pipeline_config']['intermediate_results_destination'].update({
|
|
206
|
+
'endpoint': inter_dest.endpoint,
|
|
207
|
+
'bucket': inter_dest.bucket,
|
|
208
|
+
'prefix': inter_dest.prefix
|
|
209
|
+
})
|
|
210
|
+
# access_key, secret_key, region 不在对象中保存,无法恢复
|
|
211
|
+
config['pipeline_config']['intermediate_results_destination']['region'] = 'us-east-1' # 默认值
|
|
212
|
+
|
|
213
|
+
return config
|
|
214
|
+
|
|
82
215
|
def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
83
216
|
url = f"{self.api_base_url}/pipeline"
|
|
84
217
|
max_retries = 3
|
|
@@ -228,7 +361,6 @@ class Pipeline:
|
|
|
228
361
|
print(f" → 读取文件...")
|
|
229
362
|
file_bytes, data_source = self.source.read_file(file_path)
|
|
230
363
|
data_source = data_source or {}
|
|
231
|
-
print(f" data_source: {data_source}")
|
|
232
364
|
# 转换为毫秒时间戳字符串
|
|
233
365
|
timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
|
|
234
366
|
data_source['date_processed'] = str(timestamp_ms)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xparse-client
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.8
|
|
4
4
|
Summary: 面向Agent和RAG的新一代文档处理 AI Infra
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
|
|
@@ -517,298 +517,357 @@ Parse 参数中有必填项`Provider`,表示文档解析服务的供应商,
|
|
|
517
517
|
|
|
518
518
|
## 💡 使用示例
|
|
519
519
|
|
|
520
|
-
### 示例 1:
|
|
520
|
+
### 示例 1: 手动创建 Pipeline(推荐)
|
|
521
521
|
|
|
522
522
|
```python
|
|
523
|
-
from xparse_client import
|
|
523
|
+
from xparse_client import (
|
|
524
|
+
Pipeline, S3Source, MilvusDestination,
|
|
525
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
526
|
+
)
|
|
524
527
|
|
|
525
|
-
#
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
'
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
'
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
528
|
+
# 创建数据源
|
|
529
|
+
source = S3Source(
|
|
530
|
+
endpoint='https://your-minio.com',
|
|
531
|
+
access_key='your-access-key',
|
|
532
|
+
secret_key='your-secret-key',
|
|
533
|
+
bucket='documents',
|
|
534
|
+
prefix='pdfs/',
|
|
535
|
+
region='us-east-1',
|
|
536
|
+
pattern='*.pdf', # 仅处理匹配的文件
|
|
537
|
+
recursive=False # 不递归子目录
|
|
538
|
+
)
|
|
539
|
+
|
|
540
|
+
# 创建目的地
|
|
541
|
+
destination = MilvusDestination(
|
|
542
|
+
db_path='./vectors.db',
|
|
543
|
+
collection_name='documents',
|
|
544
|
+
dimension=1024
|
|
545
|
+
)
|
|
546
|
+
|
|
547
|
+
# 配置处理阶段
|
|
548
|
+
stages = [
|
|
549
|
+
Stage(
|
|
550
|
+
type='parse',
|
|
551
|
+
config=ParseConfig(provider='textin')
|
|
552
|
+
),
|
|
553
|
+
Stage(
|
|
554
|
+
type='chunk',
|
|
555
|
+
config=ChunkConfig(
|
|
556
|
+
strategy='by_title', # 按标题分块
|
|
557
|
+
include_orig_elements=False,
|
|
558
|
+
new_after_n_chars=512,
|
|
559
|
+
max_characters=1024,
|
|
560
|
+
overlap=50 # 块之间重叠 50 字符
|
|
561
|
+
)
|
|
562
|
+
),
|
|
563
|
+
Stage(
|
|
564
|
+
type='embed',
|
|
565
|
+
config=EmbedConfig(
|
|
566
|
+
provider='qwen',
|
|
567
|
+
model_name='text-embedding-v3'
|
|
568
|
+
)
|
|
569
|
+
)
|
|
570
|
+
]
|
|
571
|
+
|
|
572
|
+
# 创建并运行 Pipeline
|
|
573
|
+
pipeline = Pipeline(
|
|
574
|
+
source=source,
|
|
575
|
+
destination=destination,
|
|
576
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
577
|
+
api_headers={
|
|
550
578
|
'x-ti-app-id': 'your-app-id',
|
|
551
579
|
'x-ti-secret-code': 'your-secret-code'
|
|
552
580
|
},
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
'stages': [
|
|
556
|
-
{
|
|
557
|
-
'type': 'parse',
|
|
558
|
-
'config': {
|
|
559
|
-
'provider': 'textin'
|
|
560
|
-
}
|
|
561
|
-
},
|
|
562
|
-
{
|
|
563
|
-
'type': 'chunk',
|
|
564
|
-
'config': {
|
|
565
|
-
'strategy': 'by_title', # 按标题分块
|
|
566
|
-
'include_orig_elements': False,
|
|
567
|
-
'new_after_n_chars': 512,
|
|
568
|
-
'max_characters': 1024,
|
|
569
|
-
'overlap': 50 # 块之间重叠 50 字符
|
|
570
|
-
}
|
|
571
|
-
},
|
|
572
|
-
{
|
|
573
|
-
'type': 'embed',
|
|
574
|
-
'config': {
|
|
575
|
-
'provider': 'qwen',
|
|
576
|
-
'model_name': 'text-embedding-v3'
|
|
577
|
-
}
|
|
578
|
-
}
|
|
579
|
-
]
|
|
580
|
-
}
|
|
581
|
+
stages=stages
|
|
582
|
+
)
|
|
581
583
|
|
|
582
|
-
# 使用配置创建并运行 pipeline
|
|
583
|
-
pipeline = create_pipeline_from_config(config)
|
|
584
584
|
pipeline.run()
|
|
585
585
|
```
|
|
586
586
|
|
|
587
|
-
### 示例
|
|
587
|
+
### 示例 1.1: 输出配置字典
|
|
588
|
+
|
|
589
|
+
手动创建 Pipeline 后,可以使用 `get_config()` 方法获取配置字典:
|
|
588
590
|
|
|
589
591
|
```python
|
|
590
|
-
from
|
|
591
|
-
|
|
592
|
+
from xparse_client import (
|
|
593
|
+
Pipeline, LocalSource, LocalDestination,
|
|
594
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
595
|
+
)
|
|
592
596
|
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
597
|
+
# 手动创建 Pipeline
|
|
598
|
+
source = LocalSource(
|
|
599
|
+
directory='./test_files',
|
|
600
|
+
pattern='*.pdf',
|
|
601
|
+
recursive=False
|
|
602
|
+
)
|
|
603
|
+
|
|
604
|
+
destination = LocalDestination(output_dir='./test_output')
|
|
605
|
+
|
|
606
|
+
stages = [
|
|
607
|
+
Stage(type='parse', config=ParseConfig(provider='textin')),
|
|
608
|
+
Stage(type='chunk', config=ChunkConfig(strategy='basic', max_characters=1024)),
|
|
609
|
+
Stage(type='embed', config=EmbedConfig(provider='qwen', model_name='text-embedding-v3'))
|
|
610
|
+
]
|
|
611
|
+
|
|
612
|
+
pipeline = Pipeline(
|
|
613
|
+
source=source,
|
|
614
|
+
destination=destination,
|
|
615
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
616
|
+
api_headers={
|
|
617
|
+
'x-ti-app-id': 'your-app-id',
|
|
618
|
+
'x-ti-secret-code': 'your-secret-code'
|
|
602
619
|
},
|
|
603
|
-
|
|
604
|
-
|
|
620
|
+
stages=stages
|
|
621
|
+
)
|
|
622
|
+
|
|
623
|
+
# 获取配置字典(格式与 create_pipeline_from_config 的入参一致)
|
|
624
|
+
config_dict = pipeline.get_config()
|
|
625
|
+
|
|
626
|
+
# 可以保存为 JSON 文件
|
|
627
|
+
import json
|
|
628
|
+
with open('pipeline_config.json', 'w', encoding='utf-8') as f:
|
|
629
|
+
json.dump(config_dict, f, indent=2, ensure_ascii=False)
|
|
630
|
+
|
|
631
|
+
# 或者用于创建新的 Pipeline(需要补充敏感信息如 access_key, secret_key 等)
|
|
632
|
+
# from xparse_client import create_pipeline_from_config
|
|
633
|
+
# new_pipeline = create_pipeline_from_config(config_dict)
|
|
634
|
+
```
|
|
635
|
+
|
|
636
|
+
### 示例 2: 本地到本地(测试)
|
|
637
|
+
|
|
638
|
+
```python
|
|
639
|
+
from xparse_client import (
|
|
640
|
+
Pipeline, LocalSource, LocalDestination,
|
|
641
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
642
|
+
)
|
|
643
|
+
|
|
644
|
+
# 创建本地数据源
|
|
645
|
+
source = LocalSource(
|
|
646
|
+
directory='./test_files',
|
|
647
|
+
pattern='*.pdf',
|
|
648
|
+
recursive=False
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
# 创建本地输出目的地
|
|
652
|
+
destination = LocalDestination(output_dir='./test_output')
|
|
653
|
+
|
|
654
|
+
# 配置处理阶段
|
|
655
|
+
stages = [
|
|
656
|
+
Stage(
|
|
657
|
+
type='parse',
|
|
658
|
+
config=ParseConfig(provider='textin')
|
|
659
|
+
),
|
|
660
|
+
Stage(
|
|
661
|
+
type='chunk',
|
|
662
|
+
config=ChunkConfig(
|
|
663
|
+
strategy='basic',
|
|
664
|
+
max_characters=1024
|
|
665
|
+
)
|
|
666
|
+
),
|
|
667
|
+
Stage(
|
|
668
|
+
type='embed',
|
|
669
|
+
config=EmbedConfig(
|
|
670
|
+
provider='qwen',
|
|
671
|
+
model_name='text-embedding-v3'
|
|
672
|
+
)
|
|
673
|
+
)
|
|
674
|
+
]
|
|
675
|
+
|
|
676
|
+
# 创建并运行 Pipeline
|
|
677
|
+
pipeline = Pipeline(
|
|
678
|
+
source=source,
|
|
679
|
+
destination=destination,
|
|
680
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
681
|
+
api_headers={
|
|
605
682
|
'x-ti-app-id': 'your-app-id',
|
|
606
683
|
'x-ti-secret-code': 'your-secret-code'
|
|
607
684
|
},
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
{
|
|
611
|
-
'type': 'parse',
|
|
612
|
-
'config': {
|
|
613
|
-
'provider': 'textin'
|
|
614
|
-
}
|
|
615
|
-
},
|
|
616
|
-
{
|
|
617
|
-
'type': 'chunk',
|
|
618
|
-
'config': {
|
|
619
|
-
'strategy': 'basic',
|
|
620
|
-
'max_characters': 1024
|
|
621
|
-
}
|
|
622
|
-
},
|
|
623
|
-
{
|
|
624
|
-
'type': 'embed',
|
|
625
|
-
'config': {
|
|
626
|
-
'provider': 'qwen',
|
|
627
|
-
'model_name': 'text-embedding-v3'
|
|
628
|
-
}
|
|
629
|
-
}
|
|
630
|
-
]
|
|
631
|
-
}
|
|
685
|
+
stages=stages
|
|
686
|
+
)
|
|
632
687
|
|
|
633
|
-
pipeline = create_pipeline_from_config(config)
|
|
634
688
|
pipeline.run()
|
|
635
689
|
```
|
|
636
690
|
|
|
637
691
|
### 示例 3: 不同分块策略的配置
|
|
638
692
|
|
|
639
693
|
```python
|
|
640
|
-
from xparse_client import
|
|
694
|
+
from xparse_client import (
|
|
695
|
+
Pipeline, S3Source, MilvusDestination,
|
|
696
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
697
|
+
)
|
|
698
|
+
|
|
699
|
+
# 创建数据源和目的地
|
|
700
|
+
source = S3Source(...)
|
|
701
|
+
destination = MilvusDestination(...)
|
|
641
702
|
|
|
642
703
|
# 配置 1:按页面分块(适合 PDF 文档)
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
'
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
'type': 'embed',
|
|
665
|
-
'config': {
|
|
666
|
-
'provider': 'qwen',
|
|
667
|
-
'model_name': 'text-embedding-v4' # 使用更高精度的模型
|
|
668
|
-
}
|
|
669
|
-
}
|
|
670
|
-
]
|
|
671
|
-
}
|
|
704
|
+
stages_by_page = [
|
|
705
|
+
Stage(
|
|
706
|
+
type='parse',
|
|
707
|
+
config=ParseConfig(provider='textin')
|
|
708
|
+
),
|
|
709
|
+
Stage(
|
|
710
|
+
type='chunk',
|
|
711
|
+
config=ChunkConfig(
|
|
712
|
+
strategy='by_page', # 按页面分块
|
|
713
|
+
max_characters=2048, # 增大块大小
|
|
714
|
+
overlap=100 # 页面间重叠 100 字符
|
|
715
|
+
)
|
|
716
|
+
),
|
|
717
|
+
Stage(
|
|
718
|
+
type='embed',
|
|
719
|
+
config=EmbedConfig(
|
|
720
|
+
provider='qwen',
|
|
721
|
+
model_name='text-embedding-v4' # 使用更高精度的模型
|
|
722
|
+
)
|
|
723
|
+
)
|
|
724
|
+
]
|
|
672
725
|
|
|
673
726
|
# 配置 2:按标题分块(适合结构化文档)
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
'
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
'type': 'embed',
|
|
696
|
-
'config': {
|
|
697
|
-
'provider': 'qwen',
|
|
698
|
-
'model_name': 'text-embedding-v3'
|
|
699
|
-
}
|
|
700
|
-
}
|
|
701
|
-
]
|
|
702
|
-
}
|
|
727
|
+
stages_by_title = [
|
|
728
|
+
Stage(
|
|
729
|
+
type='parse',
|
|
730
|
+
config=ParseConfig(provider='textin')
|
|
731
|
+
),
|
|
732
|
+
Stage(
|
|
733
|
+
type='chunk',
|
|
734
|
+
config=ChunkConfig(
|
|
735
|
+
strategy='by_title', # 按标题分块
|
|
736
|
+
include_orig_elements=True, # 保留原始元素信息
|
|
737
|
+
max_characters=1536
|
|
738
|
+
)
|
|
739
|
+
),
|
|
740
|
+
Stage(
|
|
741
|
+
type='embed',
|
|
742
|
+
config=EmbedConfig(
|
|
743
|
+
provider='qwen',
|
|
744
|
+
model_name='text-embedding-v3'
|
|
745
|
+
)
|
|
746
|
+
)
|
|
747
|
+
]
|
|
703
748
|
|
|
704
749
|
# 根据文档类型选择配置
|
|
705
|
-
pipeline =
|
|
750
|
+
pipeline = Pipeline(
|
|
751
|
+
source=source,
|
|
752
|
+
destination=destination,
|
|
753
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
754
|
+
api_headers={...},
|
|
755
|
+
stages=stages_by_page # 或 stages_by_title
|
|
756
|
+
)
|
|
706
757
|
pipeline.run()
|
|
707
758
|
```
|
|
708
759
|
|
|
709
760
|
### 示例 4: FTP 数据源配置
|
|
710
761
|
|
|
711
762
|
```python
|
|
712
|
-
from xparse_client import
|
|
763
|
+
from xparse_client import (
|
|
764
|
+
Pipeline, FtpSource, MilvusDestination,
|
|
765
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
766
|
+
)
|
|
713
767
|
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
'
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
768
|
+
# 创建 FTP 数据源
|
|
769
|
+
source = FtpSource(
|
|
770
|
+
host='ftp.example.com',
|
|
771
|
+
port=21,
|
|
772
|
+
username='user',
|
|
773
|
+
password='pass',
|
|
774
|
+
pattern='*.pdf',
|
|
775
|
+
recursive=False
|
|
776
|
+
)
|
|
777
|
+
|
|
778
|
+
# 创建 Milvus 目的地
|
|
779
|
+
destination = MilvusDestination(
|
|
780
|
+
db_path='./vectors.db',
|
|
781
|
+
collection_name='ftp_docs',
|
|
782
|
+
dimension=1024
|
|
783
|
+
)
|
|
784
|
+
|
|
785
|
+
# 配置处理阶段
|
|
786
|
+
stages = [
|
|
787
|
+
Stage(
|
|
788
|
+
type='parse',
|
|
789
|
+
config=ParseConfig(provider='textin')
|
|
790
|
+
),
|
|
791
|
+
Stage(
|
|
792
|
+
type='chunk',
|
|
793
|
+
config=ChunkConfig(
|
|
794
|
+
strategy='basic',
|
|
795
|
+
max_characters=1024
|
|
796
|
+
)
|
|
797
|
+
),
|
|
798
|
+
Stage(
|
|
799
|
+
type='embed',
|
|
800
|
+
config=EmbedConfig(
|
|
801
|
+
provider='qwen',
|
|
802
|
+
model_name='text-embedding-v3'
|
|
803
|
+
)
|
|
804
|
+
)
|
|
805
|
+
]
|
|
806
|
+
|
|
807
|
+
# 创建并运行 Pipeline
|
|
808
|
+
pipeline = Pipeline(
|
|
809
|
+
source=source,
|
|
810
|
+
destination=destination,
|
|
811
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
812
|
+
api_headers={
|
|
734
813
|
'x-ti-app-id': 'app-id',
|
|
735
814
|
'x-ti-secret-code': 'secret'
|
|
736
815
|
},
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
'stages': [
|
|
740
|
-
{
|
|
741
|
-
'type': 'parse',
|
|
742
|
-
'config': {
|
|
743
|
-
'provider': 'textin'
|
|
744
|
-
}
|
|
745
|
-
},
|
|
746
|
-
{
|
|
747
|
-
'type': 'chunk',
|
|
748
|
-
'config': {
|
|
749
|
-
'strategy': 'basic',
|
|
750
|
-
'max_characters': 1024
|
|
751
|
-
}
|
|
752
|
-
},
|
|
753
|
-
{
|
|
754
|
-
'type': 'embed',
|
|
755
|
-
'config': {
|
|
756
|
-
'provider': 'qwen',
|
|
757
|
-
'model_name': 'text-embedding-v3'
|
|
758
|
-
}
|
|
759
|
-
}
|
|
760
|
-
]
|
|
761
|
-
}
|
|
816
|
+
stages=stages
|
|
817
|
+
)
|
|
762
818
|
|
|
763
|
-
pipeline = create_pipeline_from_config(config)
|
|
764
819
|
pipeline.run()
|
|
765
820
|
```
|
|
766
821
|
|
|
767
822
|
### 示例 5: 获取处理统计信息
|
|
768
823
|
|
|
769
824
|
```python
|
|
770
|
-
from
|
|
825
|
+
from datetime import datetime, timezone
|
|
826
|
+
from xparse_client import (
|
|
827
|
+
Pipeline, LocalSource, LocalDestination,
|
|
828
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
829
|
+
)
|
|
771
830
|
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
831
|
+
# 创建 Pipeline
|
|
832
|
+
source = LocalSource(
|
|
833
|
+
directory='./docs',
|
|
834
|
+
pattern='*.pdf',
|
|
835
|
+
recursive=False
|
|
836
|
+
)
|
|
837
|
+
|
|
838
|
+
destination = LocalDestination(output_dir='./output')
|
|
839
|
+
|
|
840
|
+
stages = [
|
|
841
|
+
Stage(
|
|
842
|
+
type='parse',
|
|
843
|
+
config=ParseConfig(provider='textin')
|
|
844
|
+
),
|
|
845
|
+
Stage(
|
|
846
|
+
type='chunk',
|
|
847
|
+
config=ChunkConfig(
|
|
848
|
+
strategy='basic',
|
|
849
|
+
max_characters=1024
|
|
850
|
+
)
|
|
851
|
+
),
|
|
852
|
+
Stage(
|
|
853
|
+
type='embed',
|
|
854
|
+
config=EmbedConfig(
|
|
855
|
+
provider='qwen',
|
|
856
|
+
model_name='text-embedding-v3'
|
|
857
|
+
)
|
|
858
|
+
)
|
|
859
|
+
]
|
|
860
|
+
|
|
861
|
+
pipeline = Pipeline(
|
|
862
|
+
source=source,
|
|
863
|
+
destination=destination,
|
|
864
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
865
|
+
api_headers={
|
|
784
866
|
'x-ti-app-id': 'your-app-id',
|
|
785
867
|
'x-ti-secret-code': 'your-secret-code'
|
|
786
868
|
},
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
'type': 'parse',
|
|
790
|
-
'config': {
|
|
791
|
-
'provider': 'textin'
|
|
792
|
-
}
|
|
793
|
-
},
|
|
794
|
-
{
|
|
795
|
-
'type': 'chunk',
|
|
796
|
-
'config': {
|
|
797
|
-
'strategy': 'basic',
|
|
798
|
-
'max_characters': 1024
|
|
799
|
-
}
|
|
800
|
-
},
|
|
801
|
-
{
|
|
802
|
-
'type': 'embed',
|
|
803
|
-
'config': {
|
|
804
|
-
'provider': 'qwen',
|
|
805
|
-
'model_name': 'text-embedding-v3'
|
|
806
|
-
}
|
|
807
|
-
}
|
|
808
|
-
]
|
|
809
|
-
}
|
|
810
|
-
|
|
811
|
-
pipeline = create_pipeline_from_config(config)
|
|
869
|
+
stages=stages
|
|
870
|
+
)
|
|
812
871
|
|
|
813
872
|
# 处理单个文件并获取统计信息
|
|
814
873
|
file_bytes, data_source = pipeline.source.read_file('document.pdf')
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
example/run_pipeline.py,sha256=ybAWBPXcQClRk1HOMySLi9IUPIs1Qn-S5HXNLbNJHjs,15459
|
|
2
|
+
example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
|
|
3
|
+
xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
|
|
4
|
+
xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
|
|
5
|
+
xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
|
|
6
|
+
xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
|
|
7
|
+
xparse_client/pipeline/pipeline.py,sha256=IRTxN4YUJi9Wrm1G1ysGvcwsPsGh0inbquBH3nWYmAA,26477
|
|
8
|
+
xparse_client/pipeline/sources.py,sha256=UeVbWv6n0wQkIZIBBhrFCiyydQX7cvwmkoMgcf12p9g,19940
|
|
9
|
+
xparse_client-0.2.8.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
|
|
10
|
+
xparse_client-0.2.8.dist-info/METADATA,sha256=LX8TfLSbFZerGPhh16x5QK1lwrPh55CYKNLhr2kdBcY,27850
|
|
11
|
+
xparse_client-0.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
+
xparse_client-0.2.8.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
|
|
13
|
+
xparse_client-0.2.8.dist-info/RECORD,,
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
example/run_pipeline.py,sha256=ijws5q_vMmV0-bMHuFtOUMrEnxnL1LvOBCtcCD2c8zc,15366
|
|
2
|
-
example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
|
|
3
|
-
xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
|
|
4
|
-
xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
|
|
5
|
-
xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
|
|
6
|
-
xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
|
|
7
|
-
xparse_client/pipeline/pipeline.py,sha256=BEC1kf5HKn4qIiNMe5QPNOs9PJcjaE_4qRK0I9yaLSQ,20430
|
|
8
|
-
xparse_client/pipeline/sources.py,sha256=0m2GFYCQTORSkbn_fWIWrOOSFes4Aso6oyVJDHfDeqc,19964
|
|
9
|
-
xparse_client-0.2.6.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
|
|
10
|
-
xparse_client-0.2.6.dist-info/METADATA,sha256=-VNzfb68RsczdETZ17vRhGzClj2X5mrwiwT5-HzK1XE,26805
|
|
11
|
-
xparse_client-0.2.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
-
xparse_client-0.2.6.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
|
|
13
|
-
xparse_client-0.2.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|