xparse-client 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/run_pipeline.py +17 -14
- example/run_pipeline_test.py +8 -8
- xparse_client/pipeline/pipeline.py +133 -0
- xparse_client/pipeline/sources.py +83 -24
- {xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/METADATA +311 -252
- xparse_client-0.2.9.dist-info/RECORD +13 -0
- xparse_client-0.2.7.dist-info/RECORD +0 -13
- {xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/WHEEL +0 -0
- {xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/licenses/LICENSE +0 -0
- {xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/top_level.txt +0 -0
example/run_pipeline.py
CHANGED
|
@@ -5,6 +5,7 @@ Pipeline 运行脚本
|
|
|
5
5
|
快速启动和运行 Pipeline 的示例
|
|
6
6
|
'''
|
|
7
7
|
|
|
8
|
+
import json
|
|
8
9
|
from datetime import datetime, timezone
|
|
9
10
|
from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
|
|
10
11
|
|
|
@@ -127,14 +128,14 @@ def run_with_manual_setup():
|
|
|
127
128
|
# prefix='',
|
|
128
129
|
# region='cn-east-3'
|
|
129
130
|
# )
|
|
130
|
-
source = S3Source(
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
)
|
|
131
|
+
# source = S3Source(
|
|
132
|
+
# endpoint='https://s3.us-east-1.amazonaws.com',
|
|
133
|
+
# access_key='AKIA6QUE3TVZADUWA4PO',
|
|
134
|
+
# secret_key='OfV4r9/u+CmlLxmiZDYwtiFSl0OsNdWLADKdPek7',
|
|
135
|
+
# bucket='textin-xparse',
|
|
136
|
+
# prefix='',
|
|
137
|
+
# region='us-east-1'
|
|
138
|
+
# )
|
|
138
139
|
# source = S3Source(
|
|
139
140
|
# endpoint='http://127.0.0.1:9000',
|
|
140
141
|
# access_key='',
|
|
@@ -157,10 +158,11 @@ def run_with_manual_setup():
|
|
|
157
158
|
# username='', # 用户名,按照实际填写
|
|
158
159
|
# password='' # 密码,按照实际填写
|
|
159
160
|
# )
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
161
|
+
source = LocalSource(
|
|
162
|
+
directory='/Users/ke_wang/Documents/doc',
|
|
163
|
+
recursive=False,
|
|
164
|
+
pattern=['*'] # 支持通配符: *.pdf, *.docx, **/*.txt
|
|
165
|
+
)
|
|
164
166
|
|
|
165
167
|
# 创建 Milvus 目的地
|
|
166
168
|
# destination = MilvusDestination(
|
|
@@ -175,7 +177,7 @@ def run_with_manual_setup():
|
|
|
175
177
|
|
|
176
178
|
destination = MilvusDestination(
|
|
177
179
|
db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
|
|
178
|
-
collection_name='
|
|
180
|
+
collection_name='textin_test_3_copy', # 数据库collection名称
|
|
179
181
|
dimension=1024, # 向量维度,需与 embed API 返回一致
|
|
180
182
|
api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46' # Zilliz Cloud API Key
|
|
181
183
|
)
|
|
@@ -193,7 +195,7 @@ def run_with_manual_setup():
|
|
|
193
195
|
stages = [
|
|
194
196
|
Stage(
|
|
195
197
|
type='parse',
|
|
196
|
-
config=ParseConfig(provider='textin
|
|
198
|
+
config=ParseConfig(provider='textin')
|
|
197
199
|
),
|
|
198
200
|
Stage(
|
|
199
201
|
type='chunk',
|
|
@@ -235,6 +237,7 @@ def run_with_manual_setup():
|
|
|
235
237
|
)
|
|
236
238
|
|
|
237
239
|
# 运行
|
|
240
|
+
# config = pipeline.get_config()
|
|
238
241
|
pipeline.run()
|
|
239
242
|
|
|
240
243
|
|
example/run_pipeline_test.py
CHANGED
|
@@ -86,14 +86,14 @@ def run_with_manual_setup():
|
|
|
86
86
|
from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
|
|
87
87
|
|
|
88
88
|
# 创建 S3 数据源
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
89
|
+
source = S3Source(
|
|
90
|
+
endpoint='https://textin-minio-api.ai.intsig.net',
|
|
91
|
+
access_key='IEQspf8C7fVcgmp3AZWl',
|
|
92
|
+
secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
|
|
93
|
+
bucket='textin-test',
|
|
94
|
+
prefix='',
|
|
95
|
+
region='us-east-1'
|
|
96
|
+
)
|
|
97
97
|
source = S3Source(
|
|
98
98
|
endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
|
|
99
99
|
access_key='LTAI5tBgsaVfkbh9rbPyuB17',
|
|
@@ -79,6 +79,139 @@ class Pipeline:
|
|
|
79
79
|
print(f" Pipeline Config: 中间结果保存已启用")
|
|
80
80
|
print("=" * 60)
|
|
81
81
|
|
|
82
|
+
def get_config(self) -> Dict[str, Any]:
    """Return this pipeline's configuration as a plain dictionary.

    The layout follows the argument format of ``create_pipeline_from_config``.
    Source/destination credentials (access keys, secret keys, passwords,
    API keys/tokens) are not emitted here, and ``region`` is always reported
    as the default ``'us-east-1'`` for S3 sources and destinations.
    ``api_headers`` is copied as-is.

    Returns:
        A dictionary with the keys ``source``, ``destination``,
        ``api_base_url``, ``api_headers``, ``stages`` and, only when
        intermediate results are enabled, ``pipeline_config``.
    """
    # --- Source ---------------------------------------------------------
    src = self.source
    source_cfg = {'type': type(src).__name__.replace('Source', '').lower()}
    if isinstance(src, S3Source):
        source_cfg.update(
            endpoint=src.endpoint,
            bucket=src.bucket,
            prefix=src.prefix,
            pattern=src.pattern,
            recursive=src.recursive,
        )
        # Credentials and region are not read back from the object;
        # the default region is reported instead.
        source_cfg['region'] = 'us-east-1'
    elif isinstance(src, LocalSource):
        source_cfg.update(
            directory=str(src.directory),
            pattern=src.pattern,
            recursive=src.recursive,
        )
    elif isinstance(src, FtpSource):
        # The password is intentionally left out.
        source_cfg.update(
            host=src.host,
            port=src.port,
            username=src.username,
            pattern=src.pattern,
            recursive=src.recursive,
        )
    elif isinstance(src, SmbSource):
        # The password is intentionally left out.
        source_cfg.update(
            host=src.host,
            share_name=src.share_name,
            username=src.username,
            domain=src.domain,
            port=src.port,
            path=src.path,
            pattern=src.pattern,
            recursive=src.recursive,
        )
    config = {'source': source_cfg}

    # --- Destination ----------------------------------------------------
    dest = self.destination
    dest_kind = type(dest).__name__.replace('Destination', '').lower()
    # A Milvus destination whose db_path starts with 'http' is reported
    # as 'zilliz'; any other db_path keeps the 'milvus' type.
    if dest_kind == 'milvus' and dest.db_path.startswith('http'):
        dest_kind = 'zilliz'
    dest_cfg = {'type': dest_kind}
    config['destination'] = dest_cfg

    if isinstance(dest, MilvusDestination):
        # api_key / token are intentionally left out.
        dest_cfg.update(
            db_path=dest.db_path,
            collection_name=dest.collection_name,
            dimension=dest.dimension,
        )
    elif isinstance(dest, LocalDestination):
        dest_cfg['output_dir'] = str(dest.output_dir)
    elif isinstance(dest, S3Destination):
        dest_cfg.update(
            endpoint=dest.endpoint,
            bucket=dest.bucket,
            prefix=dest.prefix,
        )
        # Credentials and region are not read back; report the default.
        dest_cfg['region'] = 'us-east-1'

    # --- API settings ---------------------------------------------------
    config['api_base_url'] = self.api_base_url
    config['api_headers'] = dict(self.api_headers.items())

    # --- Stages ---------------------------------------------------------
    typed_configs = (ParseConfig, ChunkConfig, EmbedConfig)
    stage_entries = []
    config['stages'] = stage_entries
    for stage in self.stages:
        entry = {'type': stage.type, 'config': {}}
        if isinstance(stage.config, typed_configs):
            entry['config'] = stage.config.to_dict()
        elif isinstance(stage.config, dict):
            entry['config'] = stage.config
        else:
            # Unknown config objects are represented by their string form.
            entry['config'] = str(stage.config)
        stage_entries.append(entry)

    # --- Pipeline config (only when intermediate results are enabled) ---
    if self.pipeline_config.include_intermediate_results:
        inter_cfg = {}
        config['pipeline_config'] = {
            'include_intermediate_results': True,
            'intermediate_results_destination': inter_cfg,
        }

        inter_dest = self.pipeline_config.intermediate_results_destination
        if inter_dest:
            inter_cfg['type'] = type(inter_dest).__name__.replace('Destination', '').lower()

            if isinstance(inter_dest, LocalDestination):
                inter_cfg['output_dir'] = str(inter_dest.output_dir)
            elif isinstance(inter_dest, S3Destination):
                inter_cfg.update(
                    endpoint=inter_dest.endpoint,
                    bucket=inter_dest.bucket,
                    prefix=inter_dest.prefix,
                )
                # Credentials and region are not read back; report the default.
                inter_cfg['region'] = 'us-east-1'

    return config
|
|
214
|
+
|
|
82
215
|
def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
83
216
|
url = f"{self.api_base_url}/pipeline"
|
|
84
217
|
max_retries = 3
|
|
@@ -11,7 +11,7 @@ from datetime import datetime, timezone
|
|
|
11
11
|
from email.utils import parsedate_to_datetime
|
|
12
12
|
from fnmatch import fnmatch
|
|
13
13
|
from pathlib import Path
|
|
14
|
-
from typing import List, Dict, Any, Tuple
|
|
14
|
+
from typing import List, Dict, Any, Tuple, Optional
|
|
15
15
|
|
|
16
16
|
from smb.SMBConnection import SMBConnection
|
|
17
17
|
from botocore.config import Config
|
|
@@ -20,6 +20,56 @@ from botocore.config import Config
|
|
|
20
20
|
logger = logging.getLogger(__name__)
|
|
21
21
|
|
|
22
22
|
|
|
23
|
+
def _normalize_wildcard_patterns(pattern: Optional[List[str]]) -> Optional[List[str]]:
|
|
24
|
+
"""规范化通配符模式列表
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
pattern: 通配符模式列表,如果为 None 或空列表则返回 None(表示匹配所有文件)
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
通配符模式列表,如果 pattern 是 None、空列表或包含 "*" 则返回 None(表示匹配所有文件)
|
|
31
|
+
"""
|
|
32
|
+
if pattern is None or not pattern:
|
|
33
|
+
return None # None 表示匹配所有文件
|
|
34
|
+
|
|
35
|
+
if not isinstance(pattern, list):
|
|
36
|
+
raise ValueError(f"pattern 类型错误: {type(pattern)}")
|
|
37
|
+
|
|
38
|
+
# 过滤空字符串并去除空格
|
|
39
|
+
normalized = [p.strip() for p in pattern if p and p.strip()]
|
|
40
|
+
|
|
41
|
+
if not normalized:
|
|
42
|
+
return None
|
|
43
|
+
|
|
44
|
+
# 如果包含 "*",直接返回 None(匹配所有文件,减少后续开销)
|
|
45
|
+
if '*' in normalized:
|
|
46
|
+
return None
|
|
47
|
+
|
|
48
|
+
return normalized
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _match_file_extension(file_path: str, wildcard_patterns: Optional[List[str]]) -> bool:
|
|
52
|
+
"""检查文件路径是否匹配通配符模式
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
file_path: 文件路径
|
|
56
|
+
wildcard_patterns: 已规范化的通配符模式列表(如 ['*.pdf', '*.docx'])
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
如果匹配返回 True,否则返回 False
|
|
60
|
+
"""
|
|
61
|
+
# 如果 wildcard_patterns 是 None 或空列表,匹配所有文件
|
|
62
|
+
if wildcard_patterns is None:
|
|
63
|
+
return True
|
|
64
|
+
|
|
65
|
+
# 检查是否匹配任何一个通配符模式
|
|
66
|
+
for wildcard_pattern in wildcard_patterns:
|
|
67
|
+
if fnmatch(file_path, wildcard_pattern):
|
|
68
|
+
return True
|
|
69
|
+
|
|
70
|
+
return False
|
|
71
|
+
|
|
72
|
+
|
|
23
73
|
def _to_millis_timestamp_string(timestamp):
|
|
24
74
|
"""将时间戳转换为毫秒时间戳字符串
|
|
25
75
|
|
|
@@ -62,11 +112,11 @@ class S3Source(Source):
|
|
|
62
112
|
"""S3/MinIO 数据源"""
|
|
63
113
|
|
|
64
114
|
def __init__(self, endpoint: str, access_key: str, secret_key: str,
|
|
65
|
-
bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: str =
|
|
115
|
+
bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: Optional[List[str]] = None, recursive: bool = False):
|
|
66
116
|
self.endpoint = endpoint
|
|
67
117
|
self.bucket = bucket
|
|
68
118
|
self.prefix = prefix
|
|
69
|
-
self.pattern = pattern
|
|
119
|
+
self.pattern = _normalize_wildcard_patterns(pattern) # 在初始化时规范化
|
|
70
120
|
self.recursive = recursive
|
|
71
121
|
|
|
72
122
|
if self.endpoint == 'https://textin-minio-api.ai.intsig.net':
|
|
@@ -108,7 +158,7 @@ class S3Source(Source):
|
|
|
108
158
|
key = obj['Key']
|
|
109
159
|
if key.endswith('/') or key.endswith('empty.tmp'):
|
|
110
160
|
continue
|
|
111
|
-
if
|
|
161
|
+
if _match_file_extension(key, self.pattern):
|
|
112
162
|
files.append(key)
|
|
113
163
|
|
|
114
164
|
# 非递归模式下,CommonPrefixes 包含子目录,我们忽略它们
|
|
@@ -156,9 +206,9 @@ class S3Source(Source):
|
|
|
156
206
|
class LocalSource(Source):
|
|
157
207
|
"""本地文件系统数据源"""
|
|
158
208
|
|
|
159
|
-
def __init__(self, directory: str, pattern: str =
|
|
209
|
+
def __init__(self, directory: str, pattern: Optional[List[str]] = None, recursive: bool = False):
|
|
160
210
|
self.directory = Path(directory)
|
|
161
|
-
self.pattern = pattern
|
|
211
|
+
self.pattern = _normalize_wildcard_patterns(pattern) # 在初始化时规范化
|
|
162
212
|
self.recursive = recursive
|
|
163
213
|
|
|
164
214
|
if not self.directory.exists():
|
|
@@ -168,20 +218,29 @@ class LocalSource(Source):
|
|
|
168
218
|
logger.info(f"本地目录: {self.directory}")
|
|
169
219
|
|
|
170
220
|
def list_files(self) -> List[str]:
    """List files under ``self.directory`` as paths relative to it.

    Sub-directories are walked only when ``self.recursive`` is true. When
    ``self.pattern`` is not ``None``, only paths accepted by
    ``_match_file_extension`` are kept.

    Returns:
        The relative paths (as strings) of the selected files.
    """
    # Choose the directory walker once: rglob descends, glob stays flat.
    walk = self.directory.rglob if self.recursive else self.directory.glob
    candidates = [
        str(entry.relative_to(self.directory))
        for entry in walk('*')
        if entry.is_file()
    ]

    if self.pattern is None:
        # No filter configured: keep everything that was found.
        files = list(candidates)
    else:
        files = [
            name for name in candidates
            if _match_file_extension(name, self.pattern)
        ]

    print(f"✓ 本地找到 {len(files)} 个文件")
    return files
|
|
187
246
|
|
|
@@ -217,12 +276,12 @@ class LocalSource(Source):
|
|
|
217
276
|
class FtpSource(Source):
|
|
218
277
|
"""FTP 数据源"""
|
|
219
278
|
|
|
220
|
-
def __init__(self, host: str, port: int, username: str, password: str, pattern: str =
|
|
279
|
+
def __init__(self, host: str, port: int, username: str, password: str, pattern: Optional[List[str]] = None, recursive: bool = False):
|
|
221
280
|
self.host = host
|
|
222
281
|
self.port = port
|
|
223
282
|
self.username = username
|
|
224
283
|
self.password = password
|
|
225
|
-
self.pattern = pattern
|
|
284
|
+
self.pattern = _normalize_wildcard_patterns(pattern) # 在初始化时规范化
|
|
226
285
|
self.recursive = recursive
|
|
227
286
|
|
|
228
287
|
self.client = ftplib.FTP()
|
|
@@ -288,12 +347,12 @@ class FtpSource(Source):
|
|
|
288
347
|
except:
|
|
289
348
|
# 不是目录,是文件
|
|
290
349
|
relative_path = full_path.lstrip('/')
|
|
291
|
-
if
|
|
350
|
+
if _match_file_extension(relative_path, self.pattern):
|
|
292
351
|
files.append(relative_path)
|
|
293
352
|
else:
|
|
294
353
|
# 是文件
|
|
295
354
|
relative_path = full_path.lstrip('/')
|
|
296
|
-
if
|
|
355
|
+
if _match_file_extension(relative_path, self.pattern):
|
|
297
356
|
files.append(relative_path)
|
|
298
357
|
|
|
299
358
|
# 恢复原始目录
|
|
@@ -325,7 +384,7 @@ class FtpSource(Source):
|
|
|
325
384
|
item_type = item_info.get('type', 'unknown')
|
|
326
385
|
# 只添加文件,排除目录
|
|
327
386
|
if item_type == 'file' or (item_type == 'unknown' and not item_info.get('type', '').startswith('dir')):
|
|
328
|
-
if
|
|
387
|
+
if _match_file_extension(item_name, self.pattern):
|
|
329
388
|
files.append(item_name)
|
|
330
389
|
except:
|
|
331
390
|
# 如果不支持 MLSD,使用 LIST 命令
|
|
@@ -341,7 +400,7 @@ class FtpSource(Source):
|
|
|
341
400
|
continue
|
|
342
401
|
is_dir = parts[0].startswith('d')
|
|
343
402
|
# 只添加文件,排除目录
|
|
344
|
-
if not is_dir and
|
|
403
|
+
if not is_dir and _match_file_extension(item_name, self.pattern):
|
|
345
404
|
files.append(item_name)
|
|
346
405
|
except:
|
|
347
406
|
# 最后回退到 nlst,通过尝试切换目录来判断是否为目录
|
|
@@ -357,7 +416,7 @@ class FtpSource(Source):
|
|
|
357
416
|
continue
|
|
358
417
|
except:
|
|
359
418
|
# 不能切换,说明是文件
|
|
360
|
-
if
|
|
419
|
+
if _match_file_extension(item_name, self.pattern):
|
|
361
420
|
files.append(item_name)
|
|
362
421
|
|
|
363
422
|
# 确保回到原始目录
|
|
@@ -405,7 +464,7 @@ class SmbSource(Source):
|
|
|
405
464
|
"""SMB/CIFS 数据源"""
|
|
406
465
|
|
|
407
466
|
def __init__(self, host: str, share_name: str, username: str, password: str,
|
|
408
|
-
domain: str = '', port: int = 445, path: str = '', pattern: str =
|
|
467
|
+
domain: str = '', port: int = 445, path: str = '', pattern: Optional[List[str]] = None, recursive: bool = False):
|
|
409
468
|
self.host = host
|
|
410
469
|
self.share_name = share_name
|
|
411
470
|
self.username = username
|
|
@@ -413,7 +472,7 @@ class SmbSource(Source):
|
|
|
413
472
|
self.domain = domain
|
|
414
473
|
self.port = port
|
|
415
474
|
self.path = path.strip('/').strip('\\') if path else ''
|
|
416
|
-
self.pattern = pattern
|
|
475
|
+
self.pattern = _normalize_wildcard_patterns(pattern) # 在初始化时规范化
|
|
417
476
|
self.recursive = recursive
|
|
418
477
|
|
|
419
478
|
self.conn = SMBConnection(
|
|
@@ -451,7 +510,7 @@ class SmbSource(Source):
|
|
|
451
510
|
_list_recursive(conn, share, item_path)
|
|
452
511
|
# 非递归模式:忽略子目录
|
|
453
512
|
else:
|
|
454
|
-
if
|
|
513
|
+
if _match_file_extension(relative_path, self.pattern):
|
|
455
514
|
files.append(relative_path)
|
|
456
515
|
except Exception as e:
|
|
457
516
|
logger.warning(f"列出路径失败 {current_path}: {str(e)}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xparse-client
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.9
|
|
4
4
|
Summary: 面向Agent和RAG的新一代文档处理 AI Infra
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
|
|
@@ -173,7 +173,7 @@ source = S3Source(
|
|
|
173
173
|
bucket='textin',
|
|
174
174
|
prefix='',
|
|
175
175
|
region='us-east-1',
|
|
176
|
-
pattern='*.pdf' #
|
|
176
|
+
pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
|
|
177
177
|
)
|
|
178
178
|
```
|
|
179
179
|
请确保配置的访问凭证至少包括以下几项权限:
|
|
@@ -193,7 +193,7 @@ source = S3Source(
|
|
|
193
193
|
bucket='textin',
|
|
194
194
|
prefix='',
|
|
195
195
|
region='cn-shanghai',
|
|
196
|
-
pattern='*.pdf' #
|
|
196
|
+
pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
|
|
197
197
|
)
|
|
198
198
|
```
|
|
199
199
|
请确保配置的访问凭证至少包括以下几项权限:
|
|
@@ -214,7 +214,7 @@ source = S3Source(
|
|
|
214
214
|
bucket='textin',
|
|
215
215
|
prefix='',
|
|
216
216
|
region='ap-shanghai',
|
|
217
|
-
pattern='*.pdf' #
|
|
217
|
+
pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
|
|
218
218
|
)
|
|
219
219
|
```
|
|
220
220
|
|
|
@@ -235,7 +235,7 @@ source = S3Source(
|
|
|
235
235
|
bucket='textin',
|
|
236
236
|
prefix='',
|
|
237
237
|
region='cn-shanghai',
|
|
238
|
-
pattern='*.pdf' #
|
|
238
|
+
pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
|
|
239
239
|
)
|
|
240
240
|
```
|
|
241
241
|
|
|
@@ -257,7 +257,7 @@ source = S3Source(
|
|
|
257
257
|
bucket='textin',
|
|
258
258
|
prefix='',
|
|
259
259
|
region='cn-east-3',
|
|
260
|
-
pattern='*.pdf' #
|
|
260
|
+
pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
|
|
261
261
|
)
|
|
262
262
|
```
|
|
263
263
|
|
|
@@ -279,7 +279,7 @@ source = S3Source(
|
|
|
279
279
|
bucket='textin-xparse',
|
|
280
280
|
prefix='',
|
|
281
281
|
region='us-east-1',
|
|
282
|
-
pattern='*.pdf' #
|
|
282
|
+
pattern=['*.pdf'] # 可选,通配符模式列表,支持多个扩展名
|
|
283
283
|
)
|
|
284
284
|
```
|
|
285
285
|
请确保配置的访问凭证至少包括以下几项权限:
|
|
@@ -294,7 +294,7 @@ s3:GetObject
|
|
|
294
294
|
```python
|
|
295
295
|
source = LocalSource(
|
|
296
296
|
directory='./input',
|
|
297
|
-
pattern='*.pdf'
|
|
297
|
+
pattern=['*.pdf', '*.docx'] # 支持多个通配符模式列表
|
|
298
298
|
)
|
|
299
299
|
```
|
|
300
300
|
|
|
@@ -306,7 +306,7 @@ source = FtpSource(
|
|
|
306
306
|
port=21,
|
|
307
307
|
username='', # 用户名,按照实际填写
|
|
308
308
|
password='', # 密码,按照实际填写
|
|
309
|
-
pattern='*.pdf' #
|
|
309
|
+
pattern=['*.pdf'] # 可选,通配符模式列表,过滤指定类型文件
|
|
310
310
|
)
|
|
311
311
|
```
|
|
312
312
|
|
|
@@ -319,11 +319,11 @@ source = SmbSource(
|
|
|
319
319
|
username='', # 用户名,按照实际填写
|
|
320
320
|
password='', # 密码,按照实际填写
|
|
321
321
|
domain='your-smb-domain',
|
|
322
|
-
pattern='**/*.pdf' #
|
|
322
|
+
pattern=['**/*.pdf'] # 可选,通配符模式列表,支持多级匹配
|
|
323
323
|
)
|
|
324
324
|
```
|
|
325
325
|
|
|
326
|
-
> 注 1:所有 Source 均支持 `pattern`
|
|
326
|
+
> 注 1:所有 Source 均支持 `pattern` 参数,使用通配符模式列表(如 `['*.pdf', '*.docx']`)来过滤需要处理的文件。支持多个通配符模式,如果列表中包含 `'*'` 则匹配所有文件。默认为 `None`,即处理全部文件。
|
|
327
327
|
|
|
328
328
|
> 注 2:所有 Source 均支持 `recursive` 参数,表示是否递归遍历,默认为 `False`。
|
|
329
329
|
|
|
@@ -517,298 +517,357 @@ Parse 参数中有必填项`Provider`,表示文档解析服务的供应商,
|
|
|
517
517
|
|
|
518
518
|
## 💡 使用示例
|
|
519
519
|
|
|
520
|
-
### 示例 1:
|
|
520
|
+
### 示例 1: 手动创建 Pipeline(推荐)
|
|
521
521
|
|
|
522
522
|
```python
|
|
523
|
-
from xparse_client import
|
|
523
|
+
from xparse_client import (
|
|
524
|
+
Pipeline, S3Source, MilvusDestination,
|
|
525
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
526
|
+
)
|
|
524
527
|
|
|
525
|
-
#
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
'
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
'
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
528
|
+
# 创建数据源
|
|
529
|
+
source = S3Source(
|
|
530
|
+
endpoint='https://your-minio.com',
|
|
531
|
+
access_key='your-access-key',
|
|
532
|
+
secret_key='your-secret-key',
|
|
533
|
+
bucket='documents',
|
|
534
|
+
prefix='pdfs/',
|
|
535
|
+
region='us-east-1',
|
|
536
|
+
pattern=['*.pdf'], # 仅处理匹配的文件
|
|
537
|
+
recursive=False # 不递归子目录
|
|
538
|
+
)
|
|
539
|
+
|
|
540
|
+
# 创建目的地
|
|
541
|
+
destination = MilvusDestination(
|
|
542
|
+
db_path='./vectors.db',
|
|
543
|
+
collection_name='documents',
|
|
544
|
+
dimension=1024
|
|
545
|
+
)
|
|
546
|
+
|
|
547
|
+
# 配置处理阶段
|
|
548
|
+
stages = [
|
|
549
|
+
Stage(
|
|
550
|
+
type='parse',
|
|
551
|
+
config=ParseConfig(provider='textin')
|
|
552
|
+
),
|
|
553
|
+
Stage(
|
|
554
|
+
type='chunk',
|
|
555
|
+
config=ChunkConfig(
|
|
556
|
+
strategy='by_title', # 按标题分块
|
|
557
|
+
include_orig_elements=False,
|
|
558
|
+
new_after_n_chars=512,
|
|
559
|
+
max_characters=1024,
|
|
560
|
+
overlap=50 # 块之间重叠 50 字符
|
|
561
|
+
)
|
|
562
|
+
),
|
|
563
|
+
Stage(
|
|
564
|
+
type='embed',
|
|
565
|
+
config=EmbedConfig(
|
|
566
|
+
provider='qwen',
|
|
567
|
+
model_name='text-embedding-v3'
|
|
568
|
+
)
|
|
569
|
+
)
|
|
570
|
+
]
|
|
571
|
+
|
|
572
|
+
# 创建并运行 Pipeline
|
|
573
|
+
pipeline = Pipeline(
|
|
574
|
+
source=source,
|
|
575
|
+
destination=destination,
|
|
576
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
577
|
+
api_headers={
|
|
550
578
|
'x-ti-app-id': 'your-app-id',
|
|
551
579
|
'x-ti-secret-code': 'your-secret-code'
|
|
552
580
|
},
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
'stages': [
|
|
556
|
-
{
|
|
557
|
-
'type': 'parse',
|
|
558
|
-
'config': {
|
|
559
|
-
'provider': 'textin'
|
|
560
|
-
}
|
|
561
|
-
},
|
|
562
|
-
{
|
|
563
|
-
'type': 'chunk',
|
|
564
|
-
'config': {
|
|
565
|
-
'strategy': 'by_title', # 按标题分块
|
|
566
|
-
'include_orig_elements': False,
|
|
567
|
-
'new_after_n_chars': 512,
|
|
568
|
-
'max_characters': 1024,
|
|
569
|
-
'overlap': 50 # 块之间重叠 50 字符
|
|
570
|
-
}
|
|
571
|
-
},
|
|
572
|
-
{
|
|
573
|
-
'type': 'embed',
|
|
574
|
-
'config': {
|
|
575
|
-
'provider': 'qwen',
|
|
576
|
-
'model_name': 'text-embedding-v3'
|
|
577
|
-
}
|
|
578
|
-
}
|
|
579
|
-
]
|
|
580
|
-
}
|
|
581
|
+
stages=stages
|
|
582
|
+
)
|
|
581
583
|
|
|
582
|
-
# 使用配置创建并运行 pipeline
|
|
583
|
-
pipeline = create_pipeline_from_config(config)
|
|
584
584
|
pipeline.run()
|
|
585
585
|
```
|
|
586
586
|
|
|
587
|
-
### 示例
|
|
587
|
+
### 示例 1.1: 输出配置字典
|
|
588
|
+
|
|
589
|
+
手动创建 Pipeline 后,可以使用 `get_config()` 方法获取配置字典:
|
|
588
590
|
|
|
589
591
|
```python
|
|
590
|
-
from
|
|
591
|
-
|
|
592
|
+
from xparse_client import (
|
|
593
|
+
Pipeline, LocalSource, LocalDestination,
|
|
594
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
595
|
+
)
|
|
592
596
|
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
597
|
+
# 手动创建 Pipeline
|
|
598
|
+
source = LocalSource(
|
|
599
|
+
directory='./test_files',
|
|
600
|
+
pattern=['*.pdf'],
|
|
601
|
+
recursive=False
|
|
602
|
+
)
|
|
603
|
+
|
|
604
|
+
destination = LocalDestination(output_dir='./test_output')
|
|
605
|
+
|
|
606
|
+
stages = [
|
|
607
|
+
Stage(type='parse', config=ParseConfig(provider='textin')),
|
|
608
|
+
Stage(type='chunk', config=ChunkConfig(strategy='basic', max_characters=1024)),
|
|
609
|
+
Stage(type='embed', config=EmbedConfig(provider='qwen', model_name='text-embedding-v3'))
|
|
610
|
+
]
|
|
611
|
+
|
|
612
|
+
pipeline = Pipeline(
|
|
613
|
+
source=source,
|
|
614
|
+
destination=destination,
|
|
615
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
616
|
+
api_headers={
|
|
617
|
+
'x-ti-app-id': 'your-app-id',
|
|
618
|
+
'x-ti-secret-code': 'your-secret-code'
|
|
602
619
|
},
|
|
603
|
-
|
|
604
|
-
|
|
620
|
+
stages=stages
|
|
621
|
+
)
|
|
622
|
+
|
|
623
|
+
# 获取配置字典(格式与 create_pipeline_from_config 的入参一致)
|
|
624
|
+
config_dict = pipeline.get_config()
|
|
625
|
+
|
|
626
|
+
# 可以保存为 JSON 文件
|
|
627
|
+
import json
|
|
628
|
+
with open('pipeline_config.json', 'w', encoding='utf-8') as f:
|
|
629
|
+
json.dump(config_dict, f, indent=2, ensure_ascii=False)
|
|
630
|
+
|
|
631
|
+
# 或者用于创建新的 Pipeline(需要补充敏感信息如 access_key, secret_key 等)
|
|
632
|
+
# from xparse_client import create_pipeline_from_config
|
|
633
|
+
# new_pipeline = create_pipeline_from_config(config_dict)
|
|
634
|
+
```
|
|
635
|
+
|
|
636
|
+
### 示例 2: 本地到本地(测试)
|
|
637
|
+
|
|
638
|
+
```python
|
|
639
|
+
from xparse_client import (
|
|
640
|
+
Pipeline, LocalSource, LocalDestination,
|
|
641
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
642
|
+
)
|
|
643
|
+
|
|
644
|
+
# 创建本地数据源
|
|
645
|
+
source = LocalSource(
|
|
646
|
+
directory='./test_files',
|
|
647
|
+
pattern=['*.pdf'],
|
|
648
|
+
recursive=False
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
# 创建本地输出目的地
|
|
652
|
+
destination = LocalDestination(output_dir='./test_output')
|
|
653
|
+
|
|
654
|
+
# 配置处理阶段
|
|
655
|
+
stages = [
|
|
656
|
+
Stage(
|
|
657
|
+
type='parse',
|
|
658
|
+
config=ParseConfig(provider='textin')
|
|
659
|
+
),
|
|
660
|
+
Stage(
|
|
661
|
+
type='chunk',
|
|
662
|
+
config=ChunkConfig(
|
|
663
|
+
strategy='basic',
|
|
664
|
+
max_characters=1024
|
|
665
|
+
)
|
|
666
|
+
),
|
|
667
|
+
Stage(
|
|
668
|
+
type='embed',
|
|
669
|
+
config=EmbedConfig(
|
|
670
|
+
provider='qwen',
|
|
671
|
+
model_name='text-embedding-v3'
|
|
672
|
+
)
|
|
673
|
+
)
|
|
674
|
+
]
|
|
675
|
+
|
|
676
|
+
# 创建并运行 Pipeline
|
|
677
|
+
pipeline = Pipeline(
|
|
678
|
+
source=source,
|
|
679
|
+
destination=destination,
|
|
680
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
681
|
+
api_headers={
|
|
605
682
|
'x-ti-app-id': 'your-app-id',
|
|
606
683
|
'x-ti-secret-code': 'your-secret-code'
|
|
607
684
|
},
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
{
|
|
611
|
-
'type': 'parse',
|
|
612
|
-
'config': {
|
|
613
|
-
'provider': 'textin'
|
|
614
|
-
}
|
|
615
|
-
},
|
|
616
|
-
{
|
|
617
|
-
'type': 'chunk',
|
|
618
|
-
'config': {
|
|
619
|
-
'strategy': 'basic',
|
|
620
|
-
'max_characters': 1024
|
|
621
|
-
}
|
|
622
|
-
},
|
|
623
|
-
{
|
|
624
|
-
'type': 'embed',
|
|
625
|
-
'config': {
|
|
626
|
-
'provider': 'qwen',
|
|
627
|
-
'model_name': 'text-embedding-v3'
|
|
628
|
-
}
|
|
629
|
-
}
|
|
630
|
-
]
|
|
631
|
-
}
|
|
685
|
+
stages=stages
|
|
686
|
+
)
|
|
632
687
|
|
|
633
|
-
pipeline = create_pipeline_from_config(config)
|
|
634
688
|
pipeline.run()
|
|
635
689
|
```
|
|
636
690
|
|
|
637
691
|
### 示例 3: 不同分块策略的配置
|
|
638
692
|
|
|
639
693
|
```python
|
|
640
|
-
from xparse_client import
|
|
694
|
+
from xparse_client import (
|
|
695
|
+
Pipeline, S3Source, MilvusDestination,
|
|
696
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
697
|
+
)
|
|
698
|
+
|
|
699
|
+
# 创建数据源和目的地
|
|
700
|
+
source = S3Source(...)
|
|
701
|
+
destination = MilvusDestination(...)
|
|
641
702
|
|
|
642
703
|
# 配置 1:按页面分块(适合 PDF 文档)
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
'
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
'type': 'embed',
|
|
665
|
-
'config': {
|
|
666
|
-
'provider': 'qwen',
|
|
667
|
-
'model_name': 'text-embedding-v4' # 使用更高精度的模型
|
|
668
|
-
}
|
|
669
|
-
}
|
|
670
|
-
]
|
|
671
|
-
}
|
|
704
|
+
stages_by_page = [
|
|
705
|
+
Stage(
|
|
706
|
+
type='parse',
|
|
707
|
+
config=ParseConfig(provider='textin')
|
|
708
|
+
),
|
|
709
|
+
Stage(
|
|
710
|
+
type='chunk',
|
|
711
|
+
config=ChunkConfig(
|
|
712
|
+
strategy='by_page', # 按页面分块
|
|
713
|
+
max_characters=2048, # 增大块大小
|
|
714
|
+
overlap=100 # 页面间重叠 100 字符
|
|
715
|
+
)
|
|
716
|
+
),
|
|
717
|
+
Stage(
|
|
718
|
+
type='embed',
|
|
719
|
+
config=EmbedConfig(
|
|
720
|
+
provider='qwen',
|
|
721
|
+
model_name='text-embedding-v4' # 使用更高精度的模型
|
|
722
|
+
)
|
|
723
|
+
)
|
|
724
|
+
]
|
|
672
725
|
|
|
673
726
|
# 配置 2:按标题分块(适合结构化文档)
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
'
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
'type': 'embed',
|
|
696
|
-
'config': {
|
|
697
|
-
'provider': 'qwen',
|
|
698
|
-
'model_name': 'text-embedding-v3'
|
|
699
|
-
}
|
|
700
|
-
}
|
|
701
|
-
]
|
|
702
|
-
}
|
|
727
|
+
stages_by_title = [
|
|
728
|
+
Stage(
|
|
729
|
+
type='parse',
|
|
730
|
+
config=ParseConfig(provider='textin')
|
|
731
|
+
),
|
|
732
|
+
Stage(
|
|
733
|
+
type='chunk',
|
|
734
|
+
config=ChunkConfig(
|
|
735
|
+
strategy='by_title', # 按标题分块
|
|
736
|
+
include_orig_elements=True, # 保留原始元素信息
|
|
737
|
+
max_characters=1536
|
|
738
|
+
)
|
|
739
|
+
),
|
|
740
|
+
Stage(
|
|
741
|
+
type='embed',
|
|
742
|
+
config=EmbedConfig(
|
|
743
|
+
provider='qwen',
|
|
744
|
+
model_name='text-embedding-v3'
|
|
745
|
+
)
|
|
746
|
+
)
|
|
747
|
+
]
|
|
703
748
|
|
|
704
749
|
# 根据文档类型选择配置
|
|
705
|
-
pipeline =
|
|
750
|
+
pipeline = Pipeline(
|
|
751
|
+
source=source,
|
|
752
|
+
destination=destination,
|
|
753
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
754
|
+
api_headers={...},
|
|
755
|
+
stages=stages_by_page # 或 stages_by_title
|
|
756
|
+
)
|
|
706
757
|
pipeline.run()
|
|
707
758
|
```
|
|
708
759
|
|
|
709
760
|
### 示例 4: FTP 数据源配置
|
|
710
761
|
|
|
711
762
|
```python
|
|
712
|
-
from xparse_client import
|
|
763
|
+
from xparse_client import (
|
|
764
|
+
Pipeline, FtpSource, MilvusDestination,
|
|
765
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
766
|
+
)
|
|
713
767
|
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
'
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
768
|
+
# 创建 FTP 数据源
|
|
769
|
+
source = FtpSource(
|
|
770
|
+
host='ftp.example.com',
|
|
771
|
+
port=21,
|
|
772
|
+
username='user',
|
|
773
|
+
password='pass',
|
|
774
|
+
pattern=['*.pdf'],
|
|
775
|
+
recursive=False
|
|
776
|
+
)
|
|
777
|
+
|
|
778
|
+
# 创建 Milvus 目的地
|
|
779
|
+
destination = MilvusDestination(
|
|
780
|
+
db_path='./vectors.db',
|
|
781
|
+
collection_name='ftp_docs',
|
|
782
|
+
dimension=1024
|
|
783
|
+
)
|
|
784
|
+
|
|
785
|
+
# 配置处理阶段
|
|
786
|
+
stages = [
|
|
787
|
+
Stage(
|
|
788
|
+
type='parse',
|
|
789
|
+
config=ParseConfig(provider='textin')
|
|
790
|
+
),
|
|
791
|
+
Stage(
|
|
792
|
+
type='chunk',
|
|
793
|
+
config=ChunkConfig(
|
|
794
|
+
strategy='basic',
|
|
795
|
+
max_characters=1024
|
|
796
|
+
)
|
|
797
|
+
),
|
|
798
|
+
Stage(
|
|
799
|
+
type='embed',
|
|
800
|
+
config=EmbedConfig(
|
|
801
|
+
provider='qwen',
|
|
802
|
+
model_name='text-embedding-v3'
|
|
803
|
+
)
|
|
804
|
+
)
|
|
805
|
+
]
|
|
806
|
+
|
|
807
|
+
# 创建并运行 Pipeline
|
|
808
|
+
pipeline = Pipeline(
|
|
809
|
+
source=source,
|
|
810
|
+
destination=destination,
|
|
811
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
812
|
+
api_headers={
|
|
734
813
|
'x-ti-app-id': 'app-id',
|
|
735
814
|
'x-ti-secret-code': 'secret'
|
|
736
815
|
},
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
'stages': [
|
|
740
|
-
{
|
|
741
|
-
'type': 'parse',
|
|
742
|
-
'config': {
|
|
743
|
-
'provider': 'textin'
|
|
744
|
-
}
|
|
745
|
-
},
|
|
746
|
-
{
|
|
747
|
-
'type': 'chunk',
|
|
748
|
-
'config': {
|
|
749
|
-
'strategy': 'basic',
|
|
750
|
-
'max_characters': 1024
|
|
751
|
-
}
|
|
752
|
-
},
|
|
753
|
-
{
|
|
754
|
-
'type': 'embed',
|
|
755
|
-
'config': {
|
|
756
|
-
'provider': 'qwen',
|
|
757
|
-
'model_name': 'text-embedding-v3'
|
|
758
|
-
}
|
|
759
|
-
}
|
|
760
|
-
]
|
|
761
|
-
}
|
|
816
|
+
stages=stages
|
|
817
|
+
)
|
|
762
818
|
|
|
763
|
-
pipeline = create_pipeline_from_config(config)
|
|
764
819
|
pipeline.run()
|
|
765
820
|
```
|
|
766
821
|
|
|
767
822
|
### 示例 5: 获取处理统计信息
|
|
768
823
|
|
|
769
824
|
```python
|
|
770
|
-
from
|
|
825
|
+
from datetime import datetime, timezone
|
|
826
|
+
from xparse_client import (
|
|
827
|
+
Pipeline, LocalSource, LocalDestination,
|
|
828
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
829
|
+
)
|
|
771
830
|
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
831
|
+
# 创建 Pipeline
|
|
832
|
+
source = LocalSource(
|
|
833
|
+
directory='./docs',
|
|
834
|
+
pattern=['*.pdf'],
|
|
835
|
+
recursive=False
|
|
836
|
+
)
|
|
837
|
+
|
|
838
|
+
destination = LocalDestination(output_dir='./output')
|
|
839
|
+
|
|
840
|
+
stages = [
|
|
841
|
+
Stage(
|
|
842
|
+
type='parse',
|
|
843
|
+
config=ParseConfig(provider='textin')
|
|
844
|
+
),
|
|
845
|
+
Stage(
|
|
846
|
+
type='chunk',
|
|
847
|
+
config=ChunkConfig(
|
|
848
|
+
strategy='basic',
|
|
849
|
+
max_characters=1024
|
|
850
|
+
)
|
|
851
|
+
),
|
|
852
|
+
Stage(
|
|
853
|
+
type='embed',
|
|
854
|
+
config=EmbedConfig(
|
|
855
|
+
provider='qwen',
|
|
856
|
+
model_name='text-embedding-v3'
|
|
857
|
+
)
|
|
858
|
+
)
|
|
859
|
+
]
|
|
860
|
+
|
|
861
|
+
pipeline = Pipeline(
|
|
862
|
+
source=source,
|
|
863
|
+
destination=destination,
|
|
864
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
865
|
+
api_headers={
|
|
784
866
|
'x-ti-app-id': 'your-app-id',
|
|
785
867
|
'x-ti-secret-code': 'your-secret-code'
|
|
786
868
|
},
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
'type': 'parse',
|
|
790
|
-
'config': {
|
|
791
|
-
'provider': 'textin'
|
|
792
|
-
}
|
|
793
|
-
},
|
|
794
|
-
{
|
|
795
|
-
'type': 'chunk',
|
|
796
|
-
'config': {
|
|
797
|
-
'strategy': 'basic',
|
|
798
|
-
'max_characters': 1024
|
|
799
|
-
}
|
|
800
|
-
},
|
|
801
|
-
{
|
|
802
|
-
'type': 'embed',
|
|
803
|
-
'config': {
|
|
804
|
-
'provider': 'qwen',
|
|
805
|
-
'model_name': 'text-embedding-v3'
|
|
806
|
-
}
|
|
807
|
-
}
|
|
808
|
-
]
|
|
809
|
-
}
|
|
810
|
-
|
|
811
|
-
pipeline = create_pipeline_from_config(config)
|
|
869
|
+
stages=stages
|
|
870
|
+
)
|
|
812
871
|
|
|
813
872
|
# 处理单个文件并获取统计信息
|
|
814
873
|
file_bytes, data_source = pipeline.source.read_file('document.pdf')
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
example/run_pipeline.py,sha256=xZ8TLofrK7naEwBe-tiuotcQ8yKWUES_k9iCQcIOIYo,15446
|
|
2
|
+
example/run_pipeline_test.py,sha256=pxsNiq_LmP6M4R7tTuja0u-Lu7fW-wIBU1uBf0-agQI,14845
|
|
3
|
+
xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
|
|
4
|
+
xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
|
|
5
|
+
xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
|
|
6
|
+
xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
|
|
7
|
+
xparse_client/pipeline/pipeline.py,sha256=IRTxN4YUJi9Wrm1G1ysGvcwsPsGh0inbquBH3nWYmAA,26477
|
|
8
|
+
xparse_client/pipeline/sources.py,sha256=D-kLrSQ-qsFFFq7JC4sL3Y3Q3Q87Wcpv9R5K85YkDjE,22144
|
|
9
|
+
xparse_client-0.2.9.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
|
|
10
|
+
xparse_client-0.2.9.dist-info/METADATA,sha256=Faj3fvt9Fc-EW9yFDewhpkqGVo_qSvL5N-tq1aIkkyk,28086
|
|
11
|
+
xparse_client-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
+
xparse_client-0.2.9.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
|
|
13
|
+
xparse_client-0.2.9.dist-info/RECORD,,
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
example/run_pipeline.py,sha256=ijws5q_vMmV0-bMHuFtOUMrEnxnL1LvOBCtcCD2c8zc,15366
|
|
2
|
-
example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
|
|
3
|
-
xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
|
|
4
|
-
xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
|
|
5
|
-
xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
|
|
6
|
-
xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
|
|
7
|
-
xparse_client/pipeline/pipeline.py,sha256=pHw32eo-bRegzDvkuVUu0CjMXMejJ64dDXH7esGMXjg,20379
|
|
8
|
-
xparse_client/pipeline/sources.py,sha256=UeVbWv6n0wQkIZIBBhrFCiyydQX7cvwmkoMgcf12p9g,19940
|
|
9
|
-
xparse_client-0.2.7.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
|
|
10
|
-
xparse_client-0.2.7.dist-info/METADATA,sha256=qMHiAq2qdH4vfW5zktrYrh9Kj72JxBDERVht8KYerl0,26805
|
|
11
|
-
xparse_client-0.2.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
-
xparse_client-0.2.7.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
|
|
13
|
-
xparse_client-0.2.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|