xparse-client 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/run_pipeline.py +465 -0
- example/run_pipeline_test.py +458 -0
- xparse_client/__init__.py +21 -1
- xparse_client/pipeline/__init__.py +3 -0
- xparse_client/pipeline/config.py +128 -0
- xparse_client/pipeline/destinations.py +220 -0
- xparse_client/pipeline/pipeline.py +440 -0
- xparse_client/pipeline/sources.py +342 -0
- {xparse_client-0.2.2.dist-info → xparse_client-0.2.3.dist-info}/METADATA +1 -1
- xparse_client-0.2.3.dist-info/RECORD +13 -0
- {xparse_client-0.2.2.dist-info → xparse_client-0.2.3.dist-info}/top_level.txt +1 -0
- xparse_client-0.2.2.dist-info/RECORD +0 -6
- {xparse_client-0.2.2.dist-info → xparse_client-0.2.3.dist-info}/WHEEL +0 -0
- {xparse_client-0.2.2.dist-info → xparse_client-0.2.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,458 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- encoding: utf-8 -*-
|
|
3
|
+
'''
|
|
4
|
+
Pipeline 运行脚本
|
|
5
|
+
快速启动和运行 Pipeline 的示例
|
|
6
|
+
'''
|
|
7
|
+
|
|
8
|
+
import os
from datetime import datetime, timezone

from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# ============================================================================
|
|
13
|
+
# 常量配置
|
|
14
|
+
# ============================================================================
|
|
15
|
+
|
|
16
|
+
# API 请求头配置
|
|
17
|
+
# Request headers sent with every xparse API call.
# The credentials are read from the environment so that real keys are never
# committed to the repository or shipped inside the published wheel:
#   TEXTIN_APP_ID       -> value of the 'x-ti-app-id' header
#   TEXTIN_SECRET_CODE  -> value of the 'x-ti-secret-code' header
# An unset variable yields an empty string.
API_HEADERS = {
    'x-ti-app-id': os.environ.get('TEXTIN_APP_ID', ''),
    'x-ti-secret-code': os.environ.get('TEXTIN_SECRET_CODE', ''),
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ============================================================================
|
|
24
|
+
# 方式 1: 使用配置字典
|
|
25
|
+
# ============================================================================
|
|
26
|
+
|
|
27
|
+
def run_with_config():
    """Run a pipeline that is described entirely by a configuration dictionary.

    The configuration uses an S3 source, a Milvus destination
    (``db_path='./milvus_pipeline.db'``) and three stages: parse, chunk, embed.

    The S3 credentials are read from the ``XPARSE_S3_ACCESS_KEY`` and
    ``XPARSE_S3_SECRET_KEY`` environment variables instead of being hard-coded
    in the example; an unset variable yields an empty string.
    """
    config = {
        'source': {
            'type': 's3',
            'endpoint': 'https://textin-minio-api.ai.intsig.net',
            'access_key': os.environ.get('XPARSE_S3_ACCESS_KEY', ''),
            'secret_key': os.environ.get('XPARSE_S3_SECRET_KEY', ''),
            'bucket': 'textin-test',
            'prefix': '',  # empty = process every file; or a prefix such as 'milvus/'
            'region': 'us-east-1'
        },
        'destination': {
            'type': 'milvus',
            'db_path': './milvus_pipeline.db',
            'collection_name': 'pipeline_collection',
            'dimension': 1024
        },
        'api_base_url': 'https://api.textin.com/api/xparse',
        'api_headers': API_HEADERS,
        # Stage configuration
        'stages': [
            {
                'type': 'parse',
                'config': {
                    'provider': 'textin'
                }
            },
            {
                'type': 'chunk',
                'config': {
                    'strategy': 'basic',  # chunking strategy: 'basic' | 'by_title' | 'by_page'
                    'include_orig_elements': False,  # whether to include the original elements
                    'new_after_n_chars': 512,  # start a new chunk after this many characters
                    'max_characters': 1024,  # maximum number of characters
                    'overlap': 0  # number of overlapping characters
                }
            },
            {
                'type': 'embed',
                'config': {
                    'provider': 'qwen',  # embedding provider
                    'model_name': 'text-embedding-v3'  # 'text-embedding-v3' | 'text-embedding-v4'
                }
            }
        ]
    }

    pipeline = create_pipeline_from_config(config)
    pipeline.run()
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ============================================================================
|
|
81
|
+
# 方式 2: 手动创建组件
|
|
82
|
+
# ============================================================================
|
|
83
|
+
|
|
84
|
+
def run_with_manual_setup():
    """Create the Source, Destination and Pipeline objects by hand and run them.

    The S3 credentials are read from the ``XPARSE_S3_ACCESS_KEY`` and
    ``XPARSE_S3_SECRET_KEY`` environment variables instead of being hard-coded
    in the example; an unset variable yields an empty string.

    NOTE(review): this example targets a different ``api_base_url`` than the
    other examples in this file - confirm that this is intended.
    """
    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage

    # S3 source; only objects matching '*.png' are selected.
    # Earlier revisions of this example carried commented-out variants for other
    # endpoints (swap endpoint / region / bucket accordingly):
    #   https://textin-minio-api.ai.intsig.net     region 'us-east-1'
    #   https://cos.ap-shanghai.myqcloud.com       region 'ap-shanghai'
    #   https://tos-s3-cn-shanghai.volces.com      region 'cn-shanghai'
    #   https://obs.cn-east-3.myhuaweicloud.com    region 'cn-east-3'
    #   https://s3.us-east-1.amazonaws.com         region 'us-east-1'
    #   http://127.0.0.1:9000                      region 'us-east-1'
    # Other source types can be used in place of S3Source:
    #   SmbSource(host=..., share_name=..., username=..., password=..., domain=...)
    #   FtpSource(host='127.0.0.1', port=21, username=..., password=...)
    #   LocalSource(directory=..., pattern='*.pdf')  # wildcards: *.pdf, *.docx, **/*.txt
    source = S3Source(
        endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
        access_key=os.environ.get('XPARSE_S3_ACCESS_KEY', ''),
        secret_key=os.environ.get('XPARSE_S3_SECRET_KEY', ''),
        bucket='textin',
        prefix='',
        region='cn-shanghai',
        pattern='*.png'
    )

    # Milvus destination.
    # An S3 destination can be used instead:
    #   S3Destination(endpoint=..., access_key=..., secret_key=...,
    #                 bucket=..., prefix='result', region=...)
    destination = MilvusDestination(
        db_path='./milvus_pipeline1.db',
        collection_name='pipeline_collection',
        dimension=1024
    )

    # Stage list: parse, then chunk by title.
    stages = [
        Stage(
            type='parse',
            config=ParseConfig(provider='textin')
        ),
        Stage(
            type='chunk',
            config=ChunkConfig(
                strategy='by_title',
                include_orig_elements=False,
                new_after_n_chars=512,
                max_characters=1024,
                overlap=50  # 50 characters of overlap between chunks
            )
        )
        # To add an embed stage, append:
        # Stage(
        #     type='embed',
        #     config=EmbedConfig(
        #         provider='qwen',
        #         model_name='text-embedding-v3'
        #     )
        # )
    ]

    pipeline = Pipeline(
        source=source,
        destination=destination,
        api_base_url='https://textin-api-go-pre.ai.intsig.net/api/xparse',
        api_headers=API_HEADERS,
        stages=stages
    )

    pipeline.run()
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# ============================================================================
|
|
220
|
+
# Option 3: local test (local files -> S3 destination)
|
|
221
|
+
# ============================================================================
|
|
222
|
+
|
|
223
|
+
def run_local_test():
    """Process local PDF files and write the results to an S3 destination.

    The source is a local directory (``*.pdf``); the destination is configured
    with ``'type': 's3'``. Two stages are run: parse and embed.

    The S3 credentials are read from the ``XPARSE_S3_ACCESS_KEY`` and
    ``XPARSE_S3_SECRET_KEY`` environment variables instead of being hard-coded
    in the example; an unset variable yields an empty string.
    """
    config = {
        'source': {
            'type': 'local',
            'directory': '/Users/ke_wang/Documents/doc',
            'pattern': '*.pdf'
        },
        'destination': {
            'type': 's3',
            'endpoint': 'https://textin-minio-api.ai.intsig.net',
            'access_key': os.environ.get('XPARSE_S3_ACCESS_KEY', ''),
            'secret_key': os.environ.get('XPARSE_S3_SECRET_KEY', ''),
            'bucket': 'textin-test',
            'prefix': '',
            'region': 'us-east-1'
        },
        'api_base_url': 'https://api.textin.com/api/xparse',
        'api_headers': API_HEADERS,
        # Stage configuration
        'stages': [
            {
                'type': 'parse',
                'config': {
                    'provider': 'textin'
                }
            },
            {
                'type': 'embed',
                'config': {
                    'provider': 'qwen',
                    'model_name': 'text-embedding-v3'
                }
            }
        ]
    }

    pipeline = create_pipeline_from_config(config)
    pipeline.run()
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# ============================================================================
|
|
266
|
+
# 方式 4: 处理单个文件
|
|
267
|
+
# ============================================================================
|
|
268
|
+
|
|
269
|
+
def run_single_file():
    """Process exactly one file through a parse -> chunk -> embed pipeline.

    The pipeline reads from a local directory and writes to ``./output``.
    Only the single file named below is passed to ``pipeline.process_file``;
    a success or failure line is printed depending on its return value.
    """
    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage

    source = LocalSource(directory='/Users/ke_wang/Documents/doc', pattern='*.pdf')
    destination = LocalDestination(output_dir='./output')

    # Build each stage separately, then assemble the ordered list.
    parse_stage = Stage(type='parse', config=ParseConfig(provider='textin'))
    chunk_stage = Stage(
        type='chunk',
        # page-based chunking with a larger chunk size
        config=ChunkConfig(strategy='by_page', max_characters=2048, overlap=100),
    )
    embed_stage = Stage(
        type='embed',
        config=EmbedConfig(provider='qwen', model_name='text-embedding-v4'),
    )
    stages = [parse_stage, chunk_stage, embed_stage]

    pipeline = Pipeline(
        source=source,
        destination=destination,
        api_base_url='https://api.textin.com/api/xparse',
        api_headers=API_HEADERS,
        stages=stages,
    )

    # Path is relative to the source directory.
    file_path = '4e3250f00210431fb29ca0c808.pdf'

    if pipeline.process_file(file_path):
        print(f"\n✅ 文件 {file_path} 处理成功!")
    else:
        print(f"\n❌ 文件 {file_path} 处理失败!")
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ============================================================================
|
|
319
|
+
# 方式 5: 自定义处理流程
|
|
320
|
+
# ============================================================================
|
|
321
|
+
|
|
322
|
+
def run_custom_flow():
    """Drive the pipeline manually, file by file, and print per-file statistics.

    Lists the files of an S3 source, and for the first two of them: reads the
    file, stamps ``date_processed`` (UTC timestamp) into its data-source record,
    runs it through ``pipeline.process_with_pipeline`` and, on success, writes
    the result together with a metadata dictionary to the Milvus destination.

    The S3 credentials are read from the ``XPARSE_S3_ACCESS_KEY`` and
    ``XPARSE_S3_SECRET_KEY`` environment variables instead of being hard-coded
    in the example; an unset variable yields an empty string.
    """
    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage

    source = S3Source(
        endpoint='https://textin-minio-api.ai.intsig.net',
        access_key=os.environ.get('XPARSE_S3_ACCESS_KEY', ''),
        secret_key=os.environ.get('XPARSE_S3_SECRET_KEY', ''),
        bucket='textin-test',
        prefix='',
        region='us-east-1',
        pattern='*.pdf'
    )

    destination = MilvusDestination(
        db_path='./milvus_custom.db',
        collection_name='custom_collection',
        dimension=1024
    )

    stages = [
        Stage(
            type='parse',
            config=ParseConfig(provider='textin')
        ),
        Stage(
            type='chunk',
            config=ChunkConfig(
                strategy='by_title',
                include_orig_elements=True,
                max_characters=1536,
                overlap=80
            )
        ),
        Stage(
            type='embed',
            config=EmbedConfig(
                provider='qwen',
                model_name='text-embedding-v4'
            )
        )
    ]

    pipeline = Pipeline(
        source=source,
        destination=destination,
        api_base_url='https://api.textin.com/api/xparse',
        api_headers=API_HEADERS,
        stages=stages
    )

    files = source.list_files()

    for file_path in files[:2]:  # only the first two files
        print(f"\n处理: {file_path}")
        file_bytes, data_source = source.read_file(file_path)
        data_source['date_processed'] = datetime.now(timezone.utc).timestamp()

        result = pipeline.process_with_pipeline(file_bytes, file_path, data_source)

        # A falsy result is reported as a failure for this file.
        if not result:
            print(f"✗ 失败: {file_path}")
            continue

        embedded, stats = result
        print(f" - 原始元素: {stats.original_elements}")
        print(f" - 分块后: {stats.chunked_elements}")
        print(f" - 向量化: {stats.embedded_elements}")

        metadata = {
            'file_name': file_path,
            'data_source': data_source,
            'stats': {
                'original_elements': stats.original_elements,
                'chunked_elements': stats.chunked_elements,
                'embedded_elements': stats.embedded_elements
            }
        }
        destination.write(embedded, metadata)
        print(f"✓ 完成: {file_path}")
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
# ============================================================================
|
|
409
|
+
# 主函数
|
|
410
|
+
# ============================================================================
|
|
411
|
+
|
|
412
|
+
def main():
    """Interactive entry point: print a menu and run the selected example.

    An empty answer selects option 1. An unrecognised answer prints a notice
    and also falls back to option 1. ``KeyboardInterrupt`` is reported as a
    user abort; any other exception is printed together with its traceback.
    """
    # choice -> (progress message, example function)
    runners = {
        '1': ("\n使用配置字典运行...", run_with_config),
        '2': ("\n手动创建组件运行...", run_with_manual_setup),
        '3': ("\n本地测试模式...", run_local_test),
        '4': ("\n处理单个文件...", run_single_file),
        '5': ("\n自定义处理流程...", run_custom_flow),
    }
    fallback = ("无效的选项,使用默认方式运行...", run_with_config)

    print("=" * 60)
    print("Pipeline 运行脚本")
    print("=" * 60)
    print("\n请选择运行方式:")
    print("1. 使用配置字典 (S3 -> Milvus) [基础配置]")
    print("2. 手动创建组件 (S3 -> Milvus) [按标题分块 + 自定义配置]")
    # run_local_test writes to an S3 destination, so the menu says so.
    print("3. 本地测试 (本地文件 -> S3) [基础配置]")
    print("4. 处理单个文件 [按页面分块 + V4模型]")
    print("5. 自定义处理流程 [手动控制 + 统计信息]")
    print()

    try:
        choice = input("请输入选项 (1-5) [默认: 1]: ").strip() or '1'
        message, runner = runners.get(choice, fallback)
        print(message)
        runner()
    except KeyboardInterrupt:
        print("\n\n用户中断执行")
    except Exception as e:
        # Top-level boundary of a demo script: report and show the traceback.
        print(f"\n程序异常: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    main()
|
|
458
|
+
|
xparse_client/__init__.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import re
|
|
2
4
|
|
|
3
5
|
logging.basicConfig(
|
|
4
6
|
level=logging.INFO,
|
|
@@ -31,4 +33,22 @@ __all__ = [
|
|
|
31
33
|
'create_pipeline_from_config',
|
|
32
34
|
]
|
|
33
35
|
|
|
34
|
-
|
|
36
|
+
# Resolve the version automatically: installed package metadata first, pyproject.toml as a fallback
|
|
37
|
+
def _get_version():
    """Return the package version as a string.

    The installed distribution metadata of ``xparse-client`` is tried first.
    If that lookup fails for any reason, ``pyproject.toml`` in the directory
    above this package is read and its ``version = "..."`` entry is used.
    ``'0.0.0'`` is returned when neither source yields a version.
    """
    try:
        from importlib.metadata import version
        return version('xparse-client')
    except Exception:
        # Best effort: fall through to the pyproject.toml lookup below.
        pass

    pyproject_path = Path(__file__).parent.parent / 'pyproject.toml'
    if pyproject_path.exists():
        content = pyproject_path.read_text(encoding='utf-8')
        # Anchor the key at the start of a line so that keys which merely end
        # in "version" (e.g. target-version = "py38") are not mistaken for it.
        match = re.search(
            r'^[ \t]*version\s*=\s*["\']([^"\']+)["\']',
            content,
            re.MULTILINE,
        )
        if match:
            return match.group(1)
    return '0.0.0'


__version__ = _get_version()
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- encoding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass, asdict
|
|
5
|
+
from typing import Dict, Any, Optional, Literal, List, Union
|
|
6
|
+
from .destinations import Destination
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ParseConfig:
    """Parse-stage configuration that accepts arbitrary extra fields.

    Besides ``provider``, every additional keyword argument is stored as an
    instance attribute of the same name.
    """

    def __init__(self, provider: Literal["textin", "mineru", "paddle"] = "textin", **kwargs):
        self.provider = provider
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def to_dict(self) -> Dict[str, Any]:
        """Return ``provider`` followed by all public extra attributes."""
        extras = {
            name: value
            for name, value in self.__dict__.items()
            if name != 'provider' and not name.startswith('_')
        }
        return {'provider': self.provider, **extras}

    def __repr__(self) -> str:
        """Show the public attributes, sorted by name."""
        public = [(k, v) for k, v in sorted(self.__dict__.items()) if not k.startswith('_')]
        attrs = ', '.join(f'{k}={v!r}' for k, v in public)
        return f'ParseConfig({attrs})'
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class ChunkConfig:
    """Chunk-stage configuration."""
    # chunking strategy
    strategy: Literal["basic", "by_title", "by_page"] = "basic"
    # whether the original elements are included
    include_orig_elements: bool = False
    # start a new chunk after this many characters
    new_after_n_chars: int = 512
    # maximum number of characters
    max_characters: int = 1024
    # number of overlapping characters
    overlap: int = 0
    # NOTE(review): meaning not visible in this file - confirm against the API
    overlap_all: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class EmbedConfig:
    """Embed-stage configuration: an embedding provider and one of its models."""
    provider: Literal["qwen", "doubao"] = "qwen"
    model_name: Literal[
        "text-embedding-v3",
        "text-embedding-v4",
        "doubao-embedding-large-text-250515",
        "doubao-embedding-text-240715"
    ] = "text-embedding-v3"

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dictionary."""
        return asdict(self)

    def validate(self) -> None:
        """Check that ``model_name`` belongs to ``provider``.

        Raises:
            ValueError: if the provider is unknown, or if the model is not one
                of the models listed for that provider.
        """
        provider_models = {
            "qwen": ["text-embedding-v3", "text-embedding-v4"],
            "doubao": ["doubao-embedding-large-text-250515", "doubao-embedding-text-240715"]
        }
        allowed = provider_models.get(self.provider)
        if allowed is None:
            raise ValueError(f"不支持的 provider: {self.provider}, 支持的有: {list(provider_models.keys())}")
        if self.model_name in allowed:
            return
        raise ValueError(
            f"provider '{self.provider}' 不支持模型 '{self.model_name}', 支持的模型: {provider_models[self.provider]}"
        )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class Stage:
    """One pipeline stage: a stage type plus that stage's configuration."""
    type: Literal["parse", "chunk", "embed"]
    config: Union[ParseConfig, ChunkConfig, EmbedConfig, Dict[str, Any]]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for use in an API request.

        A config object is converted through its own ``to_dict``; anything
        else (e.g. a config that is already a dictionary) is used as-is.
        """
        payload = self.config
        if isinstance(payload, (ParseConfig, ChunkConfig, EmbedConfig)):
            payload = payload.to_dict()
        return {
            'type': self.type,
            'config': payload
        }

    def __repr__(self) -> str:
        return f"Stage(type={self.type!r}, config={self.config!r})"
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class PipelineStats:
    """Pipeline statistics: element counters plus the executed stages."""
    # number of original elements
    original_elements: int = 0
    # number of elements after chunking
    chunked_elements: int = 0
    # number of embedded elements
    embedded_elements: int = 0
    # the stages that were actually executed
    stages: Optional[List[Stage]] = None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass
class PipelineConfig:
    """Pipeline-level configuration."""
    include_intermediate_results: bool = False
    # Any Destination is accepted here (e.g. LocalDestination, S3Destination).
    intermediate_results_destination: Optional[Destination] = None

    def __post_init__(self):
        """Validate the configuration.

        Raises:
            ValueError: if intermediate results are requested but no
                destination for them is set.
        """
        if not self.include_intermediate_results:
            return
        if self.intermediate_results_destination:
            return
        raise ValueError("当 include_intermediate_results 为 True 时,必须设置 intermediate_results_destination")

    def to_dict(self) -> Dict[str, Any]:
        """Return the serialisable part of the configuration (the flag only)."""
        flag = self.include_intermediate_results
        return {
            'include_intermediate_results': flag,
        }
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# Public names exported by this module.
__all__ = [
    'ParseConfig', 'ChunkConfig', 'EmbedConfig',
    'Stage',
    'PipelineStats', 'PipelineConfig',
]
|
|
128
|
+
|