xparse-client 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xparse_client-0.2.3 → xparse_client-0.2.4}/PKG-INFO +1 -1
- {xparse_client-0.2.3 → xparse_client-0.2.4}/example/run_pipeline.py +19 -5
- {xparse_client-0.2.3 → xparse_client-0.2.4}/pyproject.toml +1 -1
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client/pipeline/config.py +1 -1
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client/pipeline/destinations.py +35 -5
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client.egg-info/PKG-INFO +1 -1
- {xparse_client-0.2.3 → xparse_client-0.2.4}/LICENSE +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/README.md +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/example/run_pipeline_test.py +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/setup.cfg +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client/__init__.py +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client/pipeline/__init__.py +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client/pipeline/pipeline.py +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client/pipeline/sources.py +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client.egg-info/SOURCES.txt +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client.egg-info/dependency_links.txt +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client.egg-info/requires.txt +0 -0
- {xparse_client-0.2.3 → xparse_client-0.2.4}/xparse_client.egg-info/top_level.txt +0 -0
|
@@ -15,8 +15,8 @@ from xparse_client import create_pipeline_from_config, S3Source, LocalSource, Mi
|
|
|
15
15
|
|
|
16
16
|
# API 请求头配置
|
|
17
17
|
API_HEADERS = {
|
|
18
|
-
'x-ti-app-id': '',
|
|
19
|
-
'x-ti-secret-code': ''
|
|
18
|
+
'x-ti-app-id': '4c0032d9e4d93b0ad674cac0d75256e7',
|
|
19
|
+
'x-ti-secret-code': '7104f599ad02b8468fc619f7605d2d8d'
|
|
20
20
|
}
|
|
21
21
|
|
|
22
22
|
|
|
@@ -168,8 +168,15 @@ def run_with_manual_setup():
|
|
|
168
168
|
# dimension=1024
|
|
169
169
|
# )
|
|
170
170
|
|
|
171
|
-
destination = LocalDestination(
|
|
172
|
-
|
|
171
|
+
# destination = LocalDestination(
|
|
172
|
+
# output_dir='./result'
|
|
173
|
+
# )
|
|
174
|
+
|
|
175
|
+
destination = MilvusDestination(
|
|
176
|
+
db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
|
|
177
|
+
collection_name='textin_test_2', # 数据库collection名称
|
|
178
|
+
dimension=1024, # 向量维度,需与 embed API 返回一致
|
|
179
|
+
api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46' # Zilliz Cloud API Key
|
|
173
180
|
)
|
|
174
181
|
|
|
175
182
|
# destination = S3Destination(
|
|
@@ -185,7 +192,7 @@ def run_with_manual_setup():
|
|
|
185
192
|
stages = [
|
|
186
193
|
Stage(
|
|
187
194
|
type='parse',
|
|
188
|
-
config=ParseConfig(provider='textin')
|
|
195
|
+
config=ParseConfig(provider='textin-lite')
|
|
189
196
|
),
|
|
190
197
|
Stage(
|
|
191
198
|
type='chunk',
|
|
@@ -196,6 +203,13 @@ def run_with_manual_setup():
|
|
|
196
203
|
max_characters=1024,
|
|
197
204
|
overlap=50 # 块之间重叠 50 字符
|
|
198
205
|
)
|
|
206
|
+
),
|
|
207
|
+
Stage(
|
|
208
|
+
type='embed',
|
|
209
|
+
config=EmbedConfig(
|
|
210
|
+
provider='qwen',
|
|
211
|
+
model_name='text-embedding-v3'
|
|
212
|
+
)
|
|
199
213
|
)
|
|
200
214
|
]
|
|
201
215
|
|
|
@@ -9,7 +9,7 @@ from .destinations import Destination
|
|
|
9
9
|
class ParseConfig:
|
|
10
10
|
"""Parse 配置,支持动态字段"""
|
|
11
11
|
|
|
12
|
-
def __init__(self, provider: Literal["textin", "mineru", "paddle"] = "textin", **kwargs):
|
|
12
|
+
def __init__(self, provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin", **kwargs):
|
|
13
13
|
self.provider = provider
|
|
14
14
|
for key, value in kwargs.items():
|
|
15
15
|
setattr(self, key, value)
|
|
@@ -79,17 +79,47 @@ class MilvusDestination(Destination):
|
|
|
79
79
|
try:
|
|
80
80
|
insert_data = []
|
|
81
81
|
for item in data:
|
|
82
|
-
|
|
82
|
+
# 获取元素级别的 metadata
|
|
83
|
+
element_metadata = item.get('metadata', {})
|
|
84
|
+
|
|
83
85
|
if 'embeddings' in item and item['embeddings']:
|
|
84
86
|
element_id = item.get('element_id') or item.get('id') or str(uuid.uuid4())
|
|
85
|
-
|
|
87
|
+
|
|
88
|
+
# 构建基础数据
|
|
89
|
+
insert_item = {
|
|
86
90
|
'embeddings': item['embeddings'],
|
|
87
91
|
'text': item.get('text', ''),
|
|
88
92
|
'element_id': element_id,
|
|
89
|
-
'record_id':
|
|
90
|
-
'metadata': metadata,
|
|
93
|
+
'record_id': element_metadata.get('record_id', ''),
|
|
91
94
|
'created_at': datetime.now().isoformat()
|
|
92
|
-
}
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
# 合并文件级别的 metadata 和元素级别的 metadata
|
|
98
|
+
# 文件级别的 metadata 优先级更高
|
|
99
|
+
merged_metadata = {**element_metadata, **metadata}
|
|
100
|
+
|
|
101
|
+
# 将 metadata 中的字段展平到顶层作为动态字段
|
|
102
|
+
# 排除已存在的固定字段,避免冲突
|
|
103
|
+
fixed_fields = {'embeddings', 'text', 'element_id', 'record_id', 'created_at', 'metadata'}
|
|
104
|
+
for key, value in merged_metadata.items():
|
|
105
|
+
if key not in fixed_fields:
|
|
106
|
+
# 特殊处理 data_source 字段:如果是字典则展平
|
|
107
|
+
if key == 'data_source' and isinstance(value, dict):
|
|
108
|
+
# 将 data_source 字典展平为 data_source_* 格式
|
|
109
|
+
for sub_key, sub_value in value.items():
|
|
110
|
+
flat_key = f'data_source_{sub_key}'
|
|
111
|
+
if flat_key not in fixed_fields:
|
|
112
|
+
# 如果子值也是字典或列表,转换为 JSON 字符串
|
|
113
|
+
if isinstance(sub_value, (dict, list)):
|
|
114
|
+
insert_item[flat_key] = json.dumps(sub_value, ensure_ascii=False)
|
|
115
|
+
else:
|
|
116
|
+
insert_item[flat_key] = sub_value
|
|
117
|
+
elif isinstance(value, (dict, list)):
|
|
118
|
+
continue
|
|
119
|
+
else:
|
|
120
|
+
insert_item[key] = value
|
|
121
|
+
|
|
122
|
+
insert_data.append(insert_item)
|
|
93
123
|
|
|
94
124
|
if not insert_data:
|
|
95
125
|
print(f" ! 警告: 没有有效的向量数据")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|