xparse-client 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
example/run_pipeline.py CHANGED
@@ -15,8 +15,8 @@ from xparse_client import create_pipeline_from_config, S3Source, LocalSource, Mi
15
15
 
16
16
  # API 请求头配置
17
17
  API_HEADERS = {
18
- 'x-ti-app-id': '',
19
- 'x-ti-secret-code': ''
18
+ 'x-ti-app-id': '4c0032d9e4d93b0ad674cac0d75256e7',
19
+ 'x-ti-secret-code': '7104f599ad02b8468fc619f7605d2d8d'
20
20
  }
21
21
 
22
22
 
@@ -168,8 +168,15 @@ def run_with_manual_setup():
168
168
  # dimension=1024
169
169
  # )
170
170
 
171
- destination = LocalDestination(
172
- output_dir='./result'
171
+ # destination = LocalDestination(
172
+ # output_dir='./result'
173
+ # )
174
+
175
+ destination = MilvusDestination(
176
+ db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
177
+ collection_name='textin_test_2', # 数据库collection名称
178
+ dimension=1024, # 向量维度,需与 embed API 返回一致
179
+ api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46' # Zilliz Cloud API Key
173
180
  )
174
181
 
175
182
  # destination = S3Destination(
@@ -185,7 +192,7 @@ def run_with_manual_setup():
185
192
  stages = [
186
193
  Stage(
187
194
  type='parse',
188
- config=ParseConfig(provider='textin')
195
+ config=ParseConfig(provider='textin-lite')
189
196
  ),
190
197
  Stage(
191
198
  type='chunk',
@@ -196,6 +203,13 @@ def run_with_manual_setup():
196
203
  max_characters=1024,
197
204
  overlap=50 # 块之间重叠 50 字符
198
205
  )
206
+ ),
207
+ Stage(
208
+ type='embed',
209
+ config=EmbedConfig(
210
+ provider='qwen',
211
+ model_name='text-embedding-v3'
212
+ )
199
213
  )
200
214
  ]
201
215
 
@@ -9,7 +9,7 @@ from .destinations import Destination
9
9
  class ParseConfig:
10
10
  """Parse 配置,支持动态字段"""
11
11
 
12
- def __init__(self, provider: Literal["textin", "mineru", "paddle"] = "textin", **kwargs):
12
+ def __init__(self, provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin", **kwargs):
13
13
  self.provider = provider
14
14
  for key, value in kwargs.items():
15
15
  setattr(self, key, value)
@@ -79,17 +79,47 @@ class MilvusDestination(Destination):
79
79
  try:
80
80
  insert_data = []
81
81
  for item in data:
82
- metadata = item.get('metadata', {})
82
+ # 获取元素级别的 metadata
83
+ element_metadata = item.get('metadata', {})
84
+
83
85
  if 'embeddings' in item and item['embeddings']:
84
86
  element_id = item.get('element_id') or item.get('id') or str(uuid.uuid4())
85
- insert_data.append({
87
+
88
+ # 构建基础数据
89
+ insert_item = {
86
90
  'embeddings': item['embeddings'],
87
91
  'text': item.get('text', ''),
88
92
  'element_id': element_id,
89
- 'record_id': metadata.get('record_id', ''),
90
- 'metadata': metadata,
93
+ 'record_id': element_metadata.get('record_id', ''),
91
94
  'created_at': datetime.now().isoformat()
92
- })
95
+ }
96
+
97
+ # 合并文件级别的 metadata 和元素级别的 metadata
98
+ # 文件级别的 metadata 优先级更高
99
+ merged_metadata = {**element_metadata, **metadata}
100
+
101
+ # 将 metadata 中的字段展平到顶层作为动态字段
102
+ # 排除已存在的固定字段,避免冲突
103
+ fixed_fields = {'embeddings', 'text', 'element_id', 'record_id', 'created_at', 'metadata'}
104
+ for key, value in merged_metadata.items():
105
+ if key not in fixed_fields:
106
+ # 特殊处理 data_source 字段:如果是字典则展平
107
+ if key == 'data_source' and isinstance(value, dict):
108
+ # 将 data_source 字典展平为 data_source_* 格式
109
+ for sub_key, sub_value in value.items():
110
+ flat_key = f'data_source_{sub_key}'
111
+ if flat_key not in fixed_fields:
112
+ # 如果子值也是字典或列表,转换为 JSON 字符串
113
+ if isinstance(sub_value, (dict, list)):
114
+ insert_item[flat_key] = json.dumps(sub_value, ensure_ascii=False)
115
+ else:
116
+ insert_item[flat_key] = sub_value
117
+ elif isinstance(value, (dict, list)):
118
+ continue
119
+ else:
120
+ insert_item[key] = value
121
+
122
+ insert_data.append(insert_item)
93
123
 
94
124
  if not insert_data:
95
125
  print(f" ! 警告: 没有有效的向量数据")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -0,0 +1,13 @@
1
+ example/run_pipeline.py,sha256=6gavTizAIqD62g4n9Pjq2-yW57ItZMJOOw8GEKm0Byk,15125
2
+ example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
+ xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
+ xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
+ xparse_client/pipeline/config.py,sha256=gkhAF-55PNvPPyfTZ0HkP95XB_K0HKCyYl6R4PTQLhI,4045
6
+ xparse_client/pipeline/destinations.py,sha256=rqcxmsn1YGClVxGQxSVmyr-uumOVilOv_vX82fUBj-I,9859
7
+ xparse_client/pipeline/pipeline.py,sha256=oz_BKWLbslkuRsxG0zEfh9url7saLWgtoTH1mrK6gCc,18282
8
+ xparse_client/pipeline/sources.py,sha256=-0Eutg9t8xni12cfv2bdQVdImlkCQ7gWlOXIFBt6tpE,11568
9
+ xparse_client-0.2.4.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
+ xparse_client-0.2.4.dist-info/METADATA,sha256=IMgXO9a7wnN0Ygzauk7eOkyrRFE3A2rq73eofvq3wBs,26508
11
+ xparse_client-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ xparse_client-0.2.4.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
+ xparse_client-0.2.4.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- example/run_pipeline.py,sha256=UNG18bBqTqFlkRp4liVtJFpXXnuiZQ6nuGO3LbeksCY,14424
2
- example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
- xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
- xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
- xparse_client/pipeline/config.py,sha256=n2T6Ptma7pAyfcbUcULsvIPNR__yXIIznojHxm_g0Rw,4030
6
- xparse_client/pipeline/destinations.py,sha256=JqTuSqRqJFpMdeDYlvevdF32Cti40QL1oR1pMv-uxuc,7976
7
- xparse_client/pipeline/pipeline.py,sha256=oz_BKWLbslkuRsxG0zEfh9url7saLWgtoTH1mrK6gCc,18282
8
- xparse_client/pipeline/sources.py,sha256=-0Eutg9t8xni12cfv2bdQVdImlkCQ7gWlOXIFBt6tpE,11568
9
- xparse_client-0.2.3.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
- xparse_client-0.2.3.dist-info/METADATA,sha256=aumlSAr6sgrfppsMCQy_hUtkTJW4bZn70p6dhnO1nHw,26508
11
- xparse_client-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
- xparse_client-0.2.3.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
- xparse_client-0.2.3.dist-info/RECORD,,