xparse-client 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
example/run_pipeline.py CHANGED
@@ -15,8 +15,8 @@ from xparse_client import create_pipeline_from_config, S3Source, LocalSource, Mi
15
15
 
16
16
  # API 请求头配置
17
17
  API_HEADERS = {
18
- 'x-ti-app-id': '',
19
- 'x-ti-secret-code': ''
18
+ 'x-ti-app-id': '4c0032d9e4d93b0ad674cac0d75256e7',
19
+ 'x-ti-secret-code': '7104f599ad02b8468fc619f7605d2d8d'
20
20
  }
21
21
 
22
22
 
@@ -168,8 +168,15 @@ def run_with_manual_setup():
168
168
  # dimension=1024
169
169
  # )
170
170
 
171
- destination = LocalDestination(
172
- output_dir='./result'
171
+ # destination = LocalDestination(
172
+ # output_dir='./result'
173
+ # )
174
+
175
+ destination = MilvusDestination(
176
+ db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
177
+ collection_name='textin_test_2', # 数据库collection名称
178
+ dimension=1024, # 向量维度,需与 embed API 返回一致
179
+ api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46' # Zilliz Cloud API Key
173
180
  )
174
181
 
175
182
  # destination = S3Destination(
@@ -185,7 +192,7 @@ def run_with_manual_setup():
185
192
  stages = [
186
193
  Stage(
187
194
  type='parse',
188
- config=ParseConfig(provider='textin')
195
+ config=ParseConfig(provider='textin-lite')
189
196
  ),
190
197
  Stage(
191
198
  type='chunk',
@@ -196,6 +203,13 @@ def run_with_manual_setup():
196
203
  max_characters=1024,
197
204
  overlap=50 # 块之间重叠 50 字符
198
205
  )
206
+ ),
207
+ Stage(
208
+ type='embed',
209
+ config=EmbedConfig(
210
+ provider='qwen',
211
+ model_name='text-embedding-v3'
212
+ )
199
213
  )
200
214
  ]
201
215
 
@@ -9,7 +9,7 @@ from .destinations import Destination
9
9
  class ParseConfig:
10
10
  """Parse 配置,支持动态字段"""
11
11
 
12
- def __init__(self, provider: Literal["textin", "mineru", "paddle"] = "textin", **kwargs):
12
+ def __init__(self, provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin", **kwargs):
13
13
  self.provider = provider
14
14
  for key, value in kwargs.items():
15
15
  setattr(self, key, value)
@@ -18,6 +18,41 @@ from pymilvus import MilvusClient
18
18
  logger = logging.getLogger(__name__)
19
19
 
20
20
 
21
+ def _flatten_dict(data: Dict[str, Any], prefix: str = '', fixed_fields: set = None) -> Dict[str, Any]:
22
+ """递归展平嵌套字典
23
+
24
+ Args:
25
+ data: 要展平的字典
26
+ prefix: 键的前缀
27
+ fixed_fields: 需要排除的字段集合
28
+
29
+ Returns:
30
+ 展平后的字典
31
+ """
32
+ if fixed_fields is None:
33
+ fixed_fields = set()
34
+
35
+ result = {}
36
+ for key, value in data.items():
37
+ flat_key = f'{prefix}_{key}' if prefix else key
38
+
39
+ if flat_key in fixed_fields:
40
+ continue
41
+
42
+ if isinstance(value, dict):
43
+ # 递归展平嵌套字典
44
+ nested = _flatten_dict(value, flat_key, fixed_fields)
45
+ result.update(nested)
46
+ elif isinstance(value, list):
47
+ # 列表转换为 JSON 字符串
48
+ result[flat_key] = json.dumps(value, ensure_ascii=False)
49
+ else:
50
+ # 其他类型直接使用
51
+ result[flat_key] = value
52
+
53
+ return result
54
+
55
+
21
56
  class Destination(ABC):
22
57
  """数据目的地抽象基类"""
23
58
 
@@ -79,17 +114,42 @@ class MilvusDestination(Destination):
79
114
  try:
80
115
  insert_data = []
81
116
  for item in data:
82
- metadata = item.get('metadata', {})
117
+ # 获取元素级别的 metadata
118
+ element_metadata = item.get('metadata', {})
119
+
83
120
  if 'embeddings' in item and item['embeddings']:
84
121
  element_id = item.get('element_id') or item.get('id') or str(uuid.uuid4())
85
- insert_data.append({
122
+
123
+ # 构建基础数据
124
+ insert_item = {
86
125
  'embeddings': item['embeddings'],
87
126
  'text': item.get('text', ''),
88
127
  'element_id': element_id,
89
- 'record_id': metadata.get('record_id', ''),
90
- 'metadata': metadata,
91
- 'created_at': datetime.now().isoformat()
92
- })
128
+ 'record_id': element_metadata.get('record_id', '')
129
+ }
130
+
131
+ # 合并文件级别的 metadata 和元素级别的 metadata
132
+ # 文件级别的 metadata 优先级更高
133
+ merged_metadata = {**element_metadata, **metadata}
134
+
135
+ # 将 metadata 中的字段展平到顶层作为动态字段
136
+ # 排除已存在的固定字段,避免冲突
137
+ fixed_fields = {'embeddings', 'text', 'element_id', 'record_id', 'created_at', 'metadata'}
138
+ for key, value in merged_metadata.items():
139
+ if key not in fixed_fields:
140
+ # 特殊处理 data_source 字段:如果是字典则递归展平
141
+ if key == 'data_source' and isinstance(value, dict):
142
+ # 递归展平 data_source 字典,包括嵌套的字典
143
+ flattened = _flatten_dict(value, 'data_source', fixed_fields)
144
+ insert_item.update(flattened)
145
+ elif key == 'coordinates' and isinstance(value, list):
146
+ insert_item[key] = value
147
+ elif isinstance(value, (dict, list)):
148
+ continue
149
+ else:
150
+ insert_item[key] = value
151
+
152
+ insert_data.append(insert_item)
93
153
 
94
154
  if not insert_data:
95
155
  print(f" ! 警告: 没有有效的向量数据")
@@ -198,7 +198,9 @@ class Pipeline:
198
198
  print(f" → 读取文件...")
199
199
  file_bytes, data_source = self.source.read_file(file_path)
200
200
  data_source = data_source or {}
201
- data_source['date_processed'] = datetime.now(timezone.utc).timestamp()
201
+ # 转换为毫秒时间戳字符串
202
+ timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
203
+ data_source['date_processed'] = str(timestamp_ms)
202
204
  print(f" ✓ 文件读取完成: {len(file_bytes)} bytes")
203
205
 
204
206
  result = self.process_with_pipeline(file_bytes, file_path, data_source)
@@ -209,15 +211,8 @@ class Pipeline:
209
211
 
210
212
  print(f" → 写入目的地...")
211
213
  metadata = {
212
- 'file_name': file_path,
213
- 'total_elements': len(embedded_data),
214
- 'processed_at': datetime.now().isoformat(),
215
- 'data_source': data_source,
216
- 'stats': {
217
- 'original_elements': stats.original_elements,
218
- 'chunked_elements': stats.chunked_elements,
219
- 'embedded_elements': stats.embedded_elements
220
- }
214
+ 'filename': file_path,
215
+ 'processed_at': str(timestamp_ms),
221
216
  }
222
217
 
223
218
  success = self.destination.write(embedded_data, metadata)
@@ -20,6 +20,30 @@ from botocore.config import Config
20
20
  logger = logging.getLogger(__name__)
21
21
 
22
22
 
23
+ def _to_millis_timestamp_string(timestamp):
24
+ """将时间戳转换为毫秒时间戳字符串
25
+
26
+ Args:
27
+ timestamp: 时间戳(秒或毫秒),可以是 int、float 或 None
28
+
29
+ Returns:
30
+ str: 毫秒时间戳字符串,如果输入为 None 则返回空字符串
31
+ """
32
+ if timestamp is None:
33
+ return ""
34
+
35
+ # 如果已经是毫秒时间戳(大于 1e12),直接转换
36
+ if isinstance(timestamp, (int, float)):
37
+ if timestamp > 1e12:
38
+ # 已经是毫秒时间戳
39
+ return str(int(timestamp))
40
+ else:
41
+ # 秒级时间戳,转换为毫秒
42
+ return str(int(timestamp * 1000))
43
+
44
+ return str(timestamp)
45
+
46
+
23
47
  class Source(ABC):
24
48
  """数据源抽象基类"""
25
49
 
@@ -106,8 +130,8 @@ class S3Source(Source):
106
130
  data_source = {
107
131
  'url': f"s3://{self.bucket}/{normalized_key}",
108
132
  'version': version,
109
- 'date_created': date_modified,
110
- 'date_modified': date_modified,
133
+ 'date_created': _to_millis_timestamp_string(date_modified),
134
+ 'date_modified': _to_millis_timestamp_string(date_modified),
111
135
  'record_locator': {
112
136
  'server': server,
113
137
  'protocol': 's3',
@@ -159,8 +183,8 @@ class LocalSource(Source):
159
183
  data_source = {
160
184
  'url': full_path.as_uri(),
161
185
  'version': version,
162
- 'date_created': date_created,
163
- 'date_modified': date_modified,
186
+ 'date_created': _to_millis_timestamp_string(date_created),
187
+ 'date_modified': _to_millis_timestamp_string(date_modified),
164
188
  'record_locator': {
165
189
  'protocol': 'file',
166
190
  'remote_file_path': str(full_path)
@@ -211,8 +235,8 @@ class FtpSource(Source):
211
235
  data_source = {
212
236
  'url': f"ftp://{self.host}:{self.port}/{normalized_path}",
213
237
  'version': None,
214
- 'date_created': None,
215
- 'date_modified': date_modified,
238
+ 'date_created': "",
239
+ 'date_modified': _to_millis_timestamp_string(date_modified),
216
240
  'record_locator': {
217
241
  'server': f"{self.host}:{self.port}",
218
242
  'protocol': 'ftp',
@@ -311,8 +335,8 @@ class SmbSource(Source):
311
335
  data_source = {
312
336
  'url': smb_url,
313
337
  'version': None,
314
- 'date_created': date_created,
315
- 'date_modified': date_modified,
338
+ 'date_created': _to_millis_timestamp_string(date_created),
339
+ 'date_modified': _to_millis_timestamp_string(date_modified),
316
340
  'record_locator': {
317
341
  'server': self.host,
318
342
  'share': self.share_name,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -331,9 +331,9 @@ source = SmbSource(
331
331
 
332
332
  ```python
333
333
  destination = MilvusDestination(
334
- db_path: './milvus_pipeline.db', # 本地数据库文件
335
- collection_name: 'my_collection', # 数据库collection名称
336
- dimension: 1024 # 向量维度,需与 embed API 返回一致
334
+ db_path='./milvus_pipeline.db', # 本地数据库文件
335
+ collection_name='my_collection', # 数据库collection名称
336
+ dimension=1024 # 向量维度,需与 embed API 返回一致
337
337
  )
338
338
  ```
339
339
 
@@ -341,10 +341,10 @@ destination = MilvusDestination(
341
341
 
342
342
  ```python
343
343
  destination = MilvusDestination(
344
- db_path: 'https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn', # zilliz连接地址
345
- collection_name: 'my_collection', # 数据库collection名称
346
- dimension: 1024, # 向量维度,需与 embed API 返回一致
347
- api_key: 'your-api-key' # Zilliz Cloud API Key
344
+ db_path='https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn', # zilliz连接地址
345
+ collection_name='my_collection', # 数据库collection名称
346
+ dimension=1024, # 向量维度,需与 embed API 返回一致
347
+ api_key='your-api-key' # Zilliz Cloud API Key
348
348
  )
349
349
  ```
350
350
 
@@ -354,7 +354,7 @@ destination = MilvusDestination(
354
354
 
355
355
  ```python
356
356
  destination = LocalDestination(
357
- output_dir: './output'
357
+ output_dir='./output'
358
358
  )
359
359
  ```
360
360
 
@@ -0,0 +1,13 @@
1
+ example/run_pipeline.py,sha256=6gavTizAIqD62g4n9Pjq2-yW57ItZMJOOw8GEKm0Byk,15125
2
+ example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
+ xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
+ xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
+ xparse_client/pipeline/config.py,sha256=gkhAF-55PNvPPyfTZ0HkP95XB_K0HKCyYl6R4PTQLhI,4045
6
+ xparse_client/pipeline/destinations.py,sha256=dXtsYw6xu_pBH8GKu9zKRzD6qP6Krgg-l1zCTzj8PhU,10475
7
+ xparse_client/pipeline/pipeline.py,sha256=nWebrMGJjxTKF1de-7DeNtyozuUOkh3zbAqv-Q78Ev4,18039
8
+ xparse_client/pipeline/sources.py,sha256=NdME4mTSzC7JXXhaw985Tt_nc0y_aRUl2NEALm8xLII,12484
9
+ xparse_client-0.2.5.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
+ xparse_client-0.2.5.dist-info/METADATA,sha256=iN0qG5s4bc1hAueNx0VucDekicwdF-kQ5IzUWH2tJd4,26500
11
+ xparse_client-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ xparse_client-0.2.5.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
+ xparse_client-0.2.5.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- example/run_pipeline.py,sha256=UNG18bBqTqFlkRp4liVtJFpXXnuiZQ6nuGO3LbeksCY,14424
2
- example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
- xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
- xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
- xparse_client/pipeline/config.py,sha256=n2T6Ptma7pAyfcbUcULsvIPNR__yXIIznojHxm_g0Rw,4030
6
- xparse_client/pipeline/destinations.py,sha256=JqTuSqRqJFpMdeDYlvevdF32Cti40QL1oR1pMv-uxuc,7976
7
- xparse_client/pipeline/pipeline.py,sha256=oz_BKWLbslkuRsxG0zEfh9url7saLWgtoTH1mrK6gCc,18282
8
- xparse_client/pipeline/sources.py,sha256=-0Eutg9t8xni12cfv2bdQVdImlkCQ7gWlOXIFBt6tpE,11568
9
- xparse_client-0.2.3.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
- xparse_client-0.2.3.dist-info/METADATA,sha256=aumlSAr6sgrfppsMCQy_hUtkTJW4bZn70p6dhnO1nHw,26508
11
- xparse_client-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
- xparse_client-0.2.3.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
- xparse_client-0.2.3.dist-info/RECORD,,