xparse-client 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,41 @@ from pymilvus import MilvusClient
18
18
  logger = logging.getLogger(__name__)
19
19
 
20
20
 
21
+ def _flatten_dict(data: Dict[str, Any], prefix: str = '', fixed_fields: set = None) -> Dict[str, Any]:
22
+ """递归展平嵌套字典
23
+
24
+ Args:
25
+ data: 要展平的字典
26
+ prefix: 键的前缀
27
+ fixed_fields: 需要排除的字段集合
28
+
29
+ Returns:
30
+ 展平后的字典
31
+ """
32
+ if fixed_fields is None:
33
+ fixed_fields = set()
34
+
35
+ result = {}
36
+ for key, value in data.items():
37
+ flat_key = f'{prefix}_{key}' if prefix else key
38
+
39
+ if flat_key in fixed_fields:
40
+ continue
41
+
42
+ if isinstance(value, dict):
43
+ # 递归展平嵌套字典
44
+ nested = _flatten_dict(value, flat_key, fixed_fields)
45
+ result.update(nested)
46
+ elif isinstance(value, list):
47
+ # 列表转换为 JSON 字符串
48
+ result[flat_key] = json.dumps(value, ensure_ascii=False)
49
+ else:
50
+ # 其他类型直接使用
51
+ result[flat_key] = value
52
+
53
+ return result
54
+
55
+
21
56
  class Destination(ABC):
22
57
  """数据目的地抽象基类"""
23
58
 
@@ -90,8 +125,7 @@ class MilvusDestination(Destination):
90
125
  'embeddings': item['embeddings'],
91
126
  'text': item.get('text', ''),
92
127
  'element_id': element_id,
93
- 'record_id': element_metadata.get('record_id', ''),
94
- 'created_at': datetime.now().isoformat()
128
+ 'record_id': element_metadata.get('record_id', '')
95
129
  }
96
130
 
97
131
  # 合并文件级别的 metadata 和元素级别的 metadata
@@ -103,17 +137,13 @@ class MilvusDestination(Destination):
103
137
  fixed_fields = {'embeddings', 'text', 'element_id', 'record_id', 'created_at', 'metadata'}
104
138
  for key, value in merged_metadata.items():
105
139
  if key not in fixed_fields:
106
- # 特殊处理 data_source 字段:如果是字典则展平
140
+ # 特殊处理 data_source 字段:如果是字典则递归展平
107
141
  if key == 'data_source' and isinstance(value, dict):
108
- # data_source 字典展平为 data_source_* 格式
109
- for sub_key, sub_value in value.items():
110
- flat_key = f'data_source_{sub_key}'
111
- if flat_key not in fixed_fields:
112
- # 如果子值也是字典或列表,转换为 JSON 字符串
113
- if isinstance(sub_value, (dict, list)):
114
- insert_item[flat_key] = json.dumps(sub_value, ensure_ascii=False)
115
- else:
116
- insert_item[flat_key] = sub_value
142
+ # 递归展平 data_source 字典,包括嵌套的字典
143
+ flattened = _flatten_dict(value, 'data_source', fixed_fields)
144
+ insert_item.update(flattened)
145
+ elif key == 'coordinates' and isinstance(value, list):
146
+ insert_item[key] = value
117
147
  elif isinstance(value, (dict, list)):
118
148
  continue
119
149
  else:
@@ -198,7 +198,9 @@ class Pipeline:
198
198
  print(f" → 读取文件...")
199
199
  file_bytes, data_source = self.source.read_file(file_path)
200
200
  data_source = data_source or {}
201
- data_source['date_processed'] = datetime.now(timezone.utc).timestamp()
201
+ # 转换为毫秒时间戳字符串
202
+ timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
203
+ data_source['date_processed'] = str(timestamp_ms)
202
204
  print(f" ✓ 文件读取完成: {len(file_bytes)} bytes")
203
205
 
204
206
  result = self.process_with_pipeline(file_bytes, file_path, data_source)
@@ -209,15 +211,8 @@ class Pipeline:
209
211
 
210
212
  print(f" → 写入目的地...")
211
213
  metadata = {
212
- 'file_name': file_path,
213
- 'total_elements': len(embedded_data),
214
- 'processed_at': datetime.now().isoformat(),
215
- 'data_source': data_source,
216
- 'stats': {
217
- 'original_elements': stats.original_elements,
218
- 'chunked_elements': stats.chunked_elements,
219
- 'embedded_elements': stats.embedded_elements
220
- }
214
+ 'filename': file_path,
215
+ 'processed_at': str(timestamp_ms),
221
216
  }
222
217
 
223
218
  success = self.destination.write(embedded_data, metadata)
@@ -20,6 +20,30 @@ from botocore.config import Config
20
20
  logger = logging.getLogger(__name__)
21
21
 
22
22
 
23
+ def _to_millis_timestamp_string(timestamp):
24
+ """将时间戳转换为毫秒时间戳字符串
25
+
26
+ Args:
27
+ timestamp: 时间戳(秒或毫秒),可以是 int、float 或 None
28
+
29
+ Returns:
30
+ str: 毫秒时间戳字符串,如果输入为 None 则返回空字符串
31
+ """
32
+ if timestamp is None:
33
+ return ""
34
+
35
+ # 如果已经是毫秒时间戳(大于 1e12),直接转换
36
+ if isinstance(timestamp, (int, float)):
37
+ if timestamp > 1e12:
38
+ # 已经是毫秒时间戳
39
+ return str(int(timestamp))
40
+ else:
41
+ # 秒级时间戳,转换为毫秒
42
+ return str(int(timestamp * 1000))
43
+
44
+ return str(timestamp)
45
+
46
+
23
47
  class Source(ABC):
24
48
  """数据源抽象基类"""
25
49
 
@@ -106,8 +130,8 @@ class S3Source(Source):
106
130
  data_source = {
107
131
  'url': f"s3://{self.bucket}/{normalized_key}",
108
132
  'version': version,
109
- 'date_created': date_modified,
110
- 'date_modified': date_modified,
133
+ 'date_created': _to_millis_timestamp_string(date_modified),
134
+ 'date_modified': _to_millis_timestamp_string(date_modified),
111
135
  'record_locator': {
112
136
  'server': server,
113
137
  'protocol': 's3',
@@ -159,8 +183,8 @@ class LocalSource(Source):
159
183
  data_source = {
160
184
  'url': full_path.as_uri(),
161
185
  'version': version,
162
- 'date_created': date_created,
163
- 'date_modified': date_modified,
186
+ 'date_created': _to_millis_timestamp_string(date_created),
187
+ 'date_modified': _to_millis_timestamp_string(date_modified),
164
188
  'record_locator': {
165
189
  'protocol': 'file',
166
190
  'remote_file_path': str(full_path)
@@ -211,8 +235,8 @@ class FtpSource(Source):
211
235
  data_source = {
212
236
  'url': f"ftp://{self.host}:{self.port}/{normalized_path}",
213
237
  'version': None,
214
- 'date_created': None,
215
- 'date_modified': date_modified,
238
+ 'date_created': "",
239
+ 'date_modified': _to_millis_timestamp_string(date_modified),
216
240
  'record_locator': {
217
241
  'server': f"{self.host}:{self.port}",
218
242
  'protocol': 'ftp',
@@ -311,8 +335,8 @@ class SmbSource(Source):
311
335
  data_source = {
312
336
  'url': smb_url,
313
337
  'version': None,
314
- 'date_created': date_created,
315
- 'date_modified': date_modified,
338
+ 'date_created': _to_millis_timestamp_string(date_created),
339
+ 'date_modified': _to_millis_timestamp_string(date_modified),
316
340
  'record_locator': {
317
341
  'server': self.host,
318
342
  'share': self.share_name,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.4
3
+ Version: 0.2.5
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -331,9 +331,9 @@ source = SmbSource(
331
331
 
332
332
  ```python
333
333
  destination = MilvusDestination(
334
- db_path: './milvus_pipeline.db', # 本地数据库文件
335
- collection_name: 'my_collection', # 数据库collection名称
336
- dimension: 1024 # 向量维度,需与 embed API 返回一致
334
+ db_path='./milvus_pipeline.db', # 本地数据库文件
335
+ collection_name='my_collection', # 数据库collection名称
336
+ dimension=1024 # 向量维度,需与 embed API 返回一致
337
337
  )
338
338
  ```
339
339
 
@@ -341,10 +341,10 @@ destination = MilvusDestination(
341
341
 
342
342
  ```python
343
343
  destination = MilvusDestination(
344
- db_path: 'https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn', # zilliz连接地址
345
- collection_name: 'my_collection', # 数据库collection名称
346
- dimension: 1024, # 向量维度,需与 embed API 返回一致
347
- api_key: 'your-api-key' # Zilliz Cloud API Key
344
+ db_path='https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn', # zilliz连接地址
345
+ collection_name='my_collection', # 数据库collection名称
346
+ dimension=1024, # 向量维度,需与 embed API 返回一致
347
+ api_key='your-api-key' # Zilliz Cloud API Key
348
348
  )
349
349
  ```
350
350
 
@@ -354,7 +354,7 @@ destination = MilvusDestination(
354
354
 
355
355
  ```python
356
356
  destination = LocalDestination(
357
- output_dir: './output'
357
+ output_dir='./output'
358
358
  )
359
359
  ```
360
360
 
@@ -0,0 +1,13 @@
1
+ example/run_pipeline.py,sha256=6gavTizAIqD62g4n9Pjq2-yW57ItZMJOOw8GEKm0Byk,15125
2
+ example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
+ xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
+ xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
+ xparse_client/pipeline/config.py,sha256=gkhAF-55PNvPPyfTZ0HkP95XB_K0HKCyYl6R4PTQLhI,4045
6
+ xparse_client/pipeline/destinations.py,sha256=dXtsYw6xu_pBH8GKu9zKRzD6qP6Krgg-l1zCTzj8PhU,10475
7
+ xparse_client/pipeline/pipeline.py,sha256=nWebrMGJjxTKF1de-7DeNtyozuUOkh3zbAqv-Q78Ev4,18039
8
+ xparse_client/pipeline/sources.py,sha256=NdME4mTSzC7JXXhaw985Tt_nc0y_aRUl2NEALm8xLII,12484
9
+ xparse_client-0.2.5.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
+ xparse_client-0.2.5.dist-info/METADATA,sha256=iN0qG5s4bc1hAueNx0VucDekicwdF-kQ5IzUWH2tJd4,26500
11
+ xparse_client-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ xparse_client-0.2.5.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
+ xparse_client-0.2.5.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- example/run_pipeline.py,sha256=6gavTizAIqD62g4n9Pjq2-yW57ItZMJOOw8GEKm0Byk,15125
2
- example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
- xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
- xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
- xparse_client/pipeline/config.py,sha256=gkhAF-55PNvPPyfTZ0HkP95XB_K0HKCyYl6R4PTQLhI,4045
6
- xparse_client/pipeline/destinations.py,sha256=rqcxmsn1YGClVxGQxSVmyr-uumOVilOv_vX82fUBj-I,9859
7
- xparse_client/pipeline/pipeline.py,sha256=oz_BKWLbslkuRsxG0zEfh9url7saLWgtoTH1mrK6gCc,18282
8
- xparse_client/pipeline/sources.py,sha256=-0Eutg9t8xni12cfv2bdQVdImlkCQ7gWlOXIFBt6tpE,11568
9
- xparse_client-0.2.4.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
- xparse_client-0.2.4.dist-info/METADATA,sha256=IMgXO9a7wnN0Ygzauk7eOkyrRFE3A2rq73eofvq3wBs,26508
11
- xparse_client-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
- xparse_client-0.2.4.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
- xparse_client-0.2.4.dist-info/RECORD,,