xparse-client 0.2.4__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xparse_client-0.2.4 → xparse_client-0.2.5}/PKG-INFO +9 -9
- {xparse_client-0.2.4 → xparse_client-0.2.5}/README.md +8 -8
- {xparse_client-0.2.4 → xparse_client-0.2.5}/pyproject.toml +1 -1
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client/pipeline/destinations.py +42 -12
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client/pipeline/pipeline.py +5 -10
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client/pipeline/sources.py +32 -8
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client.egg-info/PKG-INFO +9 -9
- {xparse_client-0.2.4 → xparse_client-0.2.5}/LICENSE +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/example/run_pipeline.py +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/example/run_pipeline_test.py +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/setup.cfg +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client/__init__.py +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client/pipeline/__init__.py +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client/pipeline/config.py +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client.egg-info/SOURCES.txt +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client.egg-info/dependency_links.txt +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client.egg-info/requires.txt +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.5}/xparse_client.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xparse-client
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: 面向Agent和RAG的新一代文档处理 AI Infra
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
|
|
@@ -331,9 +331,9 @@ source = SmbSource(
|
|
|
331
331
|
|
|
332
332
|
```python
|
|
333
333
|
destination = MilvusDestination(
|
|
334
|
-
db_path
|
|
335
|
-
collection_name
|
|
336
|
-
dimension
|
|
334
|
+
db_path='./milvus_pipeline.db', # 本地数据库文件
|
|
335
|
+
collection_name='my_collection', # 数据库collection名称
|
|
336
|
+
dimension=1024 # 向量维度,需与 embed API 返回一致
|
|
337
337
|
)
|
|
338
338
|
```
|
|
339
339
|
|
|
@@ -341,10 +341,10 @@ destination = MilvusDestination(
|
|
|
341
341
|
|
|
342
342
|
```python
|
|
343
343
|
destination = MilvusDestination(
|
|
344
|
-
db_path
|
|
345
|
-
collection_name
|
|
346
|
-
dimension
|
|
347
|
-
api_key
|
|
344
|
+
db_path='https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn', # zilliz连接地址
|
|
345
|
+
collection_name='my_collection', # 数据库collection名称
|
|
346
|
+
dimension=1024, # 向量维度,需与 embed API 返回一致
|
|
347
|
+
api_key='your-api-key' # Zilliz Cloud API Key
|
|
348
348
|
)
|
|
349
349
|
```
|
|
350
350
|
|
|
@@ -354,7 +354,7 @@ destination = MilvusDestination(
|
|
|
354
354
|
|
|
355
355
|
```python
|
|
356
356
|
destination = LocalDestination(
|
|
357
|
-
output_dir
|
|
357
|
+
output_dir='./output'
|
|
358
358
|
)
|
|
359
359
|
```
|
|
360
360
|
|
|
@@ -314,9 +314,9 @@ source = SmbSource(
|
|
|
314
314
|
|
|
315
315
|
```python
|
|
316
316
|
destination = MilvusDestination(
|
|
317
|
-
db_path
|
|
318
|
-
collection_name
|
|
319
|
-
dimension
|
|
317
|
+
db_path='./milvus_pipeline.db', # 本地数据库文件
|
|
318
|
+
collection_name='my_collection', # 数据库collection名称
|
|
319
|
+
dimension=1024 # 向量维度,需与 embed API 返回一致
|
|
320
320
|
)
|
|
321
321
|
```
|
|
322
322
|
|
|
@@ -324,10 +324,10 @@ destination = MilvusDestination(
|
|
|
324
324
|
|
|
325
325
|
```python
|
|
326
326
|
destination = MilvusDestination(
|
|
327
|
-
db_path
|
|
328
|
-
collection_name
|
|
329
|
-
dimension
|
|
330
|
-
api_key
|
|
327
|
+
db_path='https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn', # zilliz连接地址
|
|
328
|
+
collection_name='my_collection', # 数据库collection名称
|
|
329
|
+
dimension=1024, # 向量维度,需与 embed API 返回一致
|
|
330
|
+
api_key='your-api-key' # Zilliz Cloud API Key
|
|
331
331
|
)
|
|
332
332
|
```
|
|
333
333
|
|
|
@@ -337,7 +337,7 @@ destination = MilvusDestination(
|
|
|
337
337
|
|
|
338
338
|
```python
|
|
339
339
|
destination = LocalDestination(
|
|
340
|
-
output_dir
|
|
340
|
+
output_dir='./output'
|
|
341
341
|
)
|
|
342
342
|
```
|
|
343
343
|
|
|
@@ -18,6 +18,41 @@ from pymilvus import MilvusClient
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
20
20
|
|
|
21
|
+
def _flatten_dict(data: Dict[str, Any], prefix: str = '', fixed_fields: set = None) -> Dict[str, Any]:
    """Recursively flatten a nested dictionary into a single level.

    Nested keys are joined to their parent key with an underscore, so
    ``{'a': {'b': 1}}`` becomes ``{'a_b': 1}``.

    Args:
        data: The dictionary to flatten.
        prefix: Key prefix prepended (with ``_``) to every produced key;
            an empty prefix leaves top-level keys unchanged.
        fixed_fields: Flattened key names to leave out of the result.
            ``None`` means nothing is excluded.

    Returns:
        A new flat dictionary. Nested dicts are expanded, lists are stored
        as JSON strings, and every other value is kept as-is.
    """
    excluded = fixed_fields if fixed_fields is not None else set()

    flattened = {}
    for name, item in data.items():
        path = name if not prefix else f'{prefix}_{name}'

        # The exclusion check uses the fully prefixed key, not the raw one.
        if path in excluded:
            continue

        if isinstance(item, dict):
            # Expand nested dictionaries in place, depth first.
            flattened.update(_flatten_dict(item, path, excluded))
            continue

        # Lists are serialised to a JSON string; anything else is stored directly.
        flattened[path] = (
            json.dumps(item, ensure_ascii=False) if isinstance(item, list) else item
        )

    return flattened
|
|
54
|
+
|
|
55
|
+
|
|
21
56
|
class Destination(ABC):
|
|
22
57
|
"""数据目的地抽象基类"""
|
|
23
58
|
|
|
@@ -90,8 +125,7 @@ class MilvusDestination(Destination):
|
|
|
90
125
|
'embeddings': item['embeddings'],
|
|
91
126
|
'text': item.get('text', ''),
|
|
92
127
|
'element_id': element_id,
|
|
93
|
-
'record_id': element_metadata.get('record_id', '')
|
|
94
|
-
'created_at': datetime.now().isoformat()
|
|
128
|
+
'record_id': element_metadata.get('record_id', '')
|
|
95
129
|
}
|
|
96
130
|
|
|
97
131
|
# 合并文件级别的 metadata 和元素级别的 metadata
|
|
@@ -103,17 +137,13 @@ class MilvusDestination(Destination):
|
|
|
103
137
|
fixed_fields = {'embeddings', 'text', 'element_id', 'record_id', 'created_at', 'metadata'}
|
|
104
138
|
for key, value in merged_metadata.items():
|
|
105
139
|
if key not in fixed_fields:
|
|
106
|
-
# 特殊处理 data_source
|
|
140
|
+
# 特殊处理 data_source 字段:如果是字典则递归展平
|
|
107
141
|
if key == 'data_source' and isinstance(value, dict):
|
|
108
|
-
#
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
if isinstance(sub_value, (dict, list)):
|
|
114
|
-
insert_item[flat_key] = json.dumps(sub_value, ensure_ascii=False)
|
|
115
|
-
else:
|
|
116
|
-
insert_item[flat_key] = sub_value
|
|
142
|
+
# 递归展平 data_source 字典,包括嵌套的字典
|
|
143
|
+
flattened = _flatten_dict(value, 'data_source', fixed_fields)
|
|
144
|
+
insert_item.update(flattened)
|
|
145
|
+
elif key == 'coordinates' and isinstance(value, list):
|
|
146
|
+
insert_item[key] = value
|
|
117
147
|
elif isinstance(value, (dict, list)):
|
|
118
148
|
continue
|
|
119
149
|
else:
|
|
@@ -198,7 +198,9 @@ class Pipeline:
|
|
|
198
198
|
print(f" → 读取文件...")
|
|
199
199
|
file_bytes, data_source = self.source.read_file(file_path)
|
|
200
200
|
data_source = data_source or {}
|
|
201
|
-
|
|
201
|
+
# 转换为毫秒时间戳字符串
|
|
202
|
+
timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
|
|
203
|
+
data_source['date_processed'] = str(timestamp_ms)
|
|
202
204
|
print(f" ✓ 文件读取完成: {len(file_bytes)} bytes")
|
|
203
205
|
|
|
204
206
|
result = self.process_with_pipeline(file_bytes, file_path, data_source)
|
|
@@ -209,15 +211,8 @@ class Pipeline:
|
|
|
209
211
|
|
|
210
212
|
print(f" → 写入目的地...")
|
|
211
213
|
metadata = {
|
|
212
|
-
'
|
|
213
|
-
'
|
|
214
|
-
'processed_at': datetime.now().isoformat(),
|
|
215
|
-
'data_source': data_source,
|
|
216
|
-
'stats': {
|
|
217
|
-
'original_elements': stats.original_elements,
|
|
218
|
-
'chunked_elements': stats.chunked_elements,
|
|
219
|
-
'embedded_elements': stats.embedded_elements
|
|
220
|
-
}
|
|
214
|
+
'filename': file_path,
|
|
215
|
+
'processed_at': str(timestamp_ms),
|
|
221
216
|
}
|
|
222
217
|
|
|
223
218
|
success = self.destination.write(embedded_data, metadata)
|
|
@@ -20,6 +20,30 @@ from botocore.config import Config
|
|
|
20
20
|
logger = logging.getLogger(__name__)
|
|
21
21
|
|
|
22
22
|
|
|
23
|
+
def _to_millis_timestamp_string(timestamp):
    """Convert a timestamp to a millisecond epoch timestamp string.

    Args:
        timestamp: One of:
            * ``None`` - no timestamp available;
            * ``int`` / ``float`` - an epoch value in seconds or milliseconds;
            * ``datetime`` - converted through ``datetime.timestamp()``
              (a naive value is interpreted as local time by that method);
            * anything else - returned via ``str()`` unchanged.

    Returns:
        str: The millisecond timestamp as a string, or an empty string when
        ``timestamp`` is ``None``.
    """
    # Local import keeps this helper self-contained regardless of how the
    # enclosing module imports ``datetime``.
    from datetime import datetime

    if timestamp is None:
        return ""

    # datetime objects previously fell through to ``str()`` and produced a
    # human-readable date instead of the documented millisecond string.
    if isinstance(timestamp, datetime):
        return str(int(timestamp.timestamp() * 1000))

    if isinstance(timestamp, (int, float)):
        # Heuristic: values above 1e12 are taken to be milliseconds already;
        # anything smaller is treated as seconds and scaled up.
        millis = timestamp if timestamp > 1e12 else timestamp * 1000
        return str(int(millis))

    return str(timestamp)
|
|
45
|
+
|
|
46
|
+
|
|
23
47
|
class Source(ABC):
|
|
24
48
|
"""数据源抽象基类"""
|
|
25
49
|
|
|
@@ -106,8 +130,8 @@ class S3Source(Source):
|
|
|
106
130
|
data_source = {
|
|
107
131
|
'url': f"s3://{self.bucket}/{normalized_key}",
|
|
108
132
|
'version': version,
|
|
109
|
-
'date_created': date_modified,
|
|
110
|
-
'date_modified': date_modified,
|
|
133
|
+
'date_created': _to_millis_timestamp_string(date_modified),
|
|
134
|
+
'date_modified': _to_millis_timestamp_string(date_modified),
|
|
111
135
|
'record_locator': {
|
|
112
136
|
'server': server,
|
|
113
137
|
'protocol': 's3',
|
|
@@ -159,8 +183,8 @@ class LocalSource(Source):
|
|
|
159
183
|
data_source = {
|
|
160
184
|
'url': full_path.as_uri(),
|
|
161
185
|
'version': version,
|
|
162
|
-
'date_created': date_created,
|
|
163
|
-
'date_modified': date_modified,
|
|
186
|
+
'date_created': _to_millis_timestamp_string(date_created),
|
|
187
|
+
'date_modified': _to_millis_timestamp_string(date_modified),
|
|
164
188
|
'record_locator': {
|
|
165
189
|
'protocol': 'file',
|
|
166
190
|
'remote_file_path': str(full_path)
|
|
@@ -211,8 +235,8 @@ class FtpSource(Source):
|
|
|
211
235
|
data_source = {
|
|
212
236
|
'url': f"ftp://{self.host}:{self.port}/{normalized_path}",
|
|
213
237
|
'version': None,
|
|
214
|
-
'date_created':
|
|
215
|
-
'date_modified': date_modified,
|
|
238
|
+
'date_created': "",
|
|
239
|
+
'date_modified': _to_millis_timestamp_string(date_modified),
|
|
216
240
|
'record_locator': {
|
|
217
241
|
'server': f"{self.host}:{self.port}",
|
|
218
242
|
'protocol': 'ftp',
|
|
@@ -311,8 +335,8 @@ class SmbSource(Source):
|
|
|
311
335
|
data_source = {
|
|
312
336
|
'url': smb_url,
|
|
313
337
|
'version': None,
|
|
314
|
-
'date_created': date_created,
|
|
315
|
-
'date_modified': date_modified,
|
|
338
|
+
'date_created': _to_millis_timestamp_string(date_created),
|
|
339
|
+
'date_modified': _to_millis_timestamp_string(date_modified),
|
|
316
340
|
'record_locator': {
|
|
317
341
|
'server': self.host,
|
|
318
342
|
'share': self.share_name,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xparse-client
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: 面向Agent和RAG的新一代文档处理 AI Infra
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
|
|
@@ -331,9 +331,9 @@ source = SmbSource(
|
|
|
331
331
|
|
|
332
332
|
```python
|
|
333
333
|
destination = MilvusDestination(
|
|
334
|
-
db_path
|
|
335
|
-
collection_name
|
|
336
|
-
dimension
|
|
334
|
+
db_path='./milvus_pipeline.db', # 本地数据库文件
|
|
335
|
+
collection_name='my_collection', # 数据库collection名称
|
|
336
|
+
dimension=1024 # 向量维度,需与 embed API 返回一致
|
|
337
337
|
)
|
|
338
338
|
```
|
|
339
339
|
|
|
@@ -341,10 +341,10 @@ destination = MilvusDestination(
|
|
|
341
341
|
|
|
342
342
|
```python
|
|
343
343
|
destination = MilvusDestination(
|
|
344
|
-
db_path
|
|
345
|
-
collection_name
|
|
346
|
-
dimension
|
|
347
|
-
api_key
|
|
344
|
+
db_path='https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn', # zilliz连接地址
|
|
345
|
+
collection_name='my_collection', # 数据库collection名称
|
|
346
|
+
dimension=1024, # 向量维度,需与 embed API 返回一致
|
|
347
|
+
api_key='your-api-key' # Zilliz Cloud API Key
|
|
348
348
|
)
|
|
349
349
|
```
|
|
350
350
|
|
|
@@ -354,7 +354,7 @@ destination = MilvusDestination(
|
|
|
354
354
|
|
|
355
355
|
```python
|
|
356
356
|
destination = LocalDestination(
|
|
357
|
-
output_dir
|
|
357
|
+
output_dir='./output'
|
|
358
358
|
)
|
|
359
359
|
```
|
|
360
360
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|