xparse-client 0.2.19__py3-none-any.whl → 0.3.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +111 -20
- xparse_client/_base.py +188 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +351 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +225 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +134 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +132 -0
- xparse_client/models/pipeline.py +134 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b8.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b8.dist-info/RECORD +68 -0
- {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/top_level.txt +2 -0
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -129
- xparse_client/pipeline/destinations.py +0 -489
- xparse_client/pipeline/pipeline.py +0 -690
- xparse_client/pipeline/sources.py +0 -583
- xparse_client-0.2.19.dist-info/METADATA +0 -1050
- xparse_client-0.2.19.dist-info/RECORD +0 -11
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
example/1_basic_api_usage.py,sha256=x_ZHEWXz6z7qp-sMLBvq7Vpu6nA7YxVCizyQpDday6M,5802
|
|
2
|
+
example/2_async_job.py,sha256=CT03sKz2dwTJRqSde8s5o75oe-p3ze43i0E1i_-Ai5Q,6352
|
|
3
|
+
example/3_local_workflow.py,sha256=ZxoT8N0Nz4_s-bkLF0l6UDqBJk_eZZi99WUtUhJseXs,8980
|
|
4
|
+
example/4_advanced_workflow.py,sha256=KH6gmzoKi6oeHIPDv9snw1YehynHTDqWVq1WFtoqGws,10156
|
|
5
|
+
example/README.md,sha256=d-DumaZwbV96K6ZTEryAlDSk9HUhRBg0gG_Yekx9PqA,2336
|
|
6
|
+
example/config_example.json,sha256=mBaLmKbgxFBKUgA-FtCTiLMtq24z0QRwKbPeYDGwA9I,2517
|
|
7
|
+
tests/conftest.py,sha256=h6sy9RNQmXTQNTu86h7ATDAg-9wI4cKjzKXUbNNH5q8,10037
|
|
8
|
+
tests/unit/__init__.py,sha256=C1308ox8hFFtq3A1vBAVPzL6r8HGVbEbC42ULCpFkjs,25
|
|
9
|
+
tests/unit/test_base.py,sha256=HRW2irYNAX863PBCtJWvN9EJ_UttfOwRI7CRnDdcjEk,11599
|
|
10
|
+
tests/unit/test_client.py,sha256=92geEJ6jBht7nFr5suRbJ6SmkvzCu_HGB5MhSl4Qmr8,3534
|
|
11
|
+
tests/unit/test_config.py,sha256=OAUKvmEtpC-eAglnDzhp1bUyMlUpaeqLAa1QwyxCUdw,5498
|
|
12
|
+
tests/unit/test_exceptions.py,sha256=BEaGSs34_jmdjUuT4e3S3QONDqldamiXNId04PPdMt4,5155
|
|
13
|
+
tests/unit/test_http.py,sha256=3h-O6k5rcVP_I1sPoISOYLdHAKtSxoPrDOfC3EJyNi0,16490
|
|
14
|
+
tests/unit/api/__init__.py,sha256=5HiLverbqHC-yFxXClrHBjHq62KoONLZPLMMOkYR3eU,23
|
|
15
|
+
tests/unit/api/test_extract.py,sha256=QX4_BAQN6MkRTJQocEf9u6HwOA_XoY_a2djMcuLBCTo,6959
|
|
16
|
+
tests/unit/api/test_local.py,sha256=_DXC8PattJ-CH9inlwDu9s1RCocTGVeYpKFpTiwDwa4,7541
|
|
17
|
+
tests/unit/api/test_parse.py,sha256=EUA3emZFAFefop3CcUgqyLGC1lhd5ujx6x-dGE5h3pw,12639
|
|
18
|
+
tests/unit/api/test_pipeline.py,sha256=0TC_upZbSllQfuz7uGOCvx28BDNCnX1POR92JLnfrIE,11067
|
|
19
|
+
tests/unit/api/test_workflows.py,sha256=lYVbOWph8ZqxszFQTgcFHnxgdp9jjJM35F-wKO3IUuY,3286
|
|
20
|
+
tests/unit/connectors/test_ftp.py,sha256=7NbV7cL9tIy03L7_sWoALrfDmjda_xOgfAoBfUnF9ks,14605
|
|
21
|
+
tests/unit/connectors/test_local_connectors.py,sha256=QS2T_eLXJtHSaOrbtwYrRSgAVrL3jkO3QVPoMEpi9d8,9855
|
|
22
|
+
tests/unit/connectors/test_milvus.py,sha256=X2VRv639QSntpjx_7u1cOwJg0HSKPumuDmd26FWxMKA,11911
|
|
23
|
+
tests/unit/connectors/test_qdrant.py,sha256=wCIG36CjbDhF0Jq53jnNzMliq5eXMWgnCPiIHtVBbew,12682
|
|
24
|
+
tests/unit/connectors/test_s3.py,sha256=Xbo-S9khgS1S3LM3BUIuYp-28YLEmEW0SP1O-iJ8g5g,18444
|
|
25
|
+
tests/unit/connectors/test_smb.py,sha256=yrI_xNeOe84vONyxXNOGLbWa7tcyxjzo9G5Z_Tjzn-w,12378
|
|
26
|
+
tests/unit/connectors/test_utils.py,sha256=RfnzTW5-9w2f5YeLbNTB_z3vVnsUSL7DwXNobo7xZUA,10302
|
|
27
|
+
tests/unit/models/test_local.py,sha256=QnxzWKOivqeITd0CPzZLEzGbaMDtiwsPiwyEHiPpcPY,1325
|
|
28
|
+
tests/unit/models/test_pipeline_stages.py,sha256=LR7OmMhVAAsV085x-l3d5jQ-efl04WMyrQFjVzXEvws,4049
|
|
29
|
+
tests/unit/models/test_workflows.py,sha256=Al4d3SQI1jTz585i1kD569TZx4IWNrhyf3_Le6-cq2k,1692
|
|
30
|
+
xparse_client/__init__.py,sha256=FyR7W0eteapDoPj2pU4VoTobM8XR2B7f_g4yVI-v2v8,3956
|
|
31
|
+
xparse_client/_base.py,sha256=IPYdHREM0CUvZ_kWUp4olYQfsfKzPTQoZuIrNSanKq4,4661
|
|
32
|
+
xparse_client/_client.py,sha256=ZGxZ9xhUitTP-eRhte57ekqUkrXxNubFDL5gojajA2s,6964
|
|
33
|
+
xparse_client/_config.py,sha256=BG3cEW5ywKhmtfHjfTFrdezs2Ugc9P2Y85tzu5OZZ4M,6480
|
|
34
|
+
xparse_client/_http.py,sha256=Y0-x8CZbHVbi4Jmln1d60oD_Pz1nwQAf-0sbON4B9qo,11443
|
|
35
|
+
xparse_client/exceptions.py,sha256=FNozgCFaQSsdeyxE6P-3CZItkTSfFJcIQVrddHWKN0s,9521
|
|
36
|
+
xparse_client/api/__init__.py,sha256=MZ3wBTDU666efOBv81MKKtc1UkjUsjzIEOpsnDUAHD4,203
|
|
37
|
+
xparse_client/api/extract.py,sha256=87tH-y4sXPKW35l-d0GmZdK9RIXWq4tfmGZXYJhIN60,3011
|
|
38
|
+
xparse_client/api/local.py,sha256=dARW6RjiNsxliWxs7cB6qQl9eTZNvyjBpOG8qXyaIaw,8214
|
|
39
|
+
xparse_client/api/parse.py,sha256=nvpu5kiN43QlyMW3N6vMruVADu6MdjS-TMbLFjBh-cc,5870
|
|
40
|
+
xparse_client/api/pipeline.py,sha256=bPCSpWjER-7c7L0iMQ_Xz1VTe5DARMsX5XIcrayGKzw,4530
|
|
41
|
+
xparse_client/api/workflows.py,sha256=29_Rf4bIH1fiJb_bWShkUm-et97ym5ymQFh5i3b5mDI,5945
|
|
42
|
+
xparse_client/connectors/__init__.py,sha256=BLGxk9i8BsVCr4RMrCIh8b9cywBqXXmcDHqfF7C_FX0,1113
|
|
43
|
+
xparse_client/connectors/_utils.py,sha256=Qb2OyUg8gvzyMBELx4h0GmjRSdT4d9C45v9m6NUNuMg,3515
|
|
44
|
+
xparse_client/connectors/destinations/__init__.py,sha256=hmvWAXj58yLmePMCEnqKbNhJ74UxErugSFxBxPhlb14,1401
|
|
45
|
+
xparse_client/connectors/destinations/base.py,sha256=DfwwH5S8VPwEUyvWqUKDV9Bq-7A59lqoZHruOjB7L50,3218
|
|
46
|
+
xparse_client/connectors/destinations/local.py,sha256=drZ5wkPMiYd8iOROmLJR3eoCXcYKUpAPPjSFpafeNW8,2542
|
|
47
|
+
xparse_client/connectors/destinations/milvus.py,sha256=MhElUsIMY1g6_c_bD-wjpWOM5PiuZ9bLsgXACW7EY1c,7339
|
|
48
|
+
xparse_client/connectors/destinations/qdrant.py,sha256=iU9NxZBhqp-94-1TkG32fDOjg8i4bT9Ox0F2E-2KODg,7677
|
|
49
|
+
xparse_client/connectors/destinations/s3.py,sha256=2aeTW0cNUVPLBL8up9InFfgXBcFDPtqFNGa5FbUphhk,4728
|
|
50
|
+
xparse_client/connectors/sources/__init__.py,sha256=830EDkdGJtPC2p1MhIP108uLibGeMoXcXAwPAEACPvQ,1143
|
|
51
|
+
xparse_client/connectors/sources/base.py,sha256=fbRWFWuIcw_XWrOPpPkBnQ0Pf2R9N7Ry4523rxWcOpc,1863
|
|
52
|
+
xparse_client/connectors/sources/ftp.py,sha256=RwH8eeY-raD4BlF51kb_0BJuM3Vxjbnjpob-f1x_VgY,8192
|
|
53
|
+
xparse_client/connectors/sources/local.py,sha256=ZYTGWdQmUpzdenn3JQ_p0u9-8x7xXk5Ani2GIAqTGyE,5156
|
|
54
|
+
xparse_client/connectors/sources/s3.py,sha256=RCzinmz9cCqp9cFcfEozBapa2nllFvJCw9P-aLeOJQA,7219
|
|
55
|
+
xparse_client/connectors/sources/smb.py,sha256=9oWrgatdmsXMrFnl-du-lau3RMxlMPw2aQy7SFknUDM,7797
|
|
56
|
+
xparse_client/models/__init__.py,sha256=zLBFirgXDisEeqKHqpsH1bN5p2zE7ZmCtVENZ69mlOg,1213
|
|
57
|
+
xparse_client/models/chunk.py,sha256=VXKrATgk7bhlPVbJTFOUvOWu9oEnNMgMF6AnQarJD7o,986
|
|
58
|
+
xparse_client/models/embed.py,sha256=Tg9iqOiLZy63m4T0q2z9cq3neuQbMnqDqT55-sVA7CE,1652
|
|
59
|
+
xparse_client/models/extract.py,sha256=i7js3EEozHMdQHqksZ6drxXnHqhePn3PlaieF8BdLFY,1033
|
|
60
|
+
xparse_client/models/local.py,sha256=hqKmyWTU_EhPk6qybtBzmbLybBBaiasTpqM4_zP-ipo,698
|
|
61
|
+
xparse_client/models/parse.py,sha256=COgxroHkJce_S7d2HGd_1zBYhkpCutGtmjyLnpI2_eI,3154
|
|
62
|
+
xparse_client/models/pipeline.py,sha256=24bDzhrVotQ8St6VLEJLvG2cZF0G2AMioKI34I_hJXI,3297
|
|
63
|
+
xparse_client/models/workflows.py,sha256=BivMdGOAmhP6oYLQSGAAN7yml2xb7vHHrpzwLgN_Afk,1754
|
|
64
|
+
xparse_client-0.3.0b8.dist-info/licenses/LICENSE,sha256=7iuki7DyWMGB8PBzsht7PUt0YjdIcPjrcXNyUFgMJsw,1070
|
|
65
|
+
xparse_client-0.3.0b8.dist-info/METADATA,sha256=mrVOWpID-szyT7O2GlKxymxTLlbWFZfeuIsoxLItmuk,25471
|
|
66
|
+
xparse_client-0.3.0b8.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
67
|
+
xparse_client-0.3.0b8.dist-info/top_level.txt,sha256=oQGc_qysOmnSAaLjwB72wH8RBHRAmxB-_qb-Uj6u56o,28
|
|
68
|
+
xparse_client-0.3.0b8.dist-info/RECORD,,
|
xparse_client/pipeline/config.py
DELETED
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- encoding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
from dataclasses import dataclass, asdict
|
|
5
|
-
from typing import Dict, Any, Optional, Literal, List, Union
|
|
6
|
-
from .destinations import Destination
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class ParseConfig:
|
|
10
|
-
"""Parse 配置,支持动态字段"""
|
|
11
|
-
|
|
12
|
-
def __init__(self, provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin", **kwargs):
|
|
13
|
-
self.provider = provider
|
|
14
|
-
for key, value in kwargs.items():
|
|
15
|
-
setattr(self, key, value)
|
|
16
|
-
|
|
17
|
-
def to_dict(self) -> Dict[str, Any]:
|
|
18
|
-
result = {'provider': self.provider}
|
|
19
|
-
for key, value in self.__dict__.items():
|
|
20
|
-
if not key.startswith('_') and key != 'provider':
|
|
21
|
-
result[key] = value
|
|
22
|
-
return result
|
|
23
|
-
|
|
24
|
-
def __repr__(self) -> str:
|
|
25
|
-
attrs = ', '.join(f'{k}={v!r}' for k, v in sorted(self.__dict__.items()) if not k.startswith('_'))
|
|
26
|
-
return f'ParseConfig({attrs})'
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@dataclass
|
|
30
|
-
class ChunkConfig:
|
|
31
|
-
"""Chunk 配置"""
|
|
32
|
-
strategy: Literal["basic", "by_title", "by_page"] = "basic"
|
|
33
|
-
include_orig_elements: bool = False
|
|
34
|
-
new_after_n_chars: int = 512
|
|
35
|
-
max_characters: int = 1024
|
|
36
|
-
overlap: int = 0
|
|
37
|
-
overlap_all: bool = False
|
|
38
|
-
|
|
39
|
-
def to_dict(self) -> Dict[str, Any]:
|
|
40
|
-
return asdict(self)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
@dataclass
|
|
44
|
-
class EmbedConfig:
|
|
45
|
-
"""Embed 配置"""
|
|
46
|
-
provider: Literal["qwen", "doubao"] = "qwen"
|
|
47
|
-
model_name: Literal[
|
|
48
|
-
"text-embedding-v3",
|
|
49
|
-
"text-embedding-v4",
|
|
50
|
-
"doubao-embedding-large-text-250515",
|
|
51
|
-
"doubao-embedding-text-240715"
|
|
52
|
-
] = "text-embedding-v3"
|
|
53
|
-
|
|
54
|
-
def to_dict(self) -> Dict[str, Any]:
|
|
55
|
-
return asdict(self)
|
|
56
|
-
|
|
57
|
-
def validate(self) -> None:
|
|
58
|
-
provider_models = {
|
|
59
|
-
"qwen": ["text-embedding-v3", "text-embedding-v4"],
|
|
60
|
-
"doubao": ["doubao-embedding-large-text-250515", "doubao-embedding-text-240715"]
|
|
61
|
-
}
|
|
62
|
-
if self.provider not in provider_models:
|
|
63
|
-
raise ValueError(f"不支持的 provider: {self.provider}, 支持的有: {list(provider_models.keys())}")
|
|
64
|
-
if self.model_name not in provider_models[self.provider]:
|
|
65
|
-
raise ValueError(
|
|
66
|
-
f"provider '{self.provider}' 不支持模型 '{self.model_name}', 支持的模型: {provider_models[self.provider]}"
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
@dataclass
|
|
71
|
-
class Stage:
|
|
72
|
-
"""Pipeline Stage 配置"""
|
|
73
|
-
type: Literal["parse", "chunk", "embed"]
|
|
74
|
-
config: Union[ParseConfig, ChunkConfig, EmbedConfig, Dict[str, Any]]
|
|
75
|
-
|
|
76
|
-
def to_dict(self) -> Dict[str, Any]:
|
|
77
|
-
"""转换为字典格式,用于 API 请求"""
|
|
78
|
-
if isinstance(self.config, (ParseConfig, ChunkConfig, EmbedConfig)):
|
|
79
|
-
return {
|
|
80
|
-
'type': self.type,
|
|
81
|
-
'config': self.config.to_dict()
|
|
82
|
-
}
|
|
83
|
-
else:
|
|
84
|
-
# 如果 config 已经是字典,直接使用
|
|
85
|
-
return {
|
|
86
|
-
'type': self.type,
|
|
87
|
-
'config': self.config
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
def __repr__(self) -> str:
|
|
91
|
-
return f"Stage(type={self.type!r}, config={self.config!r})"
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
@dataclass
|
|
95
|
-
class PipelineStats:
|
|
96
|
-
"""Pipeline 统计信息"""
|
|
97
|
-
original_elements: int = 0
|
|
98
|
-
chunked_elements: int = 0
|
|
99
|
-
embedded_elements: int = 0
|
|
100
|
-
stages: Optional[List[Stage]] = None # 存储实际执行的 stages
|
|
101
|
-
record_id: Optional[str] = None # 记录 ID,用于标识需要写入 Milvus 的记录
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
@dataclass
|
|
105
|
-
class PipelineConfig:
|
|
106
|
-
"""Pipeline 配置"""
|
|
107
|
-
include_intermediate_results: bool = False
|
|
108
|
-
intermediate_results_destination: Optional[Destination] = None # 支持 Destination 类型,如 LocalDestination、S3Destination 等
|
|
109
|
-
|
|
110
|
-
def __post_init__(self):
|
|
111
|
-
"""验证配置"""
|
|
112
|
-
if self.include_intermediate_results and not self.intermediate_results_destination:
|
|
113
|
-
raise ValueError("当 include_intermediate_results 为 True 时,必须设置 intermediate_results_destination")
|
|
114
|
-
|
|
115
|
-
def to_dict(self) -> Dict[str, Any]:
|
|
116
|
-
return {
|
|
117
|
-
'include_intermediate_results': self.include_intermediate_results,
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
__all__ = [
|
|
122
|
-
'ParseConfig',
|
|
123
|
-
'ChunkConfig',
|
|
124
|
-
'EmbedConfig',
|
|
125
|
-
'Stage',
|
|
126
|
-
'PipelineStats',
|
|
127
|
-
'PipelineConfig',
|
|
128
|
-
]
|
|
129
|
-
|