xparse-client 0.2.20__py3-none-any.whl → 0.3.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +110 -20
- xparse_client/_base.py +179 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +350 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +188 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +132 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +136 -0
- xparse_client/models/pipeline.py +132 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b2.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b2.dist-info/RECORD +68 -0
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/top_level.txt +2 -0
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -163
- xparse_client/pipeline/destinations.py +0 -489
- xparse_client/pipeline/pipeline.py +0 -860
- xparse_client/pipeline/sources.py +0 -583
- xparse_client-0.2.20.dist-info/METADATA +0 -1050
- xparse_client-0.2.20.dist-info/RECORD +0 -11
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
example/1_basic_api_usage.py,sha256=x_ZHEWXz6z7qp-sMLBvq7Vpu6nA7YxVCizyQpDday6M,5802
|
|
2
|
+
example/2_async_job.py,sha256=MCjWii3ksmONiIyCWviBrKPZIwrlCPDvmUPP_F8qXPM,6353
|
|
3
|
+
example/3_local_workflow.py,sha256=ZxoT8N0Nz4_s-bkLF0l6UDqBJk_eZZi99WUtUhJseXs,8980
|
|
4
|
+
example/4_advanced_workflow.py,sha256=cgG0xke8-WNPxqx7iSTl9qgoJsVctXf5mjDkp-SaQho,10159
|
|
5
|
+
example/README.md,sha256=d-DumaZwbV96K6ZTEryAlDSk9HUhRBg0gG_Yekx9PqA,2336
|
|
6
|
+
example/config_example.json,sha256=mBaLmKbgxFBKUgA-FtCTiLMtq24z0QRwKbPeYDGwA9I,2517
|
|
7
|
+
tests/conftest.py,sha256=h6sy9RNQmXTQNTu86h7ATDAg-9wI4cKjzKXUbNNH5q8,10037
|
|
8
|
+
tests/unit/__init__.py,sha256=C1308ox8hFFtq3A1vBAVPzL6r8HGVbEbC42ULCpFkjs,25
|
|
9
|
+
tests/unit/test_base.py,sha256=HRW2irYNAX863PBCtJWvN9EJ_UttfOwRI7CRnDdcjEk,11599
|
|
10
|
+
tests/unit/test_client.py,sha256=92geEJ6jBht7nFr5suRbJ6SmkvzCu_HGB5MhSl4Qmr8,3534
|
|
11
|
+
tests/unit/test_config.py,sha256=OAUKvmEtpC-eAglnDzhp1bUyMlUpaeqLAa1QwyxCUdw,5498
|
|
12
|
+
tests/unit/test_exceptions.py,sha256=BEaGSs34_jmdjUuT4e3S3QONDqldamiXNId04PPdMt4,5155
|
|
13
|
+
tests/unit/test_http.py,sha256=3h-O6k5rcVP_I1sPoISOYLdHAKtSxoPrDOfC3EJyNi0,16490
|
|
14
|
+
tests/unit/api/__init__.py,sha256=5HiLverbqHC-yFxXClrHBjHq62KoONLZPLMMOkYR3eU,23
|
|
15
|
+
tests/unit/api/test_extract.py,sha256=QX4_BAQN6MkRTJQocEf9u6HwOA_XoY_a2djMcuLBCTo,6959
|
|
16
|
+
tests/unit/api/test_local.py,sha256=_DXC8PattJ-CH9inlwDu9s1RCocTGVeYpKFpTiwDwa4,7541
|
|
17
|
+
tests/unit/api/test_parse.py,sha256=EUA3emZFAFefop3CcUgqyLGC1lhd5ujx6x-dGE5h3pw,12639
|
|
18
|
+
tests/unit/api/test_pipeline.py,sha256=0TC_upZbSllQfuz7uGOCvx28BDNCnX1POR92JLnfrIE,11067
|
|
19
|
+
tests/unit/api/test_workflows.py,sha256=lYVbOWph8ZqxszFQTgcFHnxgdp9jjJM35F-wKO3IUuY,3286
|
|
20
|
+
tests/unit/connectors/test_ftp.py,sha256=7NbV7cL9tIy03L7_sWoALrfDmjda_xOgfAoBfUnF9ks,14605
|
|
21
|
+
tests/unit/connectors/test_local_connectors.py,sha256=QS2T_eLXJtHSaOrbtwYrRSgAVrL3jkO3QVPoMEpi9d8,9855
|
|
22
|
+
tests/unit/connectors/test_milvus.py,sha256=X2VRv639QSntpjx_7u1cOwJg0HSKPumuDmd26FWxMKA,11911
|
|
23
|
+
tests/unit/connectors/test_qdrant.py,sha256=wCIG36CjbDhF0Jq53jnNzMliq5eXMWgnCPiIHtVBbew,12682
|
|
24
|
+
tests/unit/connectors/test_s3.py,sha256=Xbo-S9khgS1S3LM3BUIuYp-28YLEmEW0SP1O-iJ8g5g,18444
|
|
25
|
+
tests/unit/connectors/test_smb.py,sha256=yrI_xNeOe84vONyxXNOGLbWa7tcyxjzo9G5Z_Tjzn-w,12378
|
|
26
|
+
tests/unit/connectors/test_utils.py,sha256=RfnzTW5-9w2f5YeLbNTB_z3vVnsUSL7DwXNobo7xZUA,10302
|
|
27
|
+
tests/unit/models/test_local.py,sha256=QnxzWKOivqeITd0CPzZLEzGbaMDtiwsPiwyEHiPpcPY,1325
|
|
28
|
+
tests/unit/models/test_pipeline_stages.py,sha256=vT7Pd0lRWdChTQJuCZ3iUBx8C0xFXQXTBVB6jtWtEKU,3972
|
|
29
|
+
tests/unit/models/test_workflows.py,sha256=Al4d3SQI1jTz585i1kD569TZx4IWNrhyf3_Le6-cq2k,1692
|
|
30
|
+
xparse_client/__init__.py,sha256=FyR7W0eteapDoPj2pU4VoTobM8XR2B7f_g4yVI-v2v8,3956
|
|
31
|
+
xparse_client/_base.py,sha256=Vx85kDns565zwv6Zl6_8jl6b43nNprfBHbqdCVPGGTc,4420
|
|
32
|
+
xparse_client/_client.py,sha256=ZGxZ9xhUitTP-eRhte57ekqUkrXxNubFDL5gojajA2s,6964
|
|
33
|
+
xparse_client/_config.py,sha256=BG3cEW5ywKhmtfHjfTFrdezs2Ugc9P2Y85tzu5OZZ4M,6480
|
|
34
|
+
xparse_client/_http.py,sha256=OrnrkTeuG5tEpuD-a1ixLIFPdiqnVIYtuZCi3uXAzJ4,11381
|
|
35
|
+
xparse_client/exceptions.py,sha256=FNozgCFaQSsdeyxE6P-3CZItkTSfFJcIQVrddHWKN0s,9521
|
|
36
|
+
xparse_client/api/__init__.py,sha256=MZ3wBTDU666efOBv81MKKtc1UkjUsjzIEOpsnDUAHD4,203
|
|
37
|
+
xparse_client/api/extract.py,sha256=87tH-y4sXPKW35l-d0GmZdK9RIXWq4tfmGZXYJhIN60,3011
|
|
38
|
+
xparse_client/api/local.py,sha256=vFfPfdVPMYtJQK76iPfGw1DdFxNqfFiMIEPWw4jkrwo,6246
|
|
39
|
+
xparse_client/api/parse.py,sha256=nvpu5kiN43QlyMW3N6vMruVADu6MdjS-TMbLFjBh-cc,5870
|
|
40
|
+
xparse_client/api/pipeline.py,sha256=HscuwRUOmcdUEjzqSyAMdNkcLyK_XbF_THjNH5DXsec,4356
|
|
41
|
+
xparse_client/api/workflows.py,sha256=29_Rf4bIH1fiJb_bWShkUm-et97ym5ymQFh5i3b5mDI,5945
|
|
42
|
+
xparse_client/connectors/__init__.py,sha256=BLGxk9i8BsVCr4RMrCIh8b9cywBqXXmcDHqfF7C_FX0,1113
|
|
43
|
+
xparse_client/connectors/_utils.py,sha256=Qb2OyUg8gvzyMBELx4h0GmjRSdT4d9C45v9m6NUNuMg,3515
|
|
44
|
+
xparse_client/connectors/destinations/__init__.py,sha256=hmvWAXj58yLmePMCEnqKbNhJ74UxErugSFxBxPhlb14,1401
|
|
45
|
+
xparse_client/connectors/destinations/base.py,sha256=DfwwH5S8VPwEUyvWqUKDV9Bq-7A59lqoZHruOjB7L50,3218
|
|
46
|
+
xparse_client/connectors/destinations/local.py,sha256=drZ5wkPMiYd8iOROmLJR3eoCXcYKUpAPPjSFpafeNW8,2542
|
|
47
|
+
xparse_client/connectors/destinations/milvus.py,sha256=MhElUsIMY1g6_c_bD-wjpWOM5PiuZ9bLsgXACW7EY1c,7339
|
|
48
|
+
xparse_client/connectors/destinations/qdrant.py,sha256=Mo1wCwtsM7OIMLZ0HsEBnvE8N9BGnyTGBGJ1B8oYIew,7661
|
|
49
|
+
xparse_client/connectors/destinations/s3.py,sha256=2aeTW0cNUVPLBL8up9InFfgXBcFDPtqFNGa5FbUphhk,4728
|
|
50
|
+
xparse_client/connectors/sources/__init__.py,sha256=830EDkdGJtPC2p1MhIP108uLibGeMoXcXAwPAEACPvQ,1143
|
|
51
|
+
xparse_client/connectors/sources/base.py,sha256=fbRWFWuIcw_XWrOPpPkBnQ0Pf2R9N7Ry4523rxWcOpc,1863
|
|
52
|
+
xparse_client/connectors/sources/ftp.py,sha256=bjOjnd-2l0MZ2EAcGM3jvuTP7FMWBiQaEbgHzhIl3og,8170
|
|
53
|
+
xparse_client/connectors/sources/local.py,sha256=ZYTGWdQmUpzdenn3JQ_p0u9-8x7xXk5Ani2GIAqTGyE,5156
|
|
54
|
+
xparse_client/connectors/sources/s3.py,sha256=RCzinmz9cCqp9cFcfEozBapa2nllFvJCw9P-aLeOJQA,7219
|
|
55
|
+
xparse_client/connectors/sources/smb.py,sha256=_F496mipGZJ0y17H-sYTlqAf2uJScxGyit7kirTyDKc,7786
|
|
56
|
+
xparse_client/models/__init__.py,sha256=zLBFirgXDisEeqKHqpsH1bN5p2zE7ZmCtVENZ69mlOg,1213
|
|
57
|
+
xparse_client/models/chunk.py,sha256=VXKrATgk7bhlPVbJTFOUvOWu9oEnNMgMF6AnQarJD7o,986
|
|
58
|
+
xparse_client/models/embed.py,sha256=Tg9iqOiLZy63m4T0q2z9cq3neuQbMnqDqT55-sVA7CE,1652
|
|
59
|
+
xparse_client/models/extract.py,sha256=ZwMy0MsTt775Bq3UsZ9WlXFNCLrSnN0j1RgXQ-vFNqo,915
|
|
60
|
+
xparse_client/models/local.py,sha256=hqKmyWTU_EhPk6qybtBzmbLybBBaiasTpqM4_zP-ipo,698
|
|
61
|
+
xparse_client/models/parse.py,sha256=K34Fodo3emOPZJJQeu3MLjvA7HkVMxAuXk20UvO4lfQ,3297
|
|
62
|
+
xparse_client/models/pipeline.py,sha256=KVmTYqyi6sbGMR9UQcEgMHFqhcSBPnoRgtGL4xp8QEA,3086
|
|
63
|
+
xparse_client/models/workflows.py,sha256=BivMdGOAmhP6oYLQSGAAN7yml2xb7vHHrpzwLgN_Afk,1754
|
|
64
|
+
xparse_client-0.3.0b2.dist-info/licenses/LICENSE,sha256=7iuki7DyWMGB8PBzsht7PUt0YjdIcPjrcXNyUFgMJsw,1070
|
|
65
|
+
xparse_client-0.3.0b2.dist-info/METADATA,sha256=QYHCW6pdcMpJKkK3GRKFNMs9comTFatvSG1jXs_5Zv4,25453
|
|
66
|
+
xparse_client-0.3.0b2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
67
|
+
xparse_client-0.3.0b2.dist-info/top_level.txt,sha256=oQGc_qysOmnSAaLjwB72wH8RBHRAmxB-_qb-Uj6u56o,28
|
|
68
|
+
xparse_client-0.3.0b2.dist-info/RECORD,,
|
xparse_client/pipeline/config.py
DELETED
|
@@ -1,163 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- encoding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
from dataclasses import asdict, dataclass
|
|
5
|
-
from typing import Any, Dict, List, Literal, Optional, Union
|
|
6
|
-
|
|
7
|
-
from .destinations import Destination
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class ParseConfig:
    """Parse-stage configuration that accepts arbitrary extra fields.

    ``provider`` is always stored; every additional keyword argument is
    attached to the instance as an attribute of the same name.
    """

    def __init__(
        self,
        provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin",
        **kwargs,
    ):
        """Store ``provider`` and copy every extra keyword onto the instance."""
        self.provider = provider
        for field_name in kwargs:
            setattr(self, field_name, kwargs[field_name])

    def to_dict(self) -> Dict[str, Any]:
        """Return the public attributes as a dict, ``provider`` first.

        Attributes whose name starts with an underscore are left out.
        """
        payload = {"provider": self.provider}
        payload.update(
            (field_name, field_value)
            for field_name, field_value in self.__dict__.items()
            if field_name != "provider" and not field_name.startswith("_")
        )
        return payload

    def __repr__(self) -> str:
        """Render the public attributes, sorted by name, as ``ParseConfig(...)``."""
        rendered = []
        for field_name, field_value in sorted(self.__dict__.items()):
            if field_name.startswith("_"):
                continue
            rendered.append(f"{field_name}={field_value!r}")
        joined = ", ".join(rendered)
        return f"ParseConfig({joined})"
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
@dataclass
class ChunkConfig:
    """Chunk-stage configuration.

    All fields have defaults, so ``ChunkConfig()`` is a valid configuration.
    """

    # Chunking strategy; one of the three literal values below.
    strategy: Literal["basic", "by_title", "by_page"] = "basic"
    include_orig_elements: bool = False
    new_after_n_chars: int = 512
    max_characters: int = 1024
    overlap: int = 0
    overlap_all: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        as_mapping: Dict[str, Any] = asdict(self)
        return as_mapping
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
@dataclass
class EmbedConfig:
    """Embed-stage configuration: an embedding provider and one of its models."""

    provider: Literal["qwen", "doubao"] = "qwen"
    model_name: Literal[
        "text-embedding-v3",
        "text-embedding-v4",
        "doubao-embedding-large-text-250515",
        "doubao-embedding-text-240715",
    ] = "text-embedding-v3"

    def to_dict(self) -> Dict[str, Any]:
        """Return both fields as a plain dict (via ``dataclasses.asdict``)."""
        as_mapping: Dict[str, Any] = asdict(self)
        return as_mapping

    def validate(self) -> None:
        """Check that ``model_name`` belongs to ``provider``.

        Raises:
            ValueError: if the provider is unknown, or if the model is not
                one of the models listed for that provider.
        """
        # Models accepted for each provider.
        supported = {
            "qwen": ["text-embedding-v3", "text-embedding-v4"],
            "doubao": [
                "doubao-embedding-large-text-250515",
                "doubao-embedding-text-240715",
            ],
        }
        allowed_models = supported.get(self.provider)
        if allowed_models is None:
            raise ValueError(
                f"不支持的 provider: {self.provider}, 支持的有: {list(supported)}"
            )
        if self.model_name in allowed_models:
            return
        raise ValueError(
            f"provider '{self.provider}' 不支持模型 '{self.model_name}', 支持的模型: {allowed_models}"
        )
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
@dataclass
class ExtractConfig:
    """Extract-stage configuration."""

    # Required: the JSON Schema definition (the only field without a default).
    schema: Dict[str, Any]
    generate_citations: bool = False
    stamp: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        as_mapping: Dict[str, Any] = asdict(self)
        return as_mapping
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
@dataclass
class Stage:
    """A single pipeline stage: a stage type plus its configuration."""

    type: Literal["parse", "chunk", "embed", "extract"]
    # Either one of the typed config objects or an already-built dict.
    config: Union[ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig, Dict[str, Any]]

    def to_dict(self) -> Dict[str, Any]:
        """Convert the stage to the dict form used for API requests."""
        stage_config = self.config
        typed_configs = (ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig)
        if isinstance(stage_config, typed_configs):
            # Typed config objects serialize themselves.
            stage_config = stage_config.to_dict()
        # Anything else (e.g. a dict) is passed through unchanged.
        return {"type": self.type, "config": stage_config}

    def __repr__(self) -> str:
        """Render as ``Stage(type=..., config=...)`` using each value's repr."""
        stage_type = self.type
        stage_config = self.config
        return f"Stage(type={stage_type!r}, config={stage_config!r})"
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
@dataclass
class PipelineStats:
    """Statistics collected for a pipeline run."""

    original_elements: int = 0
    chunked_elements: int = 0
    embedded_elements: int = 0
    # The stages that were actually executed.
    stages: Optional[List[Stage]] = None
    # Record ID; identifies the record that needs to be written to Milvus.
    record_id: Optional[str] = None
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
@dataclass
class PipelineConfig:
    """Pipeline-level configuration."""

    include_intermediate_results: bool = False
    # Accepts a Destination instance, e.g. LocalDestination, S3Destination, etc.
    intermediate_results_destination: Optional[Destination] = None

    def __post_init__(self):
        """Validate the configuration right after construction.

        Raises:
            ValueError: if intermediate results are requested but no
                destination for them was provided.
        """
        if not self.include_intermediate_results:
            return
        if self.intermediate_results_destination:
            return
        raise ValueError(
            "当 include_intermediate_results 为 True 时,必须设置 intermediate_results_destination"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return the dict form; only ``include_intermediate_results`` is included."""
        return dict(include_intermediate_results=self.include_intermediate_results)
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
# Public names exported by this module.
__all__ = [
    # Per-stage configuration objects.
    "ParseConfig", "ChunkConfig", "EmbedConfig", "ExtractConfig",
    # Pipeline-level containers.
    "Stage", "PipelineStats", "PipelineConfig",
]
|