xparse-client 0.2.19__tar.gz → 0.2.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xparse_client-0.2.19 → xparse_client-0.2.20}/PKG-INFO +1 -1
- {xparse_client-0.2.19 → xparse_client-0.2.20}/pyproject.toml +1 -1
- {xparse_client-0.2.19 → xparse_client-0.2.20}/xparse_client/__init__.py +2 -1
- {xparse_client-0.2.19 → xparse_client-0.2.20}/xparse_client/pipeline/config.py +66 -32
- xparse_client-0.2.20/xparse_client/pipeline/pipeline.py +860 -0
- xparse_client-0.2.19/xparse_client/pipeline/pipeline.py +0 -690
- {xparse_client-0.2.19 → xparse_client-0.2.20}/LICENSE +0 -0
- {xparse_client-0.2.19 → xparse_client-0.2.20}/MANIFEST.in +0 -0
- {xparse_client-0.2.19 → xparse_client-0.2.20}/README.md +0 -0
- {xparse_client-0.2.19 → xparse_client-0.2.20}/requirements.txt +0 -0
- {xparse_client-0.2.19 → xparse_client-0.2.20}/setup.cfg +0 -0
- {xparse_client-0.2.19 → xparse_client-0.2.20}/xparse_client/pipeline/__init__.py +0 -0
- {xparse_client-0.2.19 → xparse_client-0.2.20}/xparse_client/pipeline/destinations.py +0 -0
- {xparse_client-0.2.19 → xparse_client-0.2.20}/xparse_client/pipeline/sources.py +0 -0
- {xparse_client-0.2.19 → xparse_client-0.2.20}/xparse_client.egg-info/SOURCES.txt +0 -0
|
@@ -8,7 +8,7 @@ logging.basicConfig(
|
|
|
8
8
|
encoding='utf-8'
|
|
9
9
|
)
|
|
10
10
|
|
|
11
|
-
from .pipeline.config import ParseConfig, ChunkConfig, EmbedConfig, Stage, PipelineStats, PipelineConfig
|
|
11
|
+
from .pipeline.config import ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig, Stage, PipelineStats, PipelineConfig
|
|
12
12
|
from .pipeline.sources import Source, S3Source, LocalSource, FtpSource, SmbSource
|
|
13
13
|
from .pipeline.destinations import Destination, MilvusDestination, QdrantDestination, LocalDestination, S3Destination
|
|
14
14
|
from .pipeline.pipeline import Pipeline, create_pipeline_from_config
|
|
@@ -17,6 +17,7 @@ __all__ = [
|
|
|
17
17
|
'ParseConfig',
|
|
18
18
|
'ChunkConfig',
|
|
19
19
|
'EmbedConfig',
|
|
20
|
+
'ExtractConfig',
|
|
20
21
|
'Stage',
|
|
21
22
|
'PipelineStats',
|
|
22
23
|
'PipelineConfig',
|
|
@@ -1,34 +1,44 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- encoding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
from dataclasses import
|
|
5
|
-
from typing import
|
|
4
|
+
from dataclasses import asdict, dataclass
|
|
5
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
6
|
+
|
|
6
7
|
from .destinations import Destination
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class ParseConfig:
|
|
10
11
|
"""Parse 配置,支持动态字段"""
|
|
11
12
|
|
|
12
|
-
def __init__(
|
|
13
|
+
def __init__(
|
|
14
|
+
self,
|
|
15
|
+
provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin",
|
|
16
|
+
**kwargs,
|
|
17
|
+
):
|
|
13
18
|
self.provider = provider
|
|
14
19
|
for key, value in kwargs.items():
|
|
15
20
|
setattr(self, key, value)
|
|
16
21
|
|
|
17
22
|
def to_dict(self) -> Dict[str, Any]:
|
|
18
|
-
result = {
|
|
23
|
+
result = {"provider": self.provider}
|
|
19
24
|
for key, value in self.__dict__.items():
|
|
20
|
-
if not key.startswith(
|
|
25
|
+
if not key.startswith("_") and key != "provider":
|
|
21
26
|
result[key] = value
|
|
22
27
|
return result
|
|
23
28
|
|
|
24
29
|
def __repr__(self) -> str:
|
|
25
|
-
attrs =
|
|
26
|
-
|
|
30
|
+
attrs = ", ".join(
|
|
31
|
+
f"{k}={v!r}"
|
|
32
|
+
for k, v in sorted(self.__dict__.items())
|
|
33
|
+
if not k.startswith("_")
|
|
34
|
+
)
|
|
35
|
+
return f"ParseConfig({attrs})"
|
|
27
36
|
|
|
28
37
|
|
|
29
38
|
@dataclass
|
|
30
39
|
class ChunkConfig:
|
|
31
40
|
"""Chunk 配置"""
|
|
41
|
+
|
|
32
42
|
strategy: Literal["basic", "by_title", "by_page"] = "basic"
|
|
33
43
|
include_orig_elements: bool = False
|
|
34
44
|
new_after_n_chars: int = 512
|
|
@@ -43,12 +53,13 @@ class ChunkConfig:
|
|
|
43
53
|
@dataclass
|
|
44
54
|
class EmbedConfig:
|
|
45
55
|
"""Embed 配置"""
|
|
56
|
+
|
|
46
57
|
provider: Literal["qwen", "doubao"] = "qwen"
|
|
47
58
|
model_name: Literal[
|
|
48
59
|
"text-embedding-v3",
|
|
49
60
|
"text-embedding-v4",
|
|
50
61
|
"doubao-embedding-large-text-250515",
|
|
51
|
-
"doubao-embedding-text-240715"
|
|
62
|
+
"doubao-embedding-text-240715",
|
|
52
63
|
] = "text-embedding-v3"
|
|
53
64
|
|
|
54
65
|
def to_dict(self) -> Dict[str, Any]:
|
|
@@ -57,35 +68,49 @@ class EmbedConfig:
|
|
|
57
68
|
def validate(self) -> None:
|
|
58
69
|
provider_models = {
|
|
59
70
|
"qwen": ["text-embedding-v3", "text-embedding-v4"],
|
|
60
|
-
"doubao": [
|
|
71
|
+
"doubao": [
|
|
72
|
+
"doubao-embedding-large-text-250515",
|
|
73
|
+
"doubao-embedding-text-240715",
|
|
74
|
+
],
|
|
61
75
|
}
|
|
62
76
|
if self.provider not in provider_models:
|
|
63
|
-
raise ValueError(
|
|
77
|
+
raise ValueError(
|
|
78
|
+
f"不支持的 provider: {self.provider}, 支持的有: {list(provider_models.keys())}"
|
|
79
|
+
)
|
|
64
80
|
if self.model_name not in provider_models[self.provider]:
|
|
65
81
|
raise ValueError(
|
|
66
82
|
f"provider '{self.provider}' 不支持模型 '{self.model_name}', 支持的模型: {provider_models[self.provider]}"
|
|
67
83
|
)
|
|
68
84
|
|
|
69
85
|
|
|
86
|
+
@dataclass
|
|
87
|
+
class ExtractConfig:
|
|
88
|
+
"""Extract 配置"""
|
|
89
|
+
|
|
90
|
+
schema: Dict[str, Any] # 必填,JSON Schema 定义
|
|
91
|
+
generate_citations: bool = False
|
|
92
|
+
stamp: bool = False
|
|
93
|
+
|
|
94
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
95
|
+
return asdict(self)
|
|
96
|
+
|
|
97
|
+
|
|
70
98
|
@dataclass
|
|
71
99
|
class Stage:
|
|
72
100
|
"""Pipeline Stage 配置"""
|
|
73
|
-
|
|
74
|
-
|
|
101
|
+
|
|
102
|
+
type: Literal["parse", "chunk", "embed", "extract"]
|
|
103
|
+
config: Union[ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig, Dict[str, Any]]
|
|
75
104
|
|
|
76
105
|
def to_dict(self) -> Dict[str, Any]:
|
|
77
106
|
"""转换为字典格式,用于 API 请求"""
|
|
78
|
-
if isinstance(
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
}
|
|
107
|
+
if isinstance(
|
|
108
|
+
self.config, (ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig)
|
|
109
|
+
):
|
|
110
|
+
return {"type": self.type, "config": self.config.to_dict()}
|
|
83
111
|
else:
|
|
84
112
|
# 如果 config 已经是字典,直接使用
|
|
85
|
-
return {
|
|
86
|
-
'type': self.type,
|
|
87
|
-
'config': self.config
|
|
88
|
-
}
|
|
113
|
+
return {"type": self.type, "config": self.config}
|
|
89
114
|
|
|
90
115
|
def __repr__(self) -> str:
|
|
91
116
|
return f"Stage(type={self.type!r}, config={self.config!r})"
|
|
@@ -94,6 +119,7 @@ class Stage:
|
|
|
94
119
|
@dataclass
|
|
95
120
|
class PipelineStats:
|
|
96
121
|
"""Pipeline 统计信息"""
|
|
122
|
+
|
|
97
123
|
original_elements: int = 0
|
|
98
124
|
chunked_elements: int = 0
|
|
99
125
|
embedded_elements: int = 0
|
|
@@ -104,26 +130,34 @@ class PipelineStats:
|
|
|
104
130
|
@dataclass
|
|
105
131
|
class PipelineConfig:
|
|
106
132
|
"""Pipeline 配置"""
|
|
133
|
+
|
|
107
134
|
include_intermediate_results: bool = False
|
|
108
|
-
intermediate_results_destination: Optional[Destination] =
|
|
135
|
+
intermediate_results_destination: Optional[Destination] = (
|
|
136
|
+
None # 支持 Destination 类型,如 LocalDestination、S3Destination 等
|
|
137
|
+
)
|
|
109
138
|
|
|
110
139
|
def __post_init__(self):
|
|
111
140
|
"""验证配置"""
|
|
112
|
-
if
|
|
113
|
-
|
|
141
|
+
if (
|
|
142
|
+
self.include_intermediate_results
|
|
143
|
+
and not self.intermediate_results_destination
|
|
144
|
+
):
|
|
145
|
+
raise ValueError(
|
|
146
|
+
"当 include_intermediate_results 为 True 时,必须设置 intermediate_results_destination"
|
|
147
|
+
)
|
|
114
148
|
|
|
115
149
|
def to_dict(self) -> Dict[str, Any]:
|
|
116
150
|
return {
|
|
117
|
-
|
|
151
|
+
"include_intermediate_results": self.include_intermediate_results,
|
|
118
152
|
}
|
|
119
153
|
|
|
120
154
|
|
|
121
155
|
__all__ = [
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
156
|
+
"ParseConfig",
|
|
157
|
+
"ChunkConfig",
|
|
158
|
+
"EmbedConfig",
|
|
159
|
+
"ExtractConfig",
|
|
160
|
+
"Stage",
|
|
161
|
+
"PipelineStats",
|
|
162
|
+
"PipelineConfig",
|
|
128
163
|
]
|
|
129
|
-
|