xparse-client 0.2.19__tar.gz → 0.2.20__tar.gz

This diff compares the contents of two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.19
3
+ Version: 0.2.20
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "xparse-client"
7
- version = "0.2.19"
7
+ version = "0.2.20"
8
8
  description = "面向Agent和RAG的新一代文档处理 AI Infra"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -8,7 +8,7 @@ logging.basicConfig(
8
8
  encoding='utf-8'
9
9
  )
10
10
 
11
- from .pipeline.config import ParseConfig, ChunkConfig, EmbedConfig, Stage, PipelineStats, PipelineConfig
11
+ from .pipeline.config import ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig, Stage, PipelineStats, PipelineConfig
12
12
  from .pipeline.sources import Source, S3Source, LocalSource, FtpSource, SmbSource
13
13
  from .pipeline.destinations import Destination, MilvusDestination, QdrantDestination, LocalDestination, S3Destination
14
14
  from .pipeline.pipeline import Pipeline, create_pipeline_from_config
@@ -17,6 +17,7 @@ __all__ = [
17
17
  'ParseConfig',
18
18
  'ChunkConfig',
19
19
  'EmbedConfig',
20
+ 'ExtractConfig',
20
21
  'Stage',
21
22
  'PipelineStats',
22
23
  'PipelineConfig',
@@ -1,34 +1,44 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- encoding: utf-8 -*-
3
3
 
4
- from dataclasses import dataclass, asdict
5
- from typing import Dict, Any, Optional, Literal, List, Union
4
+ from dataclasses import asdict, dataclass
5
+ from typing import Any, Dict, List, Literal, Optional, Union
6
+
6
7
  from .destinations import Destination
7
8
 
8
9
 
9
10
  class ParseConfig:
10
11
  """Parse 配置,支持动态字段"""
11
12
 
12
- def __init__(self, provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin", **kwargs):
13
+ def __init__(
14
+ self,
15
+ provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin",
16
+ **kwargs,
17
+ ):
13
18
  self.provider = provider
14
19
  for key, value in kwargs.items():
15
20
  setattr(self, key, value)
16
21
 
17
22
  def to_dict(self) -> Dict[str, Any]:
18
- result = {'provider': self.provider}
23
+ result = {"provider": self.provider}
19
24
  for key, value in self.__dict__.items():
20
- if not key.startswith('_') and key != 'provider':
25
+ if not key.startswith("_") and key != "provider":
21
26
  result[key] = value
22
27
  return result
23
28
 
24
29
  def __repr__(self) -> str:
25
- attrs = ', '.join(f'{k}={v!r}' for k, v in sorted(self.__dict__.items()) if not k.startswith('_'))
26
- return f'ParseConfig({attrs})'
30
+ attrs = ", ".join(
31
+ f"{k}={v!r}"
32
+ for k, v in sorted(self.__dict__.items())
33
+ if not k.startswith("_")
34
+ )
35
+ return f"ParseConfig({attrs})"
27
36
 
28
37
 
29
38
  @dataclass
30
39
  class ChunkConfig:
31
40
  """Chunk 配置"""
41
+
32
42
  strategy: Literal["basic", "by_title", "by_page"] = "basic"
33
43
  include_orig_elements: bool = False
34
44
  new_after_n_chars: int = 512
@@ -43,12 +53,13 @@ class ChunkConfig:
43
53
  @dataclass
44
54
  class EmbedConfig:
45
55
  """Embed 配置"""
56
+
46
57
  provider: Literal["qwen", "doubao"] = "qwen"
47
58
  model_name: Literal[
48
59
  "text-embedding-v3",
49
60
  "text-embedding-v4",
50
61
  "doubao-embedding-large-text-250515",
51
- "doubao-embedding-text-240715"
62
+ "doubao-embedding-text-240715",
52
63
  ] = "text-embedding-v3"
53
64
 
54
65
  def to_dict(self) -> Dict[str, Any]:
@@ -57,35 +68,49 @@ class EmbedConfig:
57
68
  def validate(self) -> None:
58
69
  provider_models = {
59
70
  "qwen": ["text-embedding-v3", "text-embedding-v4"],
60
- "doubao": ["doubao-embedding-large-text-250515", "doubao-embedding-text-240715"]
71
+ "doubao": [
72
+ "doubao-embedding-large-text-250515",
73
+ "doubao-embedding-text-240715",
74
+ ],
61
75
  }
62
76
  if self.provider not in provider_models:
63
- raise ValueError(f"不支持的 provider: {self.provider}, 支持的有: {list(provider_models.keys())}")
77
+ raise ValueError(
78
+ f"不支持的 provider: {self.provider}, 支持的有: {list(provider_models.keys())}"
79
+ )
64
80
  if self.model_name not in provider_models[self.provider]:
65
81
  raise ValueError(
66
82
  f"provider '{self.provider}' 不支持模型 '{self.model_name}', 支持的模型: {provider_models[self.provider]}"
67
83
  )
68
84
 
69
85
 
86
+ @dataclass
87
+ class ExtractConfig:
88
+ """Extract 配置"""
89
+
90
+ schema: Dict[str, Any] # 必填,JSON Schema 定义
91
+ generate_citations: bool = False
92
+ stamp: bool = False
93
+
94
+ def to_dict(self) -> Dict[str, Any]:
95
+ return asdict(self)
96
+
97
+
70
98
  @dataclass
71
99
  class Stage:
72
100
  """Pipeline Stage 配置"""
73
- type: Literal["parse", "chunk", "embed"]
74
- config: Union[ParseConfig, ChunkConfig, EmbedConfig, Dict[str, Any]]
101
+
102
+ type: Literal["parse", "chunk", "embed", "extract"]
103
+ config: Union[ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig, Dict[str, Any]]
75
104
 
76
105
  def to_dict(self) -> Dict[str, Any]:
77
106
  """转换为字典格式,用于 API 请求"""
78
- if isinstance(self.config, (ParseConfig, ChunkConfig, EmbedConfig)):
79
- return {
80
- 'type': self.type,
81
- 'config': self.config.to_dict()
82
- }
107
+ if isinstance(
108
+ self.config, (ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig)
109
+ ):
110
+ return {"type": self.type, "config": self.config.to_dict()}
83
111
  else:
84
112
  # 如果 config 已经是字典,直接使用
85
- return {
86
- 'type': self.type,
87
- 'config': self.config
88
- }
113
+ return {"type": self.type, "config": self.config}
89
114
 
90
115
  def __repr__(self) -> str:
91
116
  return f"Stage(type={self.type!r}, config={self.config!r})"
@@ -94,6 +119,7 @@ class Stage:
94
119
  @dataclass
95
120
  class PipelineStats:
96
121
  """Pipeline 统计信息"""
122
+
97
123
  original_elements: int = 0
98
124
  chunked_elements: int = 0
99
125
  embedded_elements: int = 0
@@ -104,26 +130,34 @@ class PipelineStats:
104
130
  @dataclass
105
131
  class PipelineConfig:
106
132
  """Pipeline 配置"""
133
+
107
134
  include_intermediate_results: bool = False
108
- intermediate_results_destination: Optional[Destination] = None # 支持 Destination 类型,如 LocalDestination、S3Destination 等
135
+ intermediate_results_destination: Optional[Destination] = (
136
+ None # 支持 Destination 类型,如 LocalDestination、S3Destination 等
137
+ )
109
138
 
110
139
  def __post_init__(self):
111
140
  """验证配置"""
112
- if self.include_intermediate_results and not self.intermediate_results_destination:
113
- raise ValueError("当 include_intermediate_results 为 True 时,必须设置 intermediate_results_destination")
141
+ if (
142
+ self.include_intermediate_results
143
+ and not self.intermediate_results_destination
144
+ ):
145
+ raise ValueError(
146
+ "当 include_intermediate_results 为 True 时,必须设置 intermediate_results_destination"
147
+ )
114
148
 
115
149
  def to_dict(self) -> Dict[str, Any]:
116
150
  return {
117
- 'include_intermediate_results': self.include_intermediate_results,
151
+ "include_intermediate_results": self.include_intermediate_results,
118
152
  }
119
153
 
120
154
 
121
155
  __all__ = [
122
- 'ParseConfig',
123
- 'ChunkConfig',
124
- 'EmbedConfig',
125
- 'Stage',
126
- 'PipelineStats',
127
- 'PipelineConfig',
156
+ "ParseConfig",
157
+ "ChunkConfig",
158
+ "EmbedConfig",
159
+ "ExtractConfig",
160
+ "Stage",
161
+ "PipelineStats",
162
+ "PipelineConfig",
128
163
  ]
129
-