PyPI - xparse-client - Versions diffs - 0.2.19__tar.gz → 0.2.20__tar.gz - Mend

xparse-client 0.2.19__tar.gz → 0.2.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

{xparse_client-0.2.19 → xparse_client-0.2.20}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.2.19
+Version: 0.2.20
 Summary: 面向Agent和RAG的新一代文档处理 AI Infra
 License-Expression: MIT
 Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline

{xparse_client-0.2.19 → xparse_client-0.2.20}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "xparse-client"
-version = "0.2.19"
+version = "0.2.20"
 description = "面向Agent和RAG的新一代文档处理 AI Infra"
 readme = "README.md"
 license = "MIT"

{xparse_client-0.2.19 → xparse_client-0.2.20}/xparse_client/__init__.py RENAMED Viewed

@@ -8,7 +8,7 @@ logging.basicConfig(
     encoding='utf-8'
 )
-from .pipeline.config import ParseConfig, ChunkConfig, EmbedConfig, Stage, PipelineStats, PipelineConfig
+from .pipeline.config import ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig, Stage, PipelineStats, PipelineConfig
 from .pipeline.sources import Source, S3Source, LocalSource, FtpSource, SmbSource
 from .pipeline.destinations import Destination, MilvusDestination, QdrantDestination, LocalDestination, S3Destination
 from .pipeline.pipeline import Pipeline, create_pipeline_from_config
@@ -17,6 +17,7 @@ __all__ = [
     'ParseConfig',
     'ChunkConfig',
     'EmbedConfig',
+    'ExtractConfig',
     'Stage',
     'PipelineStats',
     'PipelineConfig',

{xparse_client-0.2.19 → xparse_client-0.2.20}/xparse_client/pipeline/config.py RENAMED Viewed

@@ -1,34 +1,44 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
-from dataclasses import dataclass, asdict
-from typing import Dict, Any, Optional, Literal, List, Union
+from dataclasses import asdict, dataclass
+from typing import Any, Dict, List, Literal, Optional, Union
 from .destinations import Destination
 class ParseConfig:
     """Parse 配置，支持动态字段"""
-    def __init__(self, provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin", **kwargs):
+    def __init__(
+        self,
+        provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin",
+        **kwargs,
+    ):
         self.provider = provider
         for key, value in kwargs.items():
             setattr(self, key, value)
     def to_dict(self) -> Dict[str, Any]:
-        result = {'provider': self.provider}
+        result = {"provider": self.provider}
         for key, value in self.__dict__.items():
-            if not key.startswith('_') and key != 'provider':
+            if not key.startswith("_") and key != "provider":
                 result[key] = value
         return result
     def __repr__(self) -> str:
-        attrs = ', '.join(f'{k}={v!r}' for k, v in sorted(self.__dict__.items()) if not k.startswith('_'))
-        return f'ParseConfig({attrs})'
+        attrs = ", ".join(
+            f"{k}={v!r}"
+            for k, v in sorted(self.__dict__.items())
+            if not k.startswith("_")
+        )
+        return f"ParseConfig({attrs})"
 @dataclass
 class ChunkConfig:
     """Chunk 配置"""
     strategy: Literal["basic", "by_title", "by_page"] = "basic"
     include_orig_elements: bool = False
     new_after_n_chars: int = 512
@@ -43,12 +53,13 @@ class ChunkConfig:
 @dataclass
 class EmbedConfig:
     """Embed 配置"""
     provider: Literal["qwen", "doubao"] = "qwen"
     model_name: Literal[
         "text-embedding-v3",
         "text-embedding-v4",
         "doubao-embedding-large-text-250515",
-        "doubao-embedding-text-240715"
+        "doubao-embedding-text-240715",
     ] = "text-embedding-v3"
     def to_dict(self) -> Dict[str, Any]:
@@ -57,35 +68,49 @@ class EmbedConfig:
     def validate(self) -> None:
         provider_models = {
             "qwen": ["text-embedding-v3", "text-embedding-v4"],
-            "doubao": ["doubao-embedding-large-text-250515", "doubao-embedding-text-240715"]
+            "doubao": [
+                "doubao-embedding-large-text-250515",
+                "doubao-embedding-text-240715",
+            ],
         }
         if self.provider not in provider_models:
-            raise ValueError(f"不支持的 provider: {self.provider}, 支持的有: {list(provider_models.keys())}")
+            raise ValueError(
+                f"不支持的 provider: {self.provider}, 支持的有: {list(provider_models.keys())}"
+            )
         if self.model_name not in provider_models[self.provider]:
             raise ValueError(
                 f"provider '{self.provider}' 不支持模型 '{self.model_name}', 支持的模型: {provider_models[self.provider]}"
             )
+@dataclass
+class ExtractConfig:
+    """Extract 配置"""
+    schema: Dict[str, Any]  # 必填，JSON Schema 定义
+    generate_citations: bool = False
+    stamp: bool = False
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
 @dataclass
 class Stage:
     """Pipeline Stage 配置"""
-    type: Literal["parse", "chunk", "embed"]
-    config: Union[ParseConfig, ChunkConfig, EmbedConfig, Dict[str, Any]]
+    type: Literal["parse", "chunk", "embed", "extract"]
+    config: Union[ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig, Dict[str, Any]]
     def to_dict(self) -> Dict[str, Any]:
         """转换为字典格式，用于 API 请求"""
-        if isinstance(self.config, (ParseConfig, ChunkConfig, EmbedConfig)):
-            return {
-                'type': self.type,
-                'config': self.config.to_dict()
-            }
+        if isinstance(
+            self.config, (ParseConfig, ChunkConfig, EmbedConfig, ExtractConfig)
+        ):
+            return {"type": self.type, "config": self.config.to_dict()}
         else:
             # 如果 config 已经是字典，直接使用
-            return {
-                'type': self.type,
-                'config': self.config
-            }
+            return {"type": self.type, "config": self.config}
     def __repr__(self) -> str:
         return f"Stage(type={self.type!r}, config={self.config!r})"
@@ -94,6 +119,7 @@ class Stage:
 @dataclass
 class PipelineStats:
     """Pipeline 统计信息"""
     original_elements: int = 0
     chunked_elements: int = 0
     embedded_elements: int = 0
@@ -104,26 +130,34 @@ class PipelineStats:
 @dataclass
 class PipelineConfig:
     """Pipeline 配置"""
     include_intermediate_results: bool = False
-    intermediate_results_destination: Optional[Destination] = None  # 支持 Destination 类型，如 LocalDestination、S3Destination 等
+    intermediate_results_destination: Optional[Destination] = (
+        None  # 支持 Destination 类型，如 LocalDestination、S3Destination 等
+    )
     def __post_init__(self):
         """验证配置"""
-        if self.include_intermediate_results and not self.intermediate_results_destination:
-            raise ValueError("当 include_intermediate_results 为 True 时，必须设置 intermediate_results_destination")
+        if (
+            self.include_intermediate_results
+            and not self.intermediate_results_destination
+        ):
+            raise ValueError(
+                "当 include_intermediate_results 为 True 时，必须设置 intermediate_results_destination"
+            )
     def to_dict(self) -> Dict[str, Any]:
         return {
-            'include_intermediate_results': self.include_intermediate_results,
+            "include_intermediate_results": self.include_intermediate_results,
         }
 __all__ = [
-    'ParseConfig',
-    'ChunkConfig',
-    'EmbedConfig',
-    'Stage',
-    'PipelineStats',
-    'PipelineConfig',
+    "ParseConfig",
+    "ChunkConfig",
+    "EmbedConfig",
+    "ExtractConfig",
+    "Stage",
+    "PipelineStats",
+    "PipelineConfig",
 ]

xparse-client 0.2.19__tar.gz → 0.2.20__tar.gz

xparse-client 0.2.19__tar.gz → 0.2.20__tar.gz