xparse-client 0.2.11__py3-none-any.whl → 0.3.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +215 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b3.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +1 -0
  69. example/run_pipeline.py +0 -506
  70. example/run_pipeline_test.py +0 -458
  71. xparse_client/pipeline/__init__.py +0 -3
  72. xparse_client/pipeline/config.py +0 -129
  73. xparse_client/pipeline/destinations.py +0 -487
  74. xparse_client/pipeline/pipeline.py +0 -622
  75. xparse_client/pipeline/sources.py +0 -585
  76. xparse_client-0.2.11.dist-info/METADATA +0 -1050
  77. xparse_client-0.2.11.dist-info/RECORD +0 -13
@@ -0,0 +1,39 @@
+"""Chunk API data models"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class ChunkConfig(BaseModel):
+    """Chunking configuration
+
+    Attributes:
+        strategy: Chunking strategy
+        include_orig_elements: Whether to include the original elements
+        new_after_n_chars: Number of characters after which a new chunk is started
+        max_characters: Maximum number of characters per chunk
+        overlap: Number of overlapping characters
+        overlap_all: Whether all chunks overlap
+
+    Example:
+        >>> config = ChunkConfig(
+        ...     strategy="basic",
+        ...     max_characters=1024,
+        ...     overlap=50
+        ... )
+    """
+
+    strategy: Literal["basic", "by_title", "by_page"] = "basic"
+    include_orig_elements: bool = False
+    new_after_n_chars: int = Field(default=512, ge=1)
+    max_characters: int = Field(default=1024, ge=1)
+    overlap: int = Field(default=0, ge=0)
+    overlap_all: bool = False
+
+    model_config = {"extra": "allow"}
+
+
+__all__ = ["ChunkConfig"]
@@ -0,0 +1,62 @@
+"""Embed API data models"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, field_validator
+
+
+class EmbedConfig(BaseModel):
+    """Embedding configuration
+
+    Attributes:
+        provider: Embedding engine provider
+        model_name: Model name
+
+    Example:
+        >>> config = EmbedConfig(
+        ...     provider="qwen",
+        ...     model_name="text-embedding-v3"
+        ... )
+    """
+
+    provider: Literal["qwen", "doubao"] = "qwen"
+    model_name: Literal[
+        "text-embedding-v3",
+        "text-embedding-v4",
+        "doubao-embedding-large-text-250515",
+        "doubao-embedding-text-240715",
+    ] = "text-embedding-v3"
+
+    model_config = {"extra": "allow"}
+
+    @field_validator("model_name")
+    @classmethod
+    def validate_model_for_provider(cls, v: str, info) -> str:
+        """Validate that model_name matches the provider"""
+        provider = info.data.get("provider", "qwen")
+
+        provider_models = {
+            "qwen": ["text-embedding-v3", "text-embedding-v4"],
+            "doubao": [
+                "doubao-embedding-large-text-250515",
+                "doubao-embedding-text-240715",
+            ],
+        }
+
+        if provider not in provider_models:
+            raise ValueError(
+                f"Unsupported provider: {provider}; supported providers: {list(provider_models.keys())}"
+            )
+
+        if v not in provider_models[provider]:
+            raise ValueError(
+                f"Provider '{provider}' does not support model '{v}'; "
+                f"supported models: {provider_models[provider]}"
+            )
+
+        return v
+
+
+__all__ = ["EmbedConfig"]
@@ -0,0 +1,41 @@
+"""Extract API data models"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class ExtractConfig(BaseModel):
+    """Extraction configuration
+
+    Attributes:
+        schema: JSON Schema describing the structure to extract
+        generate_citations: Whether to generate citations
+        stamp: Whether to add a timestamp
+
+    Example:
+        >>> schema = {
+        ...     "type": "object",
+        ...     "properties": {
+        ...         "invoice_number": {"type": "string"},
+        ...         "total_amount": {"type": "number"}
+        ...     }
+        ... }
+        >>> config = ExtractConfig(schema=schema)
+    """
+
+    schema: dict[str, Any] = Field(
+        default_factory=dict,
+        description="JSON Schema describing the structure to extract"
+    )
+    generate_citations: bool = False
+    stamp: bool = False
+
+    model_config = {
+        "extra": "allow",
+    }
+
+
+__all__ = ["ExtractConfig"]
@@ -0,0 +1,38 @@
+"""Local API data models
+
+Request/response models for the local batch-processing workflow.
+"""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class FailedFile:
+    """Information about a failed file
+
+    Attributes:
+        file_path: File path
+        error: Error message
+        retry_count: Number of retries
+    """
+    file_path: str
+    error: str
+    retry_count: int
+
+
+@dataclass
+class WorkflowResult:
+    """Workflow execution result
+
+    Attributes:
+        total: Total number of files
+        success: Number of successes
+        failed: Number of failures
+        failed_files: List of failed files
+        duration: Total duration (seconds)
+    """
+    total: int
+    success: int
+    failed: int
+    failed_files: list[FailedFile]
+    duration: float
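
A sketch of how a local batch run might be summarized with these dataclasses; the tallies and surrounding runner are assumptions, only FailedFile and WorkflowResult come from this file:

```python
from xparse_client.models.local import FailedFile, WorkflowResult

# Illustrative tallies from a hypothetical local batch run.
failures = [FailedFile(file_path="docs/a.pdf", error="timeout", retry_count=3)]
result = WorkflowResult(
    total=10,
    success=10 - len(failures),
    failed=len(failures),
    failed_files=failures,
    duration=42.5,
)
print(f"{result.success}/{result.total} succeeded in {result.duration:.1f}s")
```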
@@ -0,0 +1,136 @@
+"""Parse API data models"""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, Field
+
+
+class ParseConfig(BaseModel):
+    """Parsing configuration
+
+    Attributes:
+        provider: Parsing engine provider
+
+    Example:
+        >>> config = ParseConfig(provider="textin")
+    """
+
+    provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin"
+
+    model_config = {"extra": "allow"}
+
+
+class ElementMetadata(BaseModel):
+    """Element metadata
+
+    Attributes:
+        page_number: Page number
+        coordinates: Coordinate information
+    """
+
+    page_number: int | None = None
+    coordinates: list[float] | None = None
+
+    model_config = {"extra": "allow"}
+
+
+class Element(BaseModel):
+    """Parsed document element
+
+    Attributes:
+        element_id: Unique element identifier
+        type: Element type (e.g. text, title, table, image)
+        text: Element text content
+        metadata: Element metadata
+        embeddings: Vector embeddings (if the embed stage was run)
+    """
+
+    element_id: str
+    type: str
+    text: str = ""
+    metadata: ElementMetadata | None = None
+    embeddings: list[float] | None = None
+
+    model_config = {"extra": "allow"}
+
+
+class ParseResponse(BaseModel):
+    """Unified response model
+
+    Used for Parse, Extract, Pipeline, and similar API responses.
+
+    Attributes:
+        elements: List of parsed elements (returned by Parse/Chunk/Embed)
+        extract_result: Extraction result (returned by Extract)
+        success_count: Number of successes
+        consume_time: Time consumed
+        record_id: Record ID
+    """
+
+    elements: list[Element] = Field(default_factory=list)
+    extract_result: dict[str, Any] | None = None  # Returned by the Extract API
+    success_count: int | None = None
+    consume_time: str | None = None
+    record_id: str | None = None
+
+    model_config = {"extra": "allow"}
+
+
+class AsyncJobResponse(BaseModel):
+    """Async job creation response
+
+    Attributes:
+        job_id: Job ID
+    """
+
+    job_id: str
+
+
+class JobStatusResponse(BaseModel):
+    """Async job status response
+
+    The async status endpoint only returns the job status and a result URL; it does not return the parsed result.
+    To obtain the parsed result, download the content at result_url separately.
+
+    Attributes:
+        job_id: Job ID
+        file_id: File ID
+        status: Job status ("scheduled", "in_progress", "completed", "failed")
+        result_url: Result file URL (returned when the job completes; must be downloaded separately)
+        error_message: Error message (returned when the job fails)
+    """
+
+    job_id: str
+    file_id: str | None = None
+    status: str  # "scheduled", "in_progress", "completed", "failed"
+    result_url: str | None = None
+    error_message: str | None = None
+
+    model_config = {"extra": "allow"}
+
+    @property
+    def is_completed(self) -> bool:
+        """Whether the job has completed"""
+        return self.status == "completed"
+
+    @property
+    def is_failed(self) -> bool:
+        """Whether the job has failed"""
+        return self.status == "failed"
+
+    @property
+    def is_running(self) -> bool:
+        """Whether the job is still running"""
+        return self.status in ("scheduled", "in_progress")
+
+
+__all__ = [
+    "ParseConfig",
+    "ElementMetadata",
+    "Element",
+    "ParseResponse",
+    "AsyncJobResponse",
+    "JobStatusResponse",
+]
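
The JobStatusResponse properties above are what a polling loop keys on. A sketch, assuming a `get_status` callable that returns a JobStatusResponse for a job ID (that callable is a placeholder, not part of this module); on completion the parsed output still has to be fetched from result_url separately:

```python
import time

from xparse_client.models.parse import JobStatusResponse


def wait_for_job(get_status, job_id: str, interval: float = 2.0) -> JobStatusResponse:
    """Poll `get_status` (a placeholder callable) until the job stops running."""
    while True:
        status: JobStatusResponse = get_status(job_id)
        if status.is_completed or status.is_failed:
            return status
        time.sleep(interval)
```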
@@ -0,0 +1,134 @@
+"""Pipeline API data models"""
+
+from __future__ import annotations
+
+from typing import Any, Literal, Union
+
+from pydantic import BaseModel, Field
+
+from .chunk import ChunkConfig
+from .embed import EmbedConfig
+from .extract import ExtractConfig
+from .parse import Element, ParseConfig
+
+
+class ParseStage(BaseModel):
+    """Parse stage configuration
+
+    Example:
+        >>> stage = ParseStage(
+        ...     type="parse",
+        ...     config=ParseConfig(provider="textin")
+        ... )
+    """
+
+    type: Literal["parse"] = "parse"
+    config: ParseConfig = Field(default_factory=ParseConfig)
+
+
+class ChunkStage(BaseModel):
+    """Chunk stage configuration
+
+    Example:
+        >>> stage = ChunkStage(
+        ...     type="chunk",
+        ...     config=ChunkConfig(strategy="basic")
+        ... )
+    """
+
+    type: Literal["chunk"] = "chunk"
+    config: ChunkConfig = Field(default_factory=ChunkConfig)
+
+
+class EmbedStage(BaseModel):
+    """Embed stage configuration
+
+    Example:
+        >>> stage = EmbedStage(
+        ...     type="embed",
+        ...     config=EmbedConfig(provider="qwen")
+        ... )
+    """
+
+    type: Literal["embed"] = "embed"
+    config: EmbedConfig = Field(default_factory=EmbedConfig)
+
+
+class ExtractStage(BaseModel):
+    """Extract stage configuration
+
+    Example:
+        >>> stage = ExtractStage(
+        ...     type="extract",
+        ...     config=ExtractConfig(schema={"type": "object"})
+        ... )
+    """
+
+    type: Literal["extract"] = "extract"
+    config: ExtractConfig
+
+
+# PipelineStage is the Union of all stage types
+PipelineStage = Union[ParseStage, ChunkStage, EmbedStage, ExtractStage]
+
+
+class PipelineStats(BaseModel):
+    """Pipeline statistics
+
+    Attributes:
+        success_count: Number of successes
+        total_time: Total time
+        original_elements: Number of original elements
+        chunked_elements: Number of elements after chunking
+        embedded_elements: Number of elements after embedding
+        record_id: Record ID
+    """
+
+    success_count: int | None = None
+    total_time: str | None = None
+    original_elements: int | None = None
+    chunked_elements: int | None = None
+    embedded_elements: int | None = None
+    record_id: str | None = None
+
+    model_config = {"extra": "allow"}
+
+
+class PipelineConfig(BaseModel):
+    """Global pipeline configuration
+
+    Attributes:
+        include_intermediate_results: Whether to include intermediate results
+        intermediate_results_destination: Destination where intermediate results are saved (only LocalDestination is supported)
+    """
+
+    include_intermediate_results: bool = False
+    intermediate_results_destination: Any = None  # A LocalDestination object
+
+    model_config = {"extra": "allow", "arbitrary_types_allowed": True}
+
+
+class PipelineResponse(BaseModel):
+    """Pipeline execution response
+
+    Attributes:
+        elements: List of processed elements
+        stats: Statistics
+        extract_result: Result of the extract stage (if any)
+        intermediate_results: Intermediate results (if requested)
+    """
+
+    elements: list[Element] = Field(default_factory=list)
+    stats: PipelineStats | None = None
+    extract_result: dict[str, Any] | None = None
+    intermediate_results: list[dict[str, Any]] | None = None
+
+    model_config = {"extra": "allow"}
+
+
+__all__ = [
+    "PipelineStage",
+    "PipelineStats",
+    "PipelineConfig",
+    "PipelineResponse",
+]
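
A sketch of assembling the stage union above into a parse → chunk → embed pipeline; serializing with `model_dump()` keeps each stage's literal type tag, which is how the union members are told apart:

```python
from xparse_client.models.chunk import ChunkConfig
from xparse_client.models.embed import EmbedConfig
from xparse_client.models.parse import ParseConfig
from xparse_client.models.pipeline import ChunkStage, EmbedStage, ParseStage, PipelineStage

# A parse -> chunk -> embed pipeline expressed with the stage models.
stages: list[PipelineStage] = [
    ParseStage(config=ParseConfig(provider="textin")),
    ChunkStage(config=ChunkConfig(strategy="by_title", max_characters=1024)),
    EmbedStage(config=EmbedConfig(provider="qwen", model_name="text-embedding-v3")),
]

# Each dump carries its literal "type" tag ("parse", "chunk", "embed").
payload = [stage.model_dump() for stage in stages]
```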
@@ -0,0 +1,74 @@
+"""Workflows API data models
+
+Request/response models for remote workflow management.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+
+from pydantic import BaseModel
+
+from .pipeline import PipelineStage
+
+
+class WorkflowState(str, Enum):
+    """Workflow state"""
+    ACTIVE = "active"
+    PAUSED = "paused"
+    ARCHIVED = "archived"
+
+
+class Schedule(BaseModel):
+    """Schedule configuration
+
+    Attributes:
+        cron: Cron expression
+
+    Example:
+        >>> schedule = Schedule(cron="0 0 * * *")  # run every day at midnight
+    """
+    cron: str
+
+
+class WorkflowInformation(BaseModel):
+    """Workflow information
+
+    Attributes:
+        workflow_id: Workflow ID
+        name: Workflow name
+        source_id: Remote source ID
+        destination_id: Remote destination ID
+        stages: List of processing stages
+        schedule: Schedule configuration (optional)
+        state: Workflow state
+        created_at: Creation time
+        updated_at: Last update time
+
+    Example:
+        >>> workflow = WorkflowInformation(
+        ...     workflow_id="wf_123",
+        ...     name="daily-processing",
+        ...     source_id="src_456",
+        ...     destination_id="dst_789",
+        ...     stages=[ParseStage(config=ParseConfig())],
+        ...     schedule=Schedule(cron="0 0 * * *"),
+        ...     state=WorkflowState.ACTIVE,
+        ...     created_at="2026-01-27T10:00:00Z",
+        ...     updated_at="2026-01-27T10:00:00Z"
+        ... )
+    """
+    workflow_id: str
+    name: str
+    source_id: str
+    destination_id: str
+    stages: list[PipelineStage]
+    schedule: Schedule | None = None
+    state: WorkflowState
+    created_at: str
+    updated_at: str
+
+    model_config = {"extra": "allow"}
+
+
+__all__ = ["WorkflowInformation", "WorkflowState", "Schedule"]