xparse-client 0.2.11__py3-none-any.whl → 0.3.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +111 -20
- xparse_client/_base.py +179 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +350 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +215 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +134 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +136 -0
- xparse_client/models/pipeline.py +134 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b3.dist-info/RECORD +68 -0
- {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +1 -0
- example/run_pipeline.py +0 -506
- example/run_pipeline_test.py +0 -458
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -129
- xparse_client/pipeline/destinations.py +0 -487
- xparse_client/pipeline/pipeline.py +0 -622
- xparse_client/pipeline/sources.py +0 -585
- xparse_client-0.2.11.dist-info/METADATA +0 -1050
- xparse_client-0.2.11.dist-info/RECORD +0 -13
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""Parse API - 文档解析
|
|
2
|
+
|
|
3
|
+
支持同步和异步两种解析模式。
|
|
4
|
+
|
|
5
|
+
Example:
|
|
6
|
+
>>> # 同步解析
|
|
7
|
+
>>> result = client.parse.partition(file=file_bytes, filename="doc.pdf")
|
|
8
|
+
>>>
|
|
9
|
+
>>> # 异步解析
|
|
10
|
+
>>> job = client.parse.create_async_job(file=file_bytes, filename="doc.pdf")
|
|
11
|
+
>>> result = client.parse.get_result(job_id=job.job_id)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import time
|
|
18
|
+
from typing import TYPE_CHECKING
|
|
19
|
+
|
|
20
|
+
from .._base import BaseAPI
|
|
21
|
+
from ..exceptions import APIError, RequestTimeoutError
|
|
22
|
+
from ..models.parse import (
|
|
23
|
+
AsyncJobResponse,
|
|
24
|
+
JobStatusResponse,
|
|
25
|
+
ParseConfig,
|
|
26
|
+
ParseResponse,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Parse(BaseAPI):
    """Parse API - document parsing.

    Offers document parsing in two modes.

    Synchronous mode:
        The parse result is returned directly. Best for small files or
        callers that need the result immediately.

    Asynchronous mode:
        A job_id is returned and the result is obtained by polling.
        Best for large files.

    Attributes:
        _base_path: URL prefix shared by every Parse endpoint.
    """

    _base_path = "/api/xparse"

    @staticmethod
    def _multipart_payload(
        file: bytes,
        filename: str,
        config: ParseConfig | None,
    ) -> tuple[dict, dict]:
        """Build the ``files``/``data`` multipart payload shared by the
        sync and async upload endpoints.

        The parse config, when given, is sent as a JSON string form
        field; ``ensure_ascii=False`` keeps non-ASCII text readable.
        """
        files = {"file": (filename, file)}
        data: dict[str, str] = {}
        if config:
            data["config"] = json.dumps(config.model_dump(), ensure_ascii=False)
        return files, data

    def partition(
        self,
        *,
        file: bytes,
        filename: str,
        config: ParseConfig | None = None,
    ) -> ParseResponse:
        """Parse a document synchronously.

        Args:
            file: Raw file content (bytes).
            filename: File name (used to determine the file type).
            config: Optional parse configuration.

        Returns:
            ParseResponse: Parse result containing the element list.

        Raises:
            ValidationError: Argument validation failed.
            APIError: The API call failed.

        Example:
            >>> result = client.parse.partition(
            ...     file=file_bytes,
            ...     filename="document.pdf",
            ...     config=ParseConfig(provider="textin")
            ... )
            >>> for element in result.elements:
            ...     print(f"{element.type}: {element.text[:50]}")
        """
        files, data = self._multipart_payload(file, filename, config)
        response = self._post("/parse/sync", files=files, data=data)
        return self._parse_response(response, ParseResponse)

    def create_async_job(
        self,
        *,
        file: bytes,
        filename: str,
        config: ParseConfig | None = None,
        webhook: str | None = None,
    ) -> AsyncJobResponse:
        """Create an asynchronous parse job.

        Creates a background job for the document; suited to large
        files or batch processing scenarios.

        Args:
            file: Raw file content.
            filename: File name.
            config: Optional parse configuration.
            webhook: Optional callback URL invoked when the job finishes.

        Returns:
            AsyncJobResponse: Contains the job_id.

        Example:
            >>> job = client.parse.create_async_job(
            ...     file=file_bytes,
            ...     filename="large_doc.pdf",
            ...     webhook="https://example.com/callback"
            ... )
            >>> print(f"Job created: {job.job_id}")
        """
        files, data = self._multipart_payload(file, filename, config)
        if webhook:
            data["webhook"] = webhook

        response = self._post("/parse/async", files=files, data=data)
        return self._parse_response(response, AsyncJobResponse)

    def get_result(self, *, job_id: str) -> JobStatusResponse:
        """Fetch the status and result of an asynchronous parse job.

        Args:
            job_id: Job identifier.

        Returns:
            JobStatusResponse: Job status and, when finished, its result.

        Example:
            >>> result = client.parse.get_result(job_id="job_abc123")
            >>> if result.is_completed:
            ...     print(f"Done: {len(result.elements)} elements")
            >>> elif result.is_failed:
            ...     print(f"Failed: {result.error_message}")
        """
        response = self._get(f"/parse/async/{job_id}")
        return self._parse_response(response, JobStatusResponse)

    def wait_for_result(
        self,
        *,
        job_id: str,
        timeout_seconds: float = 3600,
        poll_interval_seconds: float = 5,
    ) -> JobStatusResponse:
        """Poll an asynchronous job until it finishes or times out.

        Uses a monotonic clock so wall-clock adjustments (NTP, DST)
        cannot corrupt the timeout measurement, and caps each sleep so
        the total wait never overshoots the deadline by a whole
        polling interval.

        Args:
            job_id: Job identifier.
            timeout_seconds: Overall timeout in seconds, default 3600.
            poll_interval_seconds: Polling interval in seconds, default 5.

        Returns:
            JobStatusResponse: The completed job result.

        Raises:
            RequestTimeoutError: The job did not finish in time.
            APIError: The job failed.

        Example:
            >>> job = client.parse.create_async_job(file=file_bytes, filename="doc.pdf")
            >>> result = client.parse.wait_for_result(
            ...     job_id=job.job_id,
            ...     timeout_seconds=600,
            ...     poll_interval_seconds=10
            ... )
        """
        deadline = time.monotonic() + timeout_seconds

        while True:
            result = self.get_result(job_id=job_id)

            if result.is_completed:
                return result

            if result.is_failed:
                raise APIError(
                    f"解析任务失败: {result.error_message}",
                    details={"job_id": job_id},
                )

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise RequestTimeoutError(
                    f"等待解析任务超时: {job_id}",
                    timeout_seconds=timeout_seconds,
                )

            # Sleep only as long as the deadline allows.
            time.sleep(min(poll_interval_seconds, remaining))
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
__all__ = ["Parse"]
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Pipeline API - 自定义流水线
|
|
2
|
+
|
|
3
|
+
支持自定义 stages 组合执行。
|
|
4
|
+
|
|
5
|
+
Example:
|
|
6
|
+
>>> stages = [
|
|
7
|
+
... PipelineStage(type="parse", config={"provider": "textin"}),
|
|
8
|
+
... PipelineStage(type="chunk", config={"strategy": "basic"}),
|
|
9
|
+
... PipelineStage(type="embed", config={"provider": "qwen"})
|
|
10
|
+
... ]
|
|
11
|
+
>>> result = client.pipeline.execute(
|
|
12
|
+
... file=file_bytes,
|
|
13
|
+
... filename="doc.pdf",
|
|
14
|
+
... stages=stages
|
|
15
|
+
... )
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
from typing import TYPE_CHECKING, Any
|
|
22
|
+
|
|
23
|
+
from .._base import BaseAPI
|
|
24
|
+
from ..models.pipeline import (
|
|
25
|
+
PipelineConfig,
|
|
26
|
+
PipelineResponse,
|
|
27
|
+
PipelineStage,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
pass
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PipelineAPI(BaseAPI):
    """Pipeline API - custom processing pipelines.

    Supports flexible stage combinations:
    - parse: document parsing (required; must be the first stage)
    - chunk: text chunking
    - embed: vectorisation
    - extract: information extraction (must immediately follow parse)

    Example:
        >>> # RAG scenario: parse -> chunk -> embed
        >>> stages = [
        ...     PipelineStage(type="parse", config={"provider": "textin"}),
        ...     PipelineStage(type="chunk", config={"strategy": "basic", "max_characters": 1000}),
        ...     PipelineStage(type="embed", config={"provider": "qwen", "model": "text-embedding-v3"})
        ... ]
        >>> result = client.pipeline.execute(file=file_bytes, filename="doc.pdf", stages=stages)
        >>>
        >>> # Structured extraction scenario: parse -> extract
        >>> stages = [
        ...     PipelineStage(type="parse"),
        ...     PipelineStage(type="extract", config={"schema": {...}})
        ... ]
        >>> result = client.pipeline.execute(file=file_bytes, filename="invoice.pdf", stages=stages)
    """

    _base_path = "/api/xparse"

    def execute(
        self,
        *,
        file: bytes,
        filename: str,
        stages: list[PipelineStage | dict[str, Any]],
        config: PipelineConfig | None = None,
        data_source: dict[str, Any] | None = None,
    ) -> PipelineResponse:
        """Execute a pipeline synchronously.

        Args:
            file: Raw file content.
            filename: File name.
            stages: Stage configurations (PipelineStage objects or plain dicts).
            config: Optional global pipeline configuration.
            data_source: Optional data-source info, used for tracing.

        Returns:
            PipelineResponse: Execution result.

        Example:
            >>> stages = [
            ...     {"type": "parse", "config": {"provider": "textin"}},
            ...     {"type": "chunk", "config": {"strategy": "basic"}},
            ...     {"type": "embed", "config": {"provider": "qwen"}}
            ... ]
            >>> result = client.pipeline.execute(
            ...     file=file_bytes,
            ...     filename="doc.pdf",
            ...     stages=stages
            ... )
            >>> print(f"Processed {len(result.elements)} elements")
        """
        # Normalise every stage to a plain dict. Pydantic models are
        # dumped with by_alias=True so aliased fields (e.g. ``schema_``)
        # serialise under their wire names (``schema``).
        serialisable_stages = [
            stage.model_dump(by_alias=True) if hasattr(stage, "model_dump") else stage
            for stage in stages
        ]

        form = {"stages": json.dumps(serialisable_stages, ensure_ascii=False)}

        if config:
            # Only send fields the API understands;
            # intermediate_results_destination is client-side only.
            trimmed = config.model_dump(exclude={"intermediate_results_destination"})
            form["config"] = json.dumps(trimmed, ensure_ascii=False)
        if data_source:
            form["data_source"] = json.dumps(data_source, ensure_ascii=False)

        attachment = {"file": (filename, file)}
        response = self._post("/pipeline", files=attachment, data=form)
        return self._parse_response(response, PipelineResponse)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
__all__ = ["PipelineAPI"]
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Workflows API - 远程工作流管理
|
|
2
|
+
|
|
3
|
+
管理服务端工作流资源,支持创建、列出、更新、删除和运行。
|
|
4
|
+
|
|
5
|
+
与 local.run_workflow 的区别:
|
|
6
|
+
- workflows: 远程管理,在服务端执行,支持 cron 定时,source/destination 使用 ID
|
|
7
|
+
- local: 本地执行,同步阻塞,source/destination 使用本地对象
|
|
8
|
+
|
|
9
|
+
Example:
|
|
10
|
+
>>> from xparse_client import XParseClient
|
|
11
|
+
>>> from xparse_client.models import ParseStage, ParseConfig, Schedule
|
|
12
|
+
>>>
|
|
13
|
+
>>> client = XParseClient(app_id="...", secret_code="...")
|
|
14
|
+
>>>
|
|
15
|
+
>>> # 创建工作流(目前骨架实现,未连接真实 API)
|
|
16
|
+
>>> workflow = client.workflows.create(
|
|
17
|
+
... name="daily-processing",
|
|
18
|
+
... source_id="src_123",
|
|
19
|
+
... destination_id="dst_456",
|
|
20
|
+
... stages=[ParseStage(config=ParseConfig())],
|
|
21
|
+
... schedule=Schedule(cron="0 0 * * *")
|
|
22
|
+
... )
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import builtins
|
|
28
|
+
from typing import TYPE_CHECKING
|
|
29
|
+
|
|
30
|
+
from .._base import BaseAPI
|
|
31
|
+
from ..models.pipeline import PipelineStage
|
|
32
|
+
from ..models.workflows import Schedule, WorkflowInformation, WorkflowState
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
pass
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Workflows(BaseAPI):
    """Workflows API - remote workflow management.

    Manages server-side workflows: create, query, update, delete and
    manual triggering.

    Note: this is currently a skeleton; every method raises
    NotImplementedError.

    Attributes:
        _base_path: URL prefix for the workflow endpoints.
    """

    _base_path = "/api/xparse"

    def create(
        self,
        *,
        name: str,
        source_id: str,
        destination_id: str,
        stages: builtins.list[PipelineStage],
        schedule: Schedule | None = None,
    ) -> WorkflowInformation:
        """Create a remote workflow configuration on the server.

        Args:
            name: Workflow name.
            source_id: Remote source ID (created via the sources API).
            destination_id: Remote destination ID (created via the destinations API).
            stages: Processing stage list.
            schedule: Optional schedule; omit for manual triggering only.

        Returns:
            WorkflowInformation: Information about the created workflow.

        Raises:
            NotImplementedError: Skeleton; not yet wired to a real API.

        Example:
            >>> workflow = client.workflows.create(
            ...     name="daily-docs",
            ...     source_id="src_s3_123",
            ...     destination_id="dst_milvus_456",
            ...     stages=[ParseStage(config=ParseConfig())],
            ...     schedule=Schedule(cron="0 0 * * *")
            ... )
        """
        raise NotImplementedError("Workflows.create 尚未实现")

    def list(
        self,
        *,
        state: WorkflowState | None = None,
        limit: int = 100,
        offset: int = 0,
    ) -> builtins.list[WorkflowInformation]:
        """List workflows, optionally filtered by state.

        Args:
            state: Optional state filter.
            limit: Maximum number of results.
            offset: Result offset.

        Returns:
            List[WorkflowInformation]: The matching workflows.

        Raises:
            NotImplementedError: Skeleton; not yet wired to a real API.

        Example:
            >>> workflows = client.workflows.list(state=WorkflowState.ACTIVE)
        """
        raise NotImplementedError("Workflows.list 尚未实现")

    def get(self, *, workflow_id: str) -> WorkflowInformation:
        """Get the details of a single workflow.

        Args:
            workflow_id: Workflow ID.

        Returns:
            WorkflowInformation: The workflow's information.

        Raises:
            NotImplementedError: Skeleton; not yet wired to a real API.

        Example:
            >>> workflow = client.workflows.get(workflow_id="wf_123")
        """
        raise NotImplementedError("Workflows.get 尚未实现")

    def update(
        self,
        *,
        workflow_id: str,
        name: str | None = None,
        stages: builtins.list[PipelineStage] | None = None,
        schedule: Schedule | None = None,
        state: WorkflowState | None = None,
    ) -> WorkflowInformation:
        """Update a workflow's configuration.

        Args:
            workflow_id: Workflow ID.
            name: Optional new name.
            stages: Optional new processing stages.
            schedule: Optional new schedule.
            state: Optional new state (used to pause/resume).

        Returns:
            WorkflowInformation: The updated workflow information.

        Raises:
            NotImplementedError: Skeleton; not yet wired to a real API.

        Example:
            >>> workflow = client.workflows.update(
            ...     workflow_id="wf_123",
            ...     state=WorkflowState.PAUSED
            ... )
        """
        raise NotImplementedError("Workflows.update 尚未实现")

    def delete(self, *, workflow_id: str) -> bool:
        """Delete a workflow.

        Args:
            workflow_id: Workflow ID.

        Returns:
            bool: Whether the deletion succeeded.

        Raises:
            NotImplementedError: Skeleton; not yet wired to a real API.

        Example:
            >>> client.workflows.delete(workflow_id="wf_123")
        """
        raise NotImplementedError("Workflows.delete 尚未实现")

    def run(self, *, workflow_id: str) -> str:
        """Trigger a workflow run immediately, bypassing the schedule.

        Args:
            workflow_id: Workflow ID.

        Returns:
            str: The run's task ID.

        Raises:
            NotImplementedError: Skeleton; not yet wired to a real API.

        Example:
            >>> task_id = client.workflows.run(workflow_id="wf_123")
            >>> print(f"Task created: {task_id}")
        """
        raise NotImplementedError("Workflows.run 尚未实现")
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
__all__ = ["Workflows"]
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""连接器模块
|
|
2
|
+
|
|
3
|
+
提供从各种数据源读取文件和向各种目的地写入结果的能力。
|
|
4
|
+
|
|
5
|
+
数据源 (Sources):
|
|
6
|
+
- LocalSource: 本地文件系统
|
|
7
|
+
- S3Source: S3/MinIO 对象存储
|
|
8
|
+
- FtpSource: FTP 服务器
|
|
9
|
+
- SmbSource: SMB/CIFS 共享
|
|
10
|
+
|
|
11
|
+
目的地 (Destinations):
|
|
12
|
+
- LocalDestination: 本地文件系统
|
|
13
|
+
- S3Destination: S3/MinIO 对象存储
|
|
14
|
+
- MilvusDestination: Milvus/Zilliz 向量数据库
|
|
15
|
+
- QdrantDestination: Qdrant 向量数据库
|
|
16
|
+
|
|
17
|
+
Example:
|
|
18
|
+
>>> from xparse_client.connectors import LocalSource, LocalDestination
|
|
19
|
+
>>> source = LocalSource(directory="./docs", pattern=["*.pdf"])
|
|
20
|
+
>>> dest = LocalDestination(output_dir="./output")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
# Re-export the public connector classes so callers can import them
# directly from ``xparse_client.connectors``.
from .destinations import (
    Destination,
    LocalDestination,
    MilvusDestination,
    QdrantDestination,
    S3Destination,
)
from .sources import FtpSource, LocalSource, S3Source, SmbSource, Source

# Explicit public API of this package (controls ``import *``).
__all__ = [
    # Sources
    "Source",
    "LocalSource",
    "S3Source",
    "FtpSource",
    "SmbSource",
    # Destinations
    "Destination",
    "LocalDestination",
    "S3Destination",
    "MilvusDestination",
    "QdrantDestination",
]
|