xparse-client 0.2.20__py3-none-any.whl → 0.3.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +110 -20
- xparse_client/_base.py +179 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +350 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +188 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +132 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +136 -0
- xparse_client/models/pipeline.py +132 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b2.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b2.dist-info/RECORD +68 -0
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/top_level.txt +2 -0
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -163
- xparse_client/pipeline/destinations.py +0 -489
- xparse_client/pipeline/pipeline.py +0 -860
- xparse_client/pipeline/sources.py +0 -583
- xparse_client-0.2.20.dist-info/METADATA +0 -1050
- xparse_client-0.2.20.dist-info/RECORD +0 -11
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Shared utility helpers for connectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from fnmatch import fnmatch
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def normalize_wildcard_patterns(pattern: list[str] | None) -> list[str] | None:
|
|
15
|
+
"""规范化通配符模式列表
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
pattern: 通配符模式列表,如 ["*.pdf", "*.docx"]
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
规范化后的模式列表,None 表示匹配所有文件
|
|
22
|
+
|
|
23
|
+
Raises:
|
|
24
|
+
ValueError: pattern 类型错误
|
|
25
|
+
"""
|
|
26
|
+
if pattern is None:
|
|
27
|
+
return None
|
|
28
|
+
|
|
29
|
+
if not isinstance(pattern, list):
|
|
30
|
+
raise ValueError(f"pattern 必须是列表类型,当前类型: {type(pattern).__name__}")
|
|
31
|
+
|
|
32
|
+
# 清理空字符串
|
|
33
|
+
normalized = [p.strip() for p in pattern if p and p.strip()]
|
|
34
|
+
|
|
35
|
+
# 空列表或包含 "*" 表示匹配所有
|
|
36
|
+
if not normalized or "*" in normalized:
|
|
37
|
+
return None
|
|
38
|
+
|
|
39
|
+
return normalized
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def match_file_pattern(file_path: str, patterns: list[str] | None) -> bool:
|
|
43
|
+
"""检查文件路径是否匹配通配符模式
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
file_path: 文件路径
|
|
47
|
+
patterns: 通配符模式列表
|
|
48
|
+
|
|
49
|
+
Returns:
|
|
50
|
+
是否匹配
|
|
51
|
+
"""
|
|
52
|
+
if patterns is None:
|
|
53
|
+
return True
|
|
54
|
+
|
|
55
|
+
# 获取文件名用于匹配
|
|
56
|
+
filename = file_path.rsplit("/", 1)[-1] if "/" in file_path else file_path
|
|
57
|
+
|
|
58
|
+
return any(fnmatch(filename, p) for p in patterns)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def to_millis_timestamp(timestamp: float | None) -> str:
|
|
62
|
+
"""将时间戳转换为毫秒字符串
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
timestamp: Unix 时间戳(秒或毫秒)
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
毫秒时间戳字符串,如果输入为 None 则返回空字符串
|
|
69
|
+
"""
|
|
70
|
+
if timestamp is None:
|
|
71
|
+
return ""
|
|
72
|
+
|
|
73
|
+
# 如果已经是毫秒(大于 1e12),直接返回
|
|
74
|
+
if timestamp > 1e12:
|
|
75
|
+
return str(int(timestamp))
|
|
76
|
+
|
|
77
|
+
# 秒转毫秒
|
|
78
|
+
return str(int(timestamp * 1000))
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_current_millis_timestamp() -> str:
    """Return the current UTC time as a millisecond-timestamp string."""
    now = datetime.now(timezone.utc)
    return str(int(now.timestamp() * 1000))
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def flatten_dict(
|
|
87
|
+
data: dict[str, Any],
|
|
88
|
+
prefix: str = "",
|
|
89
|
+
exclude_fields: set | None = None,
|
|
90
|
+
) -> dict[str, Any]:
|
|
91
|
+
"""递归展平嵌套字典
|
|
92
|
+
|
|
93
|
+
用于将 metadata 中的嵌套结构展平为向量数据库的 payload。
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
data: 要展平的字典
|
|
97
|
+
prefix: 键的前缀
|
|
98
|
+
exclude_fields: 需要排除的字段集合
|
|
99
|
+
|
|
100
|
+
Returns:
|
|
101
|
+
展平后的字典
|
|
102
|
+
|
|
103
|
+
Example:
|
|
104
|
+
>>> data = {"a": {"b": 1, "c": 2}, "d": 3}
|
|
105
|
+
>>> flatten_dict(data, "prefix")
|
|
106
|
+
{"prefix_a_b": 1, "prefix_a_c": 2, "prefix_d": 3}
|
|
107
|
+
"""
|
|
108
|
+
if exclude_fields is None:
|
|
109
|
+
exclude_fields = set()
|
|
110
|
+
|
|
111
|
+
result = {}
|
|
112
|
+
for key, value in data.items():
|
|
113
|
+
flat_key = f"{prefix}_{key}" if prefix else key
|
|
114
|
+
|
|
115
|
+
if flat_key in exclude_fields:
|
|
116
|
+
continue
|
|
117
|
+
|
|
118
|
+
if isinstance(value, dict):
|
|
119
|
+
# 递归展平嵌套字典
|
|
120
|
+
nested = flatten_dict(value, flat_key, exclude_fields)
|
|
121
|
+
result.update(nested)
|
|
122
|
+
elif isinstance(value, list):
|
|
123
|
+
# 列表转换为 JSON 字符串
|
|
124
|
+
result[flat_key] = json.dumps(value, ensure_ascii=False)
|
|
125
|
+
else:
|
|
126
|
+
# 其他类型直接使用
|
|
127
|
+
result[flat_key] = value
|
|
128
|
+
|
|
129
|
+
return result
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# Public API of this module.
__all__ = [
    "normalize_wildcard_patterns",
    "match_file_pattern",
    "to_millis_timestamp",
    "get_current_millis_timestamp",
    "flatten_dict",
]
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Destination connectors.

Provide the ability to write processing results to various destinations.

Available destinations:
- LocalDestination: local filesystem
- S3Destination: S3/MinIO object storage (requires pip install xparse-client[s3])
- MilvusDestination: Milvus/Zilliz vector database (requires pip install xparse-client[milvus])
- QdrantDestination: Qdrant vector database (requires pip install xparse-client[qdrant])

Example:
    >>> from xparse_client.connectors.destinations import LocalDestination
    >>> dest = LocalDestination(output_dir="./output")
    >>> dest.write(elements, {"filename": "doc.pdf", "record_id": "xxx"})
"""
|
|
16
|
+
|
|
17
|
+
from .base import Destination, VectorDestinationMixin
|
|
18
|
+
from .local import LocalDestination
|
|
19
|
+
|
|
20
|
+
# Optional destinations are lazy-loaded via module __getattr__ below to avoid hard dependencies.
__all__ = [
    "Destination",
    "VectorDestinationMixin",
    "LocalDestination",
    "S3Destination",
    "MilvusDestination",
    "QdrantDestination",
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def __getattr__(name: str):
    """Lazily import optional Destination classes on first attribute access."""
    if name == "S3Destination":
        from .s3 import S3Destination as loaded
    elif name == "MilvusDestination":
        from .milvus import MilvusDestination as loaded
    elif name == "QdrantDestination":
        from .qdrant import QdrantDestination as loaded
    else:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return loaded
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Abstract base classes for destinations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Destination(ABC):
    """Abstract base class for all destinations.

    Every concrete destination must implement :meth:`write`.

    Example:
        >>> class MyDestination(Destination):
        ...     def write(self, data, metadata):
        ...         # writing logic
        ...         return True
    """

    @abstractmethod
    def write(self, data: list[dict[str, Any]], metadata: dict[str, Any]) -> bool:
        """Write data to the destination.

        Args:
            data: Items to write (element list or structured records).
            metadata: Metadata such as filename, record_id, processed_at.

        Returns:
            True on success.

        Raises:
            DestinationError: If the write fails.
        """
        raise NotImplementedError

    def close(self) -> None:  # noqa: B027
        """Release resources held by this destination.

        Subclasses may override; the default does nothing.
        """

    def __enter__(self) -> Destination:
        """Enter the context manager, returning self."""
        return self

    def __exit__(self, *args: Any) -> None:
        """Exit the context manager, closing the connection."""
        self.close()

    def __repr__(self) -> str:
        return f"<{type(self).__name__}>"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class VectorDestinationMixin:
    """Shared logic for vector-database destinations.

    Provides the payload handling common to Milvus and Qdrant.
    """

    @staticmethod
    def prepare_payload(
        item: dict[str, Any],
        metadata: dict[str, Any],
        record_id: str,
        fixed_fields: set | None = None,
    ) -> dict[str, Any]:
        """Build the payload stored alongside a vector.

        Args:
            item: A single element (contains embeddings, text, metadata, ...).
            metadata: File-level metadata.
            record_id: Record identifier.
            fixed_fields: Fixed field names to exclude.

        Returns:
            The prepared payload dict.
        """
        from .._utils import flatten_dict

        excluded = fixed_fields
        if excluded is None:
            excluded = {"embeddings", "text", "element_id", "record_id", "metadata"}

        # File-level metadata overrides element-level metadata on key clashes.
        combined = {**item.get("metadata", {}), **metadata}

        payload: dict[str, Any] = {
            "text": item.get("text", ""),
            "record_id": record_id,
        }

        for key, value in combined.items():
            if key in excluded:
                continue

            if key == "data_source" and isinstance(value, dict):
                # Flatten the data_source mapping into prefixed top-level keys.
                payload.update(flatten_dict(value, "data_source", excluded))
            elif key == "coordinates" and isinstance(value, list):
                payload[key] = value
            elif isinstance(value, (dict, list)):
                # Other complex values are skipped (subclasses may handle them).
                continue
            else:
                payload[key] = value

        return payload
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# Names exported via `from .base import *`.
__all__ = ["Destination", "VectorDestinationMixin"]
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Local filesystem destination."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from ...exceptions import DestinationError
|
|
11
|
+
from .base import Destination
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LocalDestination(Destination):
    """Local filesystem destination.

    Writes processing results to local JSON files.

    Attributes:
        output_dir: Directory the results are written into.

    Example:
        >>> dest = LocalDestination(output_dir="./output")
        >>> dest.write(elements, {"filename": "doc.pdf"})
    """

    def __init__(self, output_dir: str) -> None:
        """Create the destination and ensure the output directory exists.

        Args:
            output_dir: Path of the output directory.

        Raises:
            DestinationError: If the directory cannot be created.
        """
        self.output_dir = Path(output_dir)

        try:
            self.output_dir.mkdir(parents=True, exist_ok=True)
            logger.info(f"本地输出目录: {self.output_dir}")
        except Exception as e:
            raise DestinationError(
                f"创建输出目录失败: {e}",
                connector_type="local",
                operation="init",
            ) from e

    def write(self, data: list[dict[str, Any]], metadata: dict[str, Any]) -> bool:
        """Write the data to a JSON file inside the output directory.

        Args:
            data: Items to write.
            metadata: Must contain "filename"; an optional "stage" is appended
                to the file name to distinguish intermediate results.

        Returns:
            True on success.

        Raises:
            DestinationError: If writing fails.
        """
        try:
            base_name = Path(metadata.get("filename", "output")).stem
            stage = metadata.get("stage")  # marks intermediate-stage output

            # Compose the output file name, optionally tagged with the stage.
            suffix = f"_{stage}.json" if stage else ".json"
            target = self.output_dir / f"{base_name}{suffix}"

            with open(target, "w", encoding="utf-8") as fh:
                json.dump(data, fh, ensure_ascii=False, indent=2)

            logger.info(f"写入本地文件: {target}")
            return True

        except Exception as e:
            raise DestinationError(
                f"写入本地文件失败: {e}",
                connector_type="local",
                operation="write",
            ) from e

    def __repr__(self) -> str:
        return f"<LocalDestination output_dir={self.output_dir}>"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# Name exported via `from .local import *`.
__all__ = ["LocalDestination"]
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""Milvus/Zilliz vector-database destination (pymilvus is lazily imported)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import uuid
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from ...exceptions import DestinationError
|
|
10
|
+
from .base import Destination, VectorDestinationMixin
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _get_pymilvus():
    """Import pymilvus lazily, raising a helpful error when it is absent."""
    try:
        from pymilvus import DataType, MilvusClient
    except ImportError as e:
        raise ImportError(
            "使用 MilvusDestination 需要安装 pymilvus: pip install xparse-client[milvus]"
        ) from e
    return MilvusClient, DataType
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class MilvusDestination(Destination, VectorDestinationMixin):
    """Milvus/Zilliz vector-database destination.

    Supports local Milvus deployments (Milvus Lite) and Zilliz Cloud.

    Attributes:
        db_path: Database path or Zilliz URL.
        collection_name: Collection name.
        dimension: Vector dimension.

    Example:
        >>> # Milvus Lite (local)
        >>> dest = MilvusDestination(
        ...     db_path="./milvus.db",
        ...     collection_name="documents",
        ...     dimension=1024,
        ... )
        >>>
        >>> # Zilliz Cloud
        >>> dest = MilvusDestination(
        ...     db_path="https://xxx.zillizcloud.com",
        ...     collection_name="documents",
        ...     dimension=1024,
        ...     token="your-token",
        ... )
    """

    def __init__(
        self,
        db_path: str,
        collection_name: str,
        dimension: int,
        api_key: str | None = None,
        token: str | None = None,
    ) -> None:
        """Initialize the Milvus destination.

        Args:
            db_path: Database path (local) or Zilliz Cloud URL.
            collection_name: Collection name.
            dimension: Vector dimension.
            api_key: API key (same purpose as token).
            token: Token (same purpose as api_key; api_key wins if both given).

        Raises:
            ImportError: If pymilvus is not installed.
            DestinationError: If the connection fails.
        """
        MilvusClient, DataType = _get_pymilvus()

        self.db_path = db_path
        self.collection_name = collection_name
        self.dimension = dimension

        client_kwargs = {"uri": db_path}
        if api_key:
            client_kwargs["token"] = api_key
        elif token:
            client_kwargs["token"] = token

        try:
            self.client = MilvusClient(**client_kwargs)

            # Create the collection only if it does not exist yet.
            if not self.client.has_collection(collection_name):
                schema = self.client.create_schema(
                    auto_id=False,
                    enable_dynamic_field=True,
                )
                # Primary key: element_id (string).
                schema.add_field(
                    field_name="element_id",
                    datatype=DataType.VARCHAR,
                    max_length=128,
                    is_primary=True,
                )
                schema.add_field(
                    field_name="embeddings",
                    datatype=DataType.FLOAT_VECTOR,
                    dim=dimension,
                )
                schema.add_field(
                    field_name="text",
                    datatype=DataType.VARCHAR,
                    max_length=65535,
                )
                schema.add_field(
                    field_name="record_id",
                    datatype=DataType.VARCHAR,
                    max_length=200,
                )

                # Cosine-similarity auto index on the vector field.
                index_params = self.client.prepare_index_params()
                index_params.add_index(
                    field_name="embeddings",
                    index_type="AUTOINDEX",
                    metric_type="COSINE",
                )

                self.client.create_collection(
                    collection_name=collection_name,
                    schema=schema,
                    index_params=index_params,
                )
                logger.info(f"Milvus Collection 创建: {collection_name}")
            else:
                logger.info(f"Milvus Collection 已存在: {collection_name}")

        except ImportError:
            # Let the pymilvus install hint propagate unchanged.
            raise
        except Exception as e:
            raise DestinationError(
                f"Milvus 连接失败: {e}",
                connector_type="milvus",
                operation="connect",
                details={"db_path": db_path, "collection_name": collection_name},
            ) from e

    def write(self, data: list[dict[str, Any]], metadata: dict[str, Any]) -> bool:
        """Write vector data to Milvus.

        Existing rows with the same record_id are deleted first (best effort),
        then all items carrying non-empty embeddings are inserted.

        Args:
            data: Element list containing embeddings.
            metadata: Metadata; must contain record_id.

        Returns:
            True on success; False when there is no record_id or no valid data.

        Raises:
            DestinationError: If the write fails.
        """
        record_id = metadata.get("record_id")
        if not record_id:
            logger.warning("没有 record_id,跳过写入")
            return False

        try:
            # Delete stale rows for this record_id; failure here is non-fatal.
            # NOTE(review): the filter is built by string interpolation — a
            # record_id containing a double quote would break/alter the filter;
            # confirm record_id values are controlled upstream.
            try:
                result = self.client.delete(
                    collection_name=self.collection_name,
                    filter=f'record_id == "{record_id}"',
                )
                # pymilvus versions differ: delete() may return an int or a
                # dict with "delete_count"; anything else counts as 0.
                deleted = (
                    result
                    if isinstance(result, int)
                    else result.get("delete_count", 0)
                    if isinstance(result, dict)
                    else 0
                )
                if deleted > 0:
                    logger.info(f"删除 Milvus 旧记录: record_id={record_id}, 数量={deleted}")
            except Exception as e:
                logger.warning(f"删除旧记录失败: {e}")

            # Prepare the rows to insert.
            fixed_fields = {"embeddings", "text", "element_id", "record_id", "metadata"}
            insert_data = []

            for item in data:
                # Skip items without a usable embedding vector.
                if "embeddings" not in item or not item["embeddings"]:
                    continue

                # Fall back to a random UUID when no id is provided.
                element_id = (
                    item.get("element_id") or item.get("id") or str(uuid.uuid4())
                )

                insert_item = {
                    "element_id": element_id,
                    "embeddings": item["embeddings"],
                    "text": item.get("text", ""),
                    "record_id": record_id,
                }

                # Merge in extra payload fields without clobbering core ones.
                payload = self.prepare_payload(item, metadata, record_id, fixed_fields)
                for k, v in payload.items():
                    if k not in insert_item:
                        insert_item[k] = v

                insert_data.append(insert_item)

            if not insert_data:
                logger.warning("没有有效的向量数据")
                return False

            self.client.insert(
                collection_name=self.collection_name,
                data=insert_data,
            )
            logger.info(f"写入 Milvus: {len(insert_data)} 条")
            return True

        except Exception as e:
            raise DestinationError(
                f"写入 Milvus 失败: {e}",
                connector_type="milvus",
                operation="write",
            ) from e

    def __repr__(self) -> str:
        return f"<MilvusDestination db_path={self.db_path} collection={self.collection_name}>"
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# Name exported via `from .milvus import *`.
__all__ = ["MilvusDestination"]
|