PyPI - xparse-client - Version diff - 0.3.0b1.tar.gz → 0.3.0b6.tar.gz - Mend

xparse-client 0.3.0b1.tar.gz → 0.3.0b6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

{xparse_client-0.3.0b1/xparse_client.egg-info → xparse_client-0.3.0b6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.3.0b1
+Version: 0.3.0b6
 Summary: 面向 Agent 和 RAG 的文档处理 Pipeline 客户端
 Author-email: INTSIG-TEXTIN <support@textin.com>
 License-Expression: MIT
@@ -213,17 +213,17 @@ schema = {
         "author": {"type": "string", "description": "作者"},
         "date": {"type": "string", "description": "日期"}
     },
-    "required": ["title"]
+    "required": ["title", "author", "date"]
 }
 with open("document.pdf", "rb") as f:
     result = client.extract.extract(
         file=f,
         filename="document.pdf",
-        config=ExtractConfig(schema=schema)
+        extract_config=ExtractConfig(schema=schema)
     )
-print(result.extracted_data)
+print(result.result)
 ```
 ### 3. 本地批处理

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/README.md RENAMED Viewed

@@ -164,17 +164,17 @@ schema = {
         "author": {"type": "string", "description": "作者"},
         "date": {"type": "string", "description": "日期"}
     },
-    "required": ["title"]
+    "required": ["title", "author", "date"]
 }
 with open("document.pdf", "rb") as f:
     result = client.extract.extract(
         file=f,
         filename="document.pdf",
-        config=ExtractConfig(schema=schema)
+        extract_config=ExtractConfig(schema=schema)
     )
-print(result.extracted_data)
+print(result.result)
 ```
 ### 3. 本地批处理

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/example/2_async_job.py RENAMED Viewed

@@ -165,7 +165,7 @@ def example_3_error_handling():
         except Exception as e:
             print(f"\n⏸️  捕获到超时异常: {type(e).__name__}")
-            print(f"  这是正常的，演示了如何处理超时情况")
+            print("  这是正常的，演示了如何处理超时情况")
     finally:
         test_file.unlink(missing_ok=True)

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/example/4_advanced_workflow.py RENAMED Viewed

@@ -91,12 +91,12 @@ def example_1_config_file():
     XParseClient.from_env()
     # 演示如何从配置构建 source 和 destination（不实际创建）
-    print(f"\n✅ 配置加载成功!")
-    print(f"  - Source 配置:")
+    print("\n✅ 配置加载成功!")
+    print("  - Source 配置:")
     print(f"      目录: {loaded_config['source']['directory']}")
     print(f"      模式: {loaded_config['source']['pattern']}")
     print(f"      递归: {loaded_config['source']['recursive']}")
-    print(f"  - Destination 配置:")
+    print("  - Destination 配置:")
     print(f"      输出目录: {loaded_config['destination']['output_dir']}")
     # 构建 stages

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "xparse-client"
-version = "0.3.0b1"
+version = "0.3.0b6"
 description = "面向 Agent 和 RAG 的文档处理 Pipeline 客户端"
 readme = "README.md"
 license = "MIT"

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/tests/unit/models/test_pipeline_stages.py RENAMED Viewed

@@ -89,8 +89,8 @@ def test_extract_stage_with_config():
     assert stage.type == "extract"
     assert stage.config.generate_citations is True
-    # 序列化检查
-    dumped = stage.model_dump()
+    # 序列化检查（使用 by_alias=True 来确保 schema_ 序列化为 schema）
+    dumped = stage.model_dump(by_alias=True)
     assert "schema" in dumped["config"]
     assert dumped["config"]["schema"] == schema

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/_base.py RENAMED Viewed

@@ -6,7 +6,7 @@
 from __future__ import annotations
 from abc import ABC
-from typing import TYPE_CHECKING, Any, TypeVar
+from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
 import httpx
@@ -14,7 +14,16 @@ if TYPE_CHECKING:
     from ._config import SDKConfiguration
     from ._http import HTTPClient
-T = TypeVar("T")
+class PydanticModel(Protocol):
+    """Pydantic 模型协议"""
+    @classmethod
+    def model_validate(cls, obj: Any) -> Any:
+        ...
+T = TypeVar("T", bound=PydanticModel)
 class BaseAPI(ABC):  # noqa: B024
@@ -130,7 +139,7 @@ class BaseAPI(ABC):  # noqa: B024
         if isinstance(data, dict) and "data" in data:
             data = data["data"]
-        return model_class.model_validate(data)
+        return cast(T, model_class.model_validate(data))
     def _parse_list_response(
         self,
@@ -156,7 +165,7 @@ class BaseAPI(ABC):  # noqa: B024
         if not isinstance(data, list):
             data = [data]
-        return [model_class.model_validate(item) for item in data]
+        return [cast(T, model_class.model_validate(item)) for item in data]
     def _parse_raw_response(self, response: httpx.Response) -> dict[str, Any]:
         """解析原始 JSON 响应
@@ -171,9 +180,9 @@ class BaseAPI(ABC):  # noqa: B024
         # 处理标准响应格式
         if isinstance(data, dict) and "data" in data:
-            return data["data"]
+            return cast(dict[str, Any], data["data"])
-        return data
+        return cast(dict[str, Any], data)
 __all__ = ["BaseAPI"]

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/_http.py RENAMED Viewed

@@ -53,7 +53,8 @@ def _extract_error_message(response: httpx.Response) -> str:
                     return str(data[key])
             # 如果有嵌套的 error 对象
             if "error" in data and isinstance(data["error"], dict):
-                return data["error"].get("message", str(data["error"]))
+                error_dict = data["error"]
+                return str(error_dict.get("message", error_dict))
         return str(data)
     except Exception:
         # 非 JSON 响应，返回文本内容
@@ -106,7 +107,7 @@ def raise_for_status(response: httpx.Response) -> None:
         message = _extract_error_message(response)
         response_body = response.text[:1000] if response.text else None
-        common_kwargs = {
+        common_kwargs: dict[str, Any] = {
             "status_code": status_code,
             "request_id": request_id,
             "response_body": response_body,
@@ -140,24 +141,24 @@ def raise_for_status(response: httpx.Response) -> None:
                 message = data.get("message", f"业务错误 code: {code}")
                 response_body = response.text[:1000] if response.text else None
-                common_kwargs = {
+                error_kwargs: dict[str, Any] = {
                     "request_id": request_id,
                     "response_body": response_body,
                 }
                 # 根据业务 code 映射到相应异常
                 if code == 400:
-                    raise ValidationError(message, details=common_kwargs)
+                    raise ValidationError(message, details=error_kwargs)
                 elif code == 401:
-                    raise AuthenticationError(message, **common_kwargs)
+                    raise AuthenticationError(message, **error_kwargs)
                 elif code == 403:
-                    raise PermissionDeniedError(message, **common_kwargs)
+                    raise PermissionDeniedError(message, **error_kwargs)
                 elif code == 404:
-                    raise NotFoundError(message, **common_kwargs)
+                    raise NotFoundError(message, **error_kwargs)
                 elif code >= 500:
-                    raise ServerError(message, **common_kwargs)
+                    raise ServerError(message, **error_kwargs)
                 else:
-                    raise APIError(message, **common_kwargs)
+                    raise APIError(message, **error_kwargs)
     except (ValueError, KeyError):
         # 如果无法解析 JSON 或没有 code 字段，认为是成功
         pass

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/api/extract.py RENAMED Viewed

@@ -79,7 +79,8 @@ class Extract(BaseAPI):
             ...             "properties": {
             ...                 "invoice_number": {"type": "string"},
             ...                 "total_amount": {"type": "number"}
-            ...             }
+            ...             },
+            ...             "required": ["invoice_number", "total_amount"]
             ...         },
             ...         generate_citations=True
             ...     )
@@ -89,15 +90,11 @@ class Extract(BaseAPI):
         data = {}
         if parse_config:
-            data["parse_config"] = json.dumps(
-                parse_config.model_dump(), ensure_ascii=False
-            )
+            data["parse_config"] = json.dumps(parse_config.model_dump(), ensure_ascii=False)
         # 处理 extract_config
         if extract_config:
-            data["extract_config"] = json.dumps(
-                extract_config.model_dump(), ensure_ascii=False
-            )
+            data["extract_config"] = json.dumps(extract_config.model_dump(), ensure_ascii=False)
         else:
             raise ValueError("extract_config is required")
@@ -105,5 +102,4 @@ class Extract(BaseAPI):
         return self._parse_response(response, ParseResponse)
 __all__ = ["Extract"]

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/api/local.py RENAMED Viewed

@@ -24,11 +24,11 @@ Example:
 from __future__ import annotations
-from typing import TYPE_CHECKING, Callable, Literal
+from typing import TYPE_CHECKING, Any, Callable, Literal
 from .._base import BaseAPI
 from ..models.local import FailedFile, WorkflowResult
-from ..models.pipeline import PipelineStage
+from ..models.pipeline import PipelineConfig, PipelineStage
 if TYPE_CHECKING:
     pass
@@ -53,8 +53,9 @@ class Local(BaseAPI):
         self,
         source,  # Union[LocalSource, S3Source, FtpSource, SmbSource]
         destination,  # Union[LocalDestination, S3Destination, MilvusDestination, QdrantDestination]
-        stages: list[PipelineStage],
+        stages: list[PipelineStage | dict[str, Any]],
         *,
+        pipeline_config: PipelineConfig | None = None,
         progress_callback: Callable[[int, int, str], None] | None = None,
         on_error: Literal["stop", "continue", "retry"] = "stop",
         max_retries: int = 3,
@@ -75,6 +76,7 @@ class Local(BaseAPI):
             source: 数据源（本地 Connector 对象）
             destination: 输出目的地（本地 Connector 对象）
             stages: 处理阶段列表
+            pipeline_config: Pipeline 配置（包含中间结果保存配置）
             progress_callback: 进度回调函数 (current, total, message) -> None
             on_error: 错误处理策略 ("stop"|"continue"|"retry")
             max_retries: 最大重试次数（on_error="retry" 时生效）
@@ -125,20 +127,56 @@ class Local(BaseAPI):
                 # 调用 Pipeline API
                 from .pipeline import PipelineAPI
                 pipeline_api = PipelineAPI(self._config, self._http)
                 result = pipeline_api.execute(
                     file=file_bytes,
                     filename=filename,
                     stages=stages,
+                    config=pipeline_config,
                     data_source=data_source,
                 )
+                # 处理中间结果
+                if (
+                    pipeline_config
+                    and pipeline_config.include_intermediate_results
+                    and pipeline_config.intermediate_results_destination
+                    and hasattr(result, "intermediate_results")
+                    and result.intermediate_results
+                ):
+                    for stage_result in result.intermediate_results:
+                        stage_name = stage_result.get("stage")
+                        elements = stage_result.get("elements", [])
+                        if stage_name and elements:
+                            # 转换 elements 为字典列表
+                            elements_data = []
+                            for elem in elements:
+                                if hasattr(elem, "model_dump"):
+                                    elements_data.append(elem.model_dump())
+                                elif isinstance(elem, dict):
+                                    elements_data.append(elem)
+                                else:
+                                    elements_data.append(elem)
+                            metadata_with_stage = {
+                                "filename": filename,
+                                "file_path": file_path,
+                                "stage": stage_name,
+                            }
+                            if result.stats and hasattr(result.stats, "record_id"):
+                                metadata_with_stage["record_id"] = result.stats.record_id
+                            pipeline_config.intermediate_results_destination.write(
+                                elements_data, metadata_with_stage
+                            )
                 # 准备写入数据
                 # 如果是向量数据库，写入 embeddings
-                if hasattr(result, 'elements') and result.elements:
+                if hasattr(result, "elements") and result.elements:
                     elements_data = []
                     for elem in result.elements:
-                        elem_dict = elem.model_dump() if hasattr(elem, 'model_dump') else elem
+                        elem_dict = elem.model_dump() if hasattr(elem, "model_dump") else elem
                         elements_data.append(elem_dict)
                 else:
                     elements_data = []
@@ -148,8 +186,8 @@ class Local(BaseAPI):
                     "filename": filename,
                     "file_path": file_path,
                 }
-                if hasattr(result, 'record_id'):
-                    metadata["record_id"] = result.record_id
+                if result.stats and hasattr(result.stats, "record_id"):
+                    metadata["record_id"] = result.stats.record_id
                 destination.write(elements_data, metadata)
@@ -164,11 +202,13 @@ class Local(BaseAPI):
                 else:
                     # continue: 记录失败并继续下一个文件
                     failed += 1
-                    failed_files.append(FailedFile(
-                        file_path=file_path,
-                        error=error_msg,
-                        retry_count=0,  # HTTP客户端内部已经处理了重试
-                    ))
+                    failed_files.append(
+                        FailedFile(
+                            file_path=file_path,
+                            error=error_msg,
+                            retry_count=0,  # HTTP客户端内部已经处理了重试
+                        )
+                    )
         # 计算总耗时
         duration = time.time() - start_time

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/api/pipeline.py RENAMED Viewed

@@ -120,7 +120,9 @@ class PipelineAPI(BaseAPI):
         }
         if config:
-            data["config"] = json.dumps(config.model_dump(), ensure_ascii=False)
+            # 只序列化 API 需要的字段（排除 intermediate_results_destination）
+            config_dict = config.model_dump(exclude={"intermediate_results_destination"})
+            data["config"] = json.dumps(config_dict, ensure_ascii=False)
         if data_source:
             data["data_source"] = json.dumps(data_source, ensure_ascii=False)

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/destinations/qdrant.py RENAMED Viewed

@@ -90,7 +90,7 @@ class QdrantDestination(Destination, VectorDestinationMixin):
         self.dimension = dimension
         self._PointStruct = PointStruct
-        client_kwargs = {"url": url}
+        client_kwargs: dict[str, Any] = {"url": url}
         if api_key:
             client_kwargs["api_key"] = api_key
         if prefer_grpc:

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/sources/ftp.py RENAMED Viewed

@@ -90,7 +90,7 @@ class FtpSource(Source):
             SourceError: 列出文件失败
         """
         try:
-            files = []
+            files: list[str] = []
             current_dir = self.client.pwd()
             if self.recursive:
@@ -178,7 +178,7 @@ class FtpSource(Source):
         # 回退到 LIST 命令
         try:
-            lines = []
+            lines: list[str] = []
             self.client.retrlines("LIST", lines.append)
             for line in lines:
                 parts = line.split()

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/connectors/sources/smb.py RENAMED Viewed

@@ -125,7 +125,7 @@ class SmbSource(Source):
             SourceError: 列出文件失败
         """
         try:
-            files = []
+            files: list[str] = []
             base_path = "/" if not self.path else f"/{self.path}"
             self._list_recursive(base_path, base_path, files)

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/extract.py RENAMED Viewed

@@ -11,7 +11,7 @@ class ExtractConfig(BaseModel):
     """抽取配置
     Attributes:
-        schema: JSON Schema 定义抽取的结构
+        schema_: JSON Schema 定义抽取的结构（序列化为 schema）
         generate_citations: 是否生成引用
         stamp: 是否添加时间戳
@@ -23,11 +23,12 @@ class ExtractConfig(BaseModel):
         ...         "total_amount": {"type": "number"}
         ...     }
         ... }
-        >>> config = ExtractConfig(schema=schema)
+        >>> config = ExtractConfig(schema_=schema)
     """
-    schema: dict[str, Any] = Field(
+    schema_: dict[str, Any] = Field(
         default_factory=dict,
+        alias="schema",
         description="JSON Schema 定义抽取的结构"
     )
     generate_citations: bool = False
@@ -35,6 +36,7 @@ class ExtractConfig(BaseModel):
     model_config = {
         "extra": "allow",
+        "populate_by_name": True,  # 允许使用 schema_ 和 schema 两种名称
     }

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/parse.py RENAMED Viewed

@@ -63,17 +63,13 @@ class ParseResponse(BaseModel):
     Attributes:
         elements: 解析后的元素列表（Parse/Chunk/Embed 返回）
-        extract_result: 提取结果（Extract 返回）
+        result: 提取结果（Extract 返回）
         success_count: 成功数量
-        consume_time: 耗时
-        record_id: 记录 ID
     """
     elements: list[Element] = Field(default_factory=list)
-    extract_result: dict[str, Any] | None = None  # Extract API 返回
+    result: dict[str, Any] | None = None  # Extract API 返回
     success_count: int | None = None
-    consume_time: str | None = None
-    record_id: str | None = None
     model_config = {"extra": "allow"}

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6}/xparse_client/models/pipeline.py RENAMED Viewed

@@ -99,11 +99,13 @@ class PipelineConfig(BaseModel):
     Attributes:
         include_intermediate_results: 是否包含中间结果
+        intermediate_results_destination: 中间结果保存目的地（仅支持 LocalDestination）
     """
     include_intermediate_results: bool = False
+    intermediate_results_destination: Any = None  # LocalDestination 对象
-    model_config = {"extra": "allow"}
+    model_config = {"extra": "allow", "arbitrary_types_allowed": True}
 class PipelineResponse(BaseModel):

{xparse_client-0.3.0b1 → xparse_client-0.3.0b6/xparse_client.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.3.0b1
+Version: 0.3.0b6
 Summary: 面向 Agent 和 RAG 的文档处理 Pipeline 客户端
 Author-email: INTSIG-TEXTIN <support@textin.com>
 License-Expression: MIT
@@ -213,17 +213,17 @@ schema = {
         "author": {"type": "string", "description": "作者"},
         "date": {"type": "string", "description": "日期"}
     },
-    "required": ["title"]
+    "required": ["title", "author", "date"]
 }
 with open("document.pdf", "rb") as f:
     result = client.extract.extract(
         file=f,
         filename="document.pdf",
-        config=ExtractConfig(schema=schema)
+        extract_config=ExtractConfig(schema=schema)
     )
-print(result.extracted_data)
+print(result.result)
 ```
 ### 3. 本地批处理