xparse-client 0.2.11__py3-none-any.whl → 0.3.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +215 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b3.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +1 -0
  69. example/run_pipeline.py +0 -506
  70. example/run_pipeline_test.py +0 -458
  71. xparse_client/pipeline/__init__.py +0 -3
  72. xparse_client/pipeline/config.py +0 -129
  73. xparse_client/pipeline/destinations.py +0 -487
  74. xparse_client/pipeline/pipeline.py +0 -622
  75. xparse_client/pipeline/sources.py +0 -585
  76. xparse_client-0.2.11.dist-info/METADATA +0 -1050
  77. xparse_client-0.2.11.dist-info/RECORD +0 -13
@@ -0,0 +1,39 @@
+"""Chunk API data models"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class ChunkConfig(BaseModel):
+    """Chunking configuration
+
+    Attributes:
+        strategy: Chunking strategy
+        include_orig_elements: Whether to include the original elements
+        new_after_n_chars: Number of characters after which a new chunk is started
+        max_characters: Maximum number of characters per chunk
+        overlap: Number of overlapping characters
+        overlap_all: Whether all chunks overlap
+
+    Example:
+        >>> config = ChunkConfig(
+        ...     strategy="basic",
+        ...     max_characters=1024,
+        ...     overlap=50
+        ... )
+    """
+
+    strategy: Literal["basic", "by_title", "by_page"] = "basic"
+    include_orig_elements: bool = False
+    new_after_n_chars: int = Field(default=512, ge=1)
+    max_characters: int = Field(default=1024, ge=1)
+    overlap: int = Field(default=0, ge=0)
+    overlap_all: bool = False
+
+    model_config = {"extra": "allow"}
+
+
+__all__ = ["ChunkConfig"]
@@ -0,0 +1,62 @@
+"""Embed API data models"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, field_validator
+
+
+class EmbedConfig(BaseModel):
+    """Embedding configuration
+
+    Attributes:
+        provider: Embedding engine provider
+        model_name: Model name
+
+    Example:
+        >>> config = EmbedConfig(
+        ...     provider="qwen",
+        ...     model_name="text-embedding-v3"
+        ... )
+    """
+
+    provider: Literal["qwen", "doubao"] = "qwen"
+    model_name: Literal[
+        "text-embedding-v3",
+        "text-embedding-v4",
+        "doubao-embedding-large-text-250515",
+        "doubao-embedding-text-240715",
+    ] = "text-embedding-v3"
+
+    model_config = {"extra": "allow"}
+
+    @field_validator("model_name")
+    @classmethod
+    def validate_model_for_provider(cls, v: str, info) -> str:
+        """Validate that model_name matches the provider"""
+        provider = info.data.get("provider", "qwen")
+
+        provider_models = {
+            "qwen": ["text-embedding-v3", "text-embedding-v4"],
+            "doubao": [
+                "doubao-embedding-large-text-250515",
+                "doubao-embedding-text-240715",
+            ],
+        }
+
+        if provider not in provider_models:
+            raise ValueError(
+                f"Unsupported provider: {provider}; supported providers: {list(provider_models.keys())}"
+            )
+
+        if v not in provider_models[provider]:
+            raise ValueError(
+                f"Provider '{provider}' does not support model '{v}'; "
+                f"supported models: {provider_models[provider]}"
+            )
+
+        return v
+
+
+__all__ = ["EmbedConfig"]
@@ -0,0 +1,41 @@
+"""Extract API data models"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class ExtractConfig(BaseModel):
+    """Extraction configuration
+
+    Attributes:
+        schema: JSON Schema describing the structure to extract
+        generate_citations: Whether to generate citations
+        stamp: Whether to add a timestamp
+
+    Example:
+        >>> schema = {
+        ...     "type": "object",
+        ...     "properties": {
+        ...         "invoice_number": {"type": "string"},
+        ...         "total_amount": {"type": "number"}
+        ...     }
+        ... }
+        >>> config = ExtractConfig(schema=schema)
+    """
+
+    schema: dict[str, Any] = Field(
+        default_factory=dict,
+        description="JSON Schema describing the structure to extract"
+    )
+    generate_citations: bool = False
+    stamp: bool = False
+
+    model_config = {
+        "extra": "allow",
+    }
+
+
+__all__ = ["ExtractConfig"]
@@ -0,0 +1,38 @@
+"""Local API data models
+
+Request/response models for the local batch-processing workflow.
+"""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class FailedFile:
+    """Information about a failed file
+
+    Attributes:
+        file_path: File path
+        error: Error message
+        retry_count: Number of retries
+    """
+    file_path: str
+    error: str
+    retry_count: int
+
+
+@dataclass
+class WorkflowResult:
+    """Workflow execution result
+
+    Attributes:
+        total: Total number of files
+        success: Number of successes
+        failed: Number of failures
+        failed_files: List of failed files
+        duration: Total duration (seconds)
+    """
+    total: int
+    success: int
+    failed: int
+    failed_files: list[FailedFile]
+    duration: float
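
A sketch of how a local batch run might be summarized with these dataclasses; the tallies and surrounding runner are assumptions, only FailedFile and WorkflowResult come from this file:

```python
from xparse_client.models.local import FailedFile, WorkflowResult

# Illustrative tallies from a hypothetical local batch run.
failures = [FailedFile(file_path="docs/a.pdf", error="timeout", retry_count=3)]
result = WorkflowResult(
    total=10,
    success=10 - len(failures),
    failed=len(failures),
    failed_files=failures,
    duration=42.5,
)
print(f"{result.success}/{result.total} succeeded in {result.duration:.1f}s")
```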
@@ -0,0 +1,136 @@
+"""Parse API data models"""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, Field
+
+
+class ParseConfig(BaseModel):
+    """Parsing configuration
+
+    Attributes:
+        provider: Parsing engine provider
+
+    Example:
+        >>> config = ParseConfig(provider="textin")
+    """
+
+    provider: Literal["textin", "mineru", "paddle", "textin-lite"] = "textin"
+
+    model_config = {"extra": "allow"}
+
+
+class ElementMetadata(BaseModel):
+    """Element metadata
+
+    Attributes:
+        page_number: Page number
+        coordinates: Coordinate information
+    """
+
+    page_number: int | None = None
+    coordinates: list[float] | None = None
+
+    model_config = {"extra": "allow"}
+
+
+class Element(BaseModel):
+    """Parsed document element
+
+    Attributes:
+        element_id: Unique element identifier
+        type: Element type (e.g. text, title, table, image)
+        text: Element text content
+        metadata: Element metadata
+        embeddings: Vector embeddings (if the embed stage was run)
+    """
+
+    element_id: str
+    type: str
+    text: str = ""
+    metadata: ElementMetadata | None = None
+    embeddings: list[float] | None = None
+
+    model_config = {"extra": "allow"}
+
+
+class ParseResponse(BaseModel):
+    """Unified response model
+
+    Used for Parse, Extract, Pipeline, and similar API responses.
+
+    Attributes:
+        elements: List of parsed elements (returned by Parse/Chunk/Embed)
+        extract_result: Extraction result (returned by Extract)
+        success_count: Number of successes
+        consume_time: Time consumed
+        record_id: Record ID
+    """
+
+    elements: list[Element] = Field(default_factory=list)
+    extract_result: dict[str, Any] | None = None  # Returned by the Extract API
+    success_count: int | None = None
+    consume_time: str | None = None
+    record_id: str | None = None
+
+    model_config = {"extra": "allow"}
+
+
+class AsyncJobResponse(BaseModel):
+    """Async job creation response
+
+    Attributes:
+        job_id: Job ID
+    """
+
+    job_id: str
+
+
+class JobStatusResponse(BaseModel):
+    """Async job status response
+
+    The async status endpoint only returns the job status and a result URL; it does not return the parsed result.
+    To obtain the parsed result, download the content at result_url separately.
+
+    Attributes:
+        job_id: Job ID
+        file_id: File ID
+        status: Job status ("scheduled", "in_progress", "completed", "failed")
+        result_url: Result file URL (returned when the job completes; must be downloaded separately)
+        error_message: Error message (returned when the job fails)
+    """
+
+    job_id: str
+    file_id: str | None = None
+    status: str  # "scheduled", "in_progress", "completed", "failed"
+    result_url: str | None = None
+    error_message: str | None = None
+
+    model_config = {"extra": "allow"}
+
+    @property
+    def is_completed(self) -> bool:
+        """Whether the job has completed"""
+        return self.status == "completed"
+
+    @property
+    def is_failed(self) -> bool:
+        """Whether the job has failed"""
+        return self.status == "failed"
+
+    @property
+    def is_running(self) -> bool:
+        """Whether the job is still running"""
+        return self.status in ("scheduled", "in_progress")
+
+
+__all__ = [
+    "ParseConfig",
+    "ElementMetadata",
+    "Element",
+    "ParseResponse",
+    "AsyncJobResponse",
+    "JobStatusResponse",
+]
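
The JobStatusResponse properties above are what a polling loop keys on. A sketch, assuming a `get_status` callable that returns a JobStatusResponse for a job ID (that callable is a placeholder, not part of this module); on completion the parsed output still has to be fetched from result_url separately:

```python
import time

from xparse_client.models.parse import JobStatusResponse


def wait_for_job(get_status, job_id: str, interval: float = 2.0) -> JobStatusResponse:
    """Poll `get_status` (a placeholder callable) until the job stops running."""
    while True:
        status: JobStatusResponse = get_status(job_id)
        if status.is_completed or status.is_failed:
            return status
        time.sleep(interval)
```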
@@ -0,0 +1,134 @@
+"""Pipeline API data models"""
+
+from __future__ import annotations
+
+from typing import Any, Literal, Union
+
+from pydantic import BaseModel, Field
+
+from .chunk import ChunkConfig
+from .embed import EmbedConfig
+from .extract import ExtractConfig
+from .parse import Element, ParseConfig
+
+
+class ParseStage(BaseModel):
+    """Parse stage configuration
+
+    Example:
+        >>> stage = ParseStage(
+        ...     type="parse",
+        ...     config=ParseConfig(provider="textin")
+        ... )
+    """
+
+    type: Literal["parse"] = "parse"
+    config: ParseConfig = Field(default_factory=ParseConfig)
+
+
+class ChunkStage(BaseModel):
+    """Chunk stage configuration
+
+    Example:
+        >>> stage = ChunkStage(
+        ...     type="chunk",
+        ...     config=ChunkConfig(strategy="basic")
+        ... )
+    """
+
+    type: Literal["chunk"] = "chunk"
+    config: ChunkConfig = Field(default_factory=ChunkConfig)
+
+
+class EmbedStage(BaseModel):
+    """Embed stage configuration
+
+    Example:
+        >>> stage = EmbedStage(
+        ...     type="embed",
+        ...     config=EmbedConfig(provider="qwen")
+        ... )
+    """
+
+    type: Literal["embed"] = "embed"
+    config: EmbedConfig = Field(default_factory=EmbedConfig)
+
+
+class ExtractStage(BaseModel):
+    """Extract stage configuration
+
+    Example:
+        >>> stage = ExtractStage(
+        ...     type="extract",
+        ...     config=ExtractConfig(schema={"type": "object"})
+        ... )
+    """
+
+    type: Literal["extract"] = "extract"
+    config: ExtractConfig
+
+
+# PipelineStage is the Union of all stage types
+PipelineStage = Union[ParseStage, ChunkStage, EmbedStage, ExtractStage]
+
+
+class PipelineStats(BaseModel):
+    """Pipeline statistics
+
+    Attributes:
+        success_count: Number of successes
+        total_time: Total time
+        original_elements: Number of original elements
+        chunked_elements: Number of elements after chunking
+        embedded_elements: Number of elements after embedding
+        record_id: Record ID
+    """
+
+    success_count: int | None = None
+    total_time: str | None = None
+    original_elements: int | None = None
+    chunked_elements: int | None = None
+    embedded_elements: int | None = None
+    record_id: str | None = None
+
+    model_config = {"extra": "allow"}
+
+
+class PipelineConfig(BaseModel):
+    """Global pipeline configuration
+
+    Attributes:
+        include_intermediate_results: Whether to include intermediate results
+        intermediate_results_destination: Destination where intermediate results are saved (only LocalDestination is supported)
+    """
+
+    include_intermediate_results: bool = False
+    intermediate_results_destination: Any = None  # A LocalDestination object
+
+    model_config = {"extra": "allow", "arbitrary_types_allowed": True}
+
+
+class PipelineResponse(BaseModel):
+    """Pipeline execution response
+
+    Attributes:
+        elements: List of processed elements
+        stats: Statistics
+        extract_result: Result of the extract stage (if any)
+        intermediate_results: Intermediate results (if requested)
+    """
+
+    elements: list[Element] = Field(default_factory=list)
+    stats: PipelineStats | None = None
+    extract_result: dict[str, Any] | None = None
+    intermediate_results: list[dict[str, Any]] | None = None
+
+    model_config = {"extra": "allow"}
+
+
+__all__ = [
+    "PipelineStage",
+    "PipelineStats",
+    "PipelineConfig",
+    "PipelineResponse",
+]
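
A sketch of assembling the stage union above into a parse → chunk → embed pipeline; serializing with `model_dump()` keeps each stage's literal type tag, which is how the union members are told apart:

```python
from xparse_client.models.chunk import ChunkConfig
from xparse_client.models.embed import EmbedConfig
from xparse_client.models.parse import ParseConfig
from xparse_client.models.pipeline import ChunkStage, EmbedStage, ParseStage, PipelineStage

# A parse -> chunk -> embed pipeline expressed with the stage models.
stages: list[PipelineStage] = [
    ParseStage(config=ParseConfig(provider="textin")),
    ChunkStage(config=ChunkConfig(strategy="by_title", max_characters=1024)),
    EmbedStage(config=EmbedConfig(provider="qwen", model_name="text-embedding-v3")),
]

# Each dump carries its literal "type" tag ("parse", "chunk", "embed").
payload = [stage.model_dump() for stage in stages]
```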
@@ -0,0 +1,74 @@
+"""Workflows API data models
+
+Request/response models for remote workflow management.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+
+from pydantic import BaseModel
+
+from .pipeline import PipelineStage
+
+
+class WorkflowState(str, Enum):
+    """Workflow state"""
+    ACTIVE = "active"
+    PAUSED = "paused"
+    ARCHIVED = "archived"
+
+
+class Schedule(BaseModel):
+    """Schedule configuration
+
+    Attributes:
+        cron: Cron expression
+
+    Example:
+        >>> schedule = Schedule(cron="0 0 * * *")  # run every day at midnight
+    """
+    cron: str
+
+
+class WorkflowInformation(BaseModel):
+    """Workflow information
+
+    Attributes:
+        workflow_id: Workflow ID
+        name: Workflow name
+        source_id: Remote source ID
+        destination_id: Remote destination ID
+        stages: List of processing stages
+        schedule: Schedule configuration (optional)
+        state: Workflow state
+        created_at: Creation time
+        updated_at: Last update time
+
+    Example:
+        >>> workflow = WorkflowInformation(
+        ...     workflow_id="wf_123",
+        ...     name="daily-processing",
+        ...     source_id="src_456",
+        ...     destination_id="dst_789",
+        ...     stages=[ParseStage(config=ParseConfig())],
+        ...     schedule=Schedule(cron="0 0 * * *"),
+        ...     state=WorkflowState.ACTIVE,
+        ...     created_at="2026-01-27T10:00:00Z",
+        ...     updated_at="2026-01-27T10:00:00Z"
+        ... )
+    """
+    workflow_id: str
+    name: str
+    source_id: str
+    destination_id: str
+    stages: list[PipelineStage]
+    schedule: Schedule | None = None
+    state: WorkflowState
+    created_at: str
+    updated_at: str
+
+    model_config = {"extra": "allow"}
+
+
+__all__ = ["WorkflowInformation", "WorkflowState", "Schedule"]