xparse-client 0.2.19__py3-none-any.whl → 0.3.0b8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +188 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +351 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +225 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +132 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b8.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b8.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/top_level.txt +2 -0
  69. xparse_client/pipeline/__init__.py +0 -3
  70. xparse_client/pipeline/config.py +0 -129
  71. xparse_client/pipeline/destinations.py +0 -489
  72. xparse_client/pipeline/pipeline.py +0 -690
  73. xparse_client/pipeline/sources.py +0 -583
  74. xparse_client-0.2.19.dist-info/METADATA +0 -1050
  75. xparse_client-0.2.19.dist-info/RECORD +0 -11
tests/conftest.py ADDED
@@ -0,0 +1,310 @@
1
+ """pytest 配置和共享 fixtures"""
2
+
3
+ import pytest
4
+
5
+
6
@pytest.fixture
def app_id() -> str:
    """Provide the placeholder app_id used by the tests."""
    placeholder = "test-app-id"
    return placeholder
10
+
11
+
12
@pytest.fixture
def secret_code() -> str:
    """Provide the placeholder secret_code used by the tests."""
    placeholder = "test-secret-code"
    return placeholder
16
+
17
+
18
@pytest.fixture
def server_url() -> str:
    """Provide the base URL of the fake server that tests mock against."""
    base_url = "https://api.test.com"
    return base_url
22
+
23
+
24
@pytest.fixture
def sample_pdf_bytes() -> bytes:
    """Provide a few bytes that start with a PDF header (simulated file content)."""
    fake_pdf = b"%PDF-1.4 sample pdf content"
    return fake_pdf
28
+
29
+
30
@pytest.fixture
def sample_response_data() -> dict:
    """Provide a minimal API response payload holding a single text element."""
    text_element = {
        "element_id": "elem_001",
        "type": "text",
        "text": "这是一段测试文本",
        "metadata": {"page_number": 1},
    }
    payload = {
        "elements": [text_element],
        "success_count": 1,
        "record_id": "rec_123",
    }
    return {"code": 200, "data": payload}
48
+
49
+
50
+ # ============================================================================
51
+ # 新增 Fixtures(Phase 1 测试补充)
52
+ # ============================================================================
53
+
54
@pytest.fixture
def parse_success_response() -> dict:
    """Provide a successful Parse API response with one fully populated Header element."""
    # Every timestamp field in this payload carries the same value.
    stamp = "2026-01-27T09:47:51Z"
    data_source = {
        "date_created": stamp,
        "date_modified": stamp,
        "date_processed": stamp,
        "record_locator": {
            "protocol": "file",
            "remote_file_path": "temp/test.pdf",
        },
        "url": "file://temp/test.pdf",
        "version": stamp,
    }
    metadata = {
        "category_depth": -1,
        "coordinates": [0.1277, 0.0909, 0.3025, 0.0909, 0.3025, 0.1171, 0.1277, 0.1176],
        "data_source": data_source,
        "filename": "test.pdf",
        "filetype": "application/pdf",
        "last_modified": stamp,
        "page_height": 1683,
        "page_number": 1,
        "page_width": 1190,
    }
    header_element = {
        "element_id": "d0c8f3bf754cf4b071b62402d6932ddcb10f3c7e",
        "type": "Header",
        "metadata": metadata,
        "text": "测试标题",
    }
    payload = {
        "elements": [header_element],
        "job_id": "5f9f10b8b4c8467882f67ae16bd48746",
        "success_count": 1,
    }
    return {
        "code": 200,
        "data": payload,
        "message": "success",
        "x_request_id": "e4193bf242b8bb095295a6e7df0cacb1",
    }
94
+
95
+
96
@pytest.fixture
def extract_success_response() -> dict:
    """Provide a successful Extract API response with schema values, citations and pages."""
    extracted_name = "陈君曜"
    citation = {
        "bounding_regions": [
            {
                "page_number": 1,
                "position": [175, 86, 245, 86, 245, 112, 175, 112],
                "text": extracted_name,
            }
        ],
        "value": extracted_name,
    }
    page_info = {
        "page_number": 1,
        "image_id": "",
        "height": 566,
        "width": 800,
        "angle": 0,
        "status": "Success",
        "durations": 0,
    }
    extraction = {
        "success_count": 1,
        "extracted_schema": {"姓名": extracted_name},
        "citations": {"姓名": citation},
        "pages": [page_info],
    }
    payload = {
        "job_id": "f76c31eedc124580a6daccba25e8ae27",
        "result": extraction,
        "success_count": 1,
    }
    return {
        "code": 200,
        "data": payload,
        "message": "success",
        "x_request_id": "eaf843de96084fd635b901394e500b40",
    }
137
+
138
+
139
@pytest.fixture
def business_error_response() -> dict:
    """Provide a business-level error payload (body code 400, null data).

    Intended to be served with a successful HTTP status so that only the
    body's ``code`` field signals the failure.
    """
    error_body = {
        "code": 400,
        "message": "参数错误:schema 格式不正确",
        "data": None,
        "x_request_id": "abc123",
    }
    return error_body
148
+
149
+
150
@pytest.fixture
def pipeline_parse_chunk_embed_response() -> dict:
    """Provide a Pipeline API response for parse + chunk + embed, with intermediate results."""
    # Every timestamp field in this payload carries the same value.
    stamp = "2026-01-28T06:44:39Z"
    data_source = {
        "date_created": stamp,
        "date_modified": stamp,
        "date_processed": stamp,
        "record_locator": {
            "protocol": "file",
            "remote_file_path": "temp/test.pdf",
        },
        "url": "file://temp/test.pdf",
        "version": stamp,
    }
    embedded_element = {
        "element_id": "elem_001",
        "type": "CompositeElement",
        "metadata": {
            "coordinates": [0.05, 0.02, 0.94, 0.02, 0.94, 0.21, 0.05, 0.21],
            "data_source": data_source,
            "filename": "test.pdf",
            "filetype": "application/pdf",
            "is_continuation": False,
            "page_height": 821,
            "page_number": 1,
            "page_width": 1270,
        },
        "text": "测试文本内容",
        # Three explicit values padded with 1021 zeros: a 1024-dimension vector.
        "embeddings": [0.01, -0.105, 0.025, *([0.0] * 1021)],
    }
    parse_stage = {
        "stage": "parse",
        "elements": [
            {
                "element_id": "elem_parse_001",
                "type": "Header",
                "text": "原始标题",
                "metadata": {},
            }
        ],
    }
    chunk_stage = {
        "stage": "chunk",
        "elements": [
            {
                "element_id": "elem_001",
                "type": "CompositeElement",
                "text": "测试文本内容",
                "metadata": {},
            }
        ],
    }
    stats = {
        "original_elements": 15,
        "chunked_elements": 3,
        "embedded_elements": 3,
        "stages": [
            {"type": "parse", "config": {"provider": "textin"}},
            {"type": "chunk", "config": {"strategy": "basic"}},
            {"type": "embed", "config": {"provider": "qwen", "model_name": "text-embedding-v4"}},
        ],
        "record_id": "rec_123",
        "success_count": 1,
    }
    payload = {
        "elements": [embedded_element],
        "intermediate_results": [parse_stage, chunk_stage],
        "job_id": "job_123",
        "stats": stats,
    }
    return {
        "code": 200,
        "data": payload,
        "message": "success",
        "x_request_id": "req_123",
    }
225
+
226
+
227
@pytest.fixture
def pipeline_parse_extract_response() -> dict:
    """Provide a Pipeline API response for parse + extract, with intermediate results."""
    buyer = "杭州百世网络技术有限公司"
    citation = {
        "bounding_regions": [
            {
                "page_number": 1,
                "position": [224, 177, 541, 177, 541, 197, 224, 197],
                "text": buyer,
            }
        ],
        "value": buyer,
    }
    extract_result = {
        "success_count": 1,
        "extracted_schema": {"购买方": buyer},
        "citations": {"购买方": citation},
        "pages": [{"page_number": 1, "height": 821, "width": 1270}],
    }
    parse_stage = {
        "stage": "parse",
        "elements": [
            {
                "element_id": "elem_001",
                "type": "Image",
                "text": "",
                "metadata": {},
            }
        ],
    }
    stats = {
        "original_elements": 15,
        "stages": [
            {"type": "parse", "config": {"provider": "textin"}},
            {"type": "extract", "config": {"schema": {}}},
        ],
        "record_id": "rec_456",
        "success_count": 1,
    }
    payload = {
        "extract_result": extract_result,
        "intermediate_results": [parse_stage],
        "job_id": "job_456",
        "stats": stats,
    }
    return {
        "code": 200,
        "data": payload,
        "message": "success",
        "x_request_id": "req_456",
    }
277
+
278
+
279
+ # ============================================================================
280
+ # 可选依赖检查 Helper
281
+ # ============================================================================
282
+
283
+ # 检查可选依赖是否安装
284
# Each flag records whether one optional dependency can be imported.
# The imported names themselves are unused (hence the noqa markers).
try:
    import boto3  # noqa: F401
except ImportError:
    has_boto3 = False
else:
    has_boto3 = True

try:
    import pymilvus  # noqa: F401
except ImportError:
    has_pymilvus = False
else:
    has_pymilvus = True

try:
    import qdrant_client  # noqa: F401
except ImportError:
    has_qdrant = False
else:
    has_qdrant = True

try:
    from smb.SMBConnection import SMBConnection  # noqa: F401
except ImportError:
    has_smb = False
else:
    has_smb = True
tests/unit/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """单元测试模块"""
tests/unit/api/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """API 测试模块"""
tests/unit/api/test_extract.py ADDED
@@ -0,0 +1,232 @@
1
+ """Extract API 测试
2
+
3
+ 测试 Extract API 的所有功能,包括:
4
+ - 基础提取功能
5
+ - parse_config 支持
6
+ - extract_config 参数验证
7
+ - 错误处理
8
+ """
9
+
10
+ import httpx
11
+ import pytest
12
+ import respx
13
+
14
+ from xparse_client import XParseClient
15
+ from xparse_client.exceptions import ServerError, ValidationError
16
+ from xparse_client.models import ExtractConfig, ParseConfig
17
+
18
+ # ============================================================================
19
+ # 基础功能测试
20
+ # ============================================================================
21
+
22
+
23
def test_extract_api_exists():
    """Verify that a client exposes a non-None ``extract`` attribute."""
    xparse = XParseClient(app_id="test", secret_code="test")

    assert hasattr(xparse, "extract")
    assert xparse.extract is not None
29
+
30
+
31
@respx.mock
def test_extract_basic(server_url, extract_success_response):
    """Verify a basic extract call returns the job result and sends extract_config."""
    # Stub the synchronous extract endpoint with the canned success payload.
    endpoint = f"{server_url}/api/xparse/extract/sync"
    canned_reply = httpx.Response(200, json=extract_success_response)
    respx.post(endpoint).mock(return_value=canned_reply)

    xparse = XParseClient(app_id="test", secret_code="test", server_url=server_url)

    # Schema requesting a single required string field.
    schema = {
        "type": "object",
        "properties": {
            "姓名": {"type": "string"}
        },
        "required": ["姓名"],
    }
    config = ExtractConfig(schema_=schema)

    outcome = xparse.extract.extract(
        file=b"test pdf content",
        filename="test.pdf",
        extract_config=config,
    )

    # The parsed result mirrors the mocked payload.
    assert outcome is not None
    assert outcome.job_id == "f76c31eedc124580a6daccba25e8ae27"
    assert outcome.result is not None
    assert "extracted_schema" in outcome.result

    # The outgoing request body must mention extract_config.
    sent = respx.calls.last.request
    assert b"extract_config" in sent.content
68
+
69
+
70
@respx.mock
def test_extract_with_parse_config(server_url, extract_success_response):
    """Verify that an explicit parse_config is forwarded in the request body."""
    # Stub the synchronous extract endpoint with the canned success payload.
    endpoint = f"{server_url}/api/xparse/extract/sync"
    canned_reply = httpx.Response(200, json=extract_success_response)
    respx.post(endpoint).mock(return_value=canned_reply)

    xparse = XParseClient(app_id="test", secret_code="test", server_url=server_url)

    parser_settings = ParseConfig(provider="mineru")
    schema = {
        "type": "object",
        "properties": {"姓名": {"type": "string"}},
    }
    extractor_settings = ExtractConfig(schema_=schema)

    outcome = xparse.extract.extract(
        file=b"test",
        filename="test.pdf",
        parse_config=parser_settings,
        extract_config=extractor_settings,
    )

    assert outcome is not None

    # Both the parse_config field and the chosen provider must be sent.
    sent = respx.calls.last.request
    assert b"parse_config" in sent.content
    assert b"mineru" in sent.content
105
+
106
+
107
def test_extract_missing_extract_config():
    """Verify that passing ``extract_config=None`` raises ValueError."""
    xparse = XParseClient(app_id="test", secret_code="test")
    call_kwargs = {
        "file": b"test",
        "filename": "test.pdf",
        "extract_config": None,
    }

    # Passing None for extract_config must raise ValueError.
    with pytest.raises(ValueError) as raised:
        xparse.extract.extract(**call_kwargs)

    assert "extract_config is required" in str(raised.value)
120
+
121
+
122
+ # ============================================================================
123
+ # 错误处理测试
124
+ # ============================================================================
125
+
126
+
127
@respx.mock
def test_extract_invalid_schema_error(server_url, business_error_response):
    """Verify that a business code 400 inside an HTTP 200 reply raises ValidationError."""
    # The transport succeeds (HTTP 200); only the body signals the error.
    endpoint = f"{server_url}/api/xparse/extract/sync"
    error_reply = httpx.Response(200, json=business_error_response)
    respx.post(endpoint).mock(return_value=error_reply)

    xparse = XParseClient(app_id="test", secret_code="test", server_url=server_url)
    bad_config = ExtractConfig(schema_={"invalid": "schema"})

    with pytest.raises(ValidationError) as raised:
        xparse.extract.extract(
            file=b"test",
            filename="test.pdf",
            extract_config=bad_config,
        )

    # The server-provided message must surface in the exception text.
    assert "参数错误" in str(raised.value)
147
+
148
+
149
@respx.mock
def test_extract_server_error(server_url):
    """Verify that an HTTP 500 reply raises ServerError."""
    # Stub the endpoint so it answers with HTTP 500 and an error message.
    endpoint = f"{server_url}/api/xparse/extract/sync"
    failure_reply = httpx.Response(500, json={"message": "服务器内部错误"})
    respx.post(endpoint).mock(return_value=failure_reply)

    xparse = XParseClient(app_id="test", secret_code="test", server_url=server_url)
    schema = {
        "type": "object",
        "properties": {"name": {"type": "string"}},
    }
    config = ExtractConfig(schema_=schema)

    with pytest.raises(ServerError) as raised:
        xparse.extract.extract(
            file=b"test",
            filename="test.pdf",
            extract_config=config,
        )

    assert "服务器" in str(raised.value)
174
+
175
+
176
+ # ============================================================================
177
+ # ExtractConfig 参数测试
178
+ # ============================================================================
179
+
180
+
181
@respx.mock
def test_extract_with_generate_citations(server_url, extract_success_response):
    """Verify that the generate_citations option appears in the request body."""
    endpoint = f"{server_url}/api/xparse/extract/sync"
    canned_reply = httpx.Response(200, json=extract_success_response)
    respx.post(endpoint).mock(return_value=canned_reply)

    xparse = XParseClient(app_id="test", secret_code="test", server_url=server_url)
    schema = {"type": "object", "properties": {"姓名": {"type": "string"}}}
    config = ExtractConfig(schema_=schema, generate_citations=True)

    outcome = xparse.extract.extract(
        file=b"test",
        filename="test.pdf",
        extract_config=config,
    )

    # The option name must be present in what was sent.
    sent = respx.calls.last.request
    assert b"generate_citations" in sent.content

    assert outcome is not None
206
+
207
+
208
@respx.mock
def test_extract_with_stamp(server_url, extract_success_response):
    """Verify that the stamp (seal extraction) option appears in the request body."""
    endpoint = f"{server_url}/api/xparse/extract/sync"
    canned_reply = httpx.Response(200, json=extract_success_response)
    respx.post(endpoint).mock(return_value=canned_reply)

    xparse = XParseClient(app_id="test", secret_code="test", server_url=server_url)
    schema = {"type": "object", "properties": {"公司名称": {"type": "string"}}}
    config = ExtractConfig(schema_=schema, stamp=True)

    outcome = xparse.extract.extract(
        file=b"test invoice",
        filename="invoice.pdf",
        extract_config=config,
    )

    # The option name must be present in what was sent.
    sent = respx.calls.last.request
    assert b"stamp" in sent.content

    assert outcome is not None