xparse-client 0.2.20__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +110 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +185 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +132 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +132 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b1.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b1.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/top_level.txt +2 -0
  69. xparse_client/pipeline/__init__.py +0 -3
  70. xparse_client/pipeline/config.py +0 -163
  71. xparse_client/pipeline/destinations.py +0 -489
  72. xparse_client/pipeline/pipeline.py +0 -860
  73. xparse_client/pipeline/sources.py +0 -583
  74. xparse_client-0.2.20.dist-info/METADATA +0 -1050
  75. xparse_client-0.2.20.dist-info/RECORD +0 -11
@@ -0,0 +1,525 @@
1
+ """FtpSource 测试
2
+
3
+ Tests for the FTP source connector. They run in mock mode, so no real FTP server is required.
4
+
5
+ How to run:
6
+ pytest tests/unit/connectors/test_ftp.py -v
7
+ """
8
+
9
+ from ftplib import error_perm
10
+ from unittest.mock import MagicMock, patch
11
+
12
+ import pytest
13
+
14
+ from xparse_client.connectors import FtpSource
15
+ from xparse_client.exceptions import SourceError
16
+
17
+ # ============================================================================
18
+ # Mock FTP tests
19
+ # ============================================================================
20
+
21
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_initialization(mock_ftp_class):
    """FtpSource keeps its settings and connects/logs in on construction."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    settings = {
        "host": "ftp.example.com",
        "port": 21,
        "username": "testuser",
        "password": "testpass",
    }
    source = FtpSource(**settings)

    # Every explicitly supplied setting is exposed as an attribute.
    for attr_name, expected in settings.items():
        assert getattr(source, attr_name) == expected
    # Options that were not supplied keep their defaults.
    assert source.pattern is None
    assert source.recursive is False

    # The FTP client is built without arguments, then connected and logged in.
    mock_ftp_class.assert_called_once_with()
    fake_ftp.connect.assert_called_once_with("ftp.example.com", 21)
    fake_ftp.login.assert_called_once_with("testuser", "testpass")
45
+
46
+
47
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_list_files_basic(mock_ftp_class):
    """Non-recursive listing returns plain files only."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    # The mocked server reports "/" as the current directory.
    fake_ftp.pwd.return_value = "/"

    # MLSD output: two files, one sub-directory and the "." / ".." entries.
    fake_ftp.mlsd.return_value = [
        ("file1.pdf", {"type": "file"}),
        ("file2.docx", {"type": "file"}),
        ("subdir", {"type": "dir"}),
        (".", {"type": "dir"}),
        ("..", {"type": "dir"}),
    ]

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
        recursive=False,
    )
    listed = source.list_files()

    # Only the two files are reported; directories and "." / ".." are dropped.
    assert len(listed) == 2
    for expected_name in ("file1.pdf", "file2.docx"):
        assert expected_name in listed
    assert "subdir" not in listed
80
+
81
+
82
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_list_files_with_pattern(mock_ftp_class):
    """Listing with a pattern keeps only the matching files."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    fake_ftp.pwd.return_value = "/"
    fake_ftp.mlsd.return_value = [
        ("file1.pdf", {"type": "file"}),
        ("file2.docx", {"type": "file"}),
        ("file3.txt", {"type": "file"}),
    ]

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
        pattern=["*.pdf"],
    )
    matched = source.list_files()

    # Of the three files offered, only the PDF passes the "*.pdf" pattern.
    assert len(matched) == 1
    assert "file1.pdf" in matched
    assert "file2.docx" not in matched
108
+
109
+
110
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_list_files_recursive(mock_ftp_class):
    """Recursive listing also returns files found in sub-directories."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    # Scripted answers for successive pwd() calls.
    fake_ftp.pwd.side_effect = ["/", "/", "/subdir", "/subdir", "/"]

    mlsd_calls = 0

    def fake_mlsd():
        # The first call lists the root; every later call lists "subdir".
        nonlocal mlsd_calls
        mlsd_calls += 1
        if mlsd_calls > 1:
            return [
                ("file2.pdf", {"type": "file"}),
            ]
        return [
            ("file1.pdf", {"type": "file"}),
            ("subdir", {"type": "dir"}),
        ]

    fake_ftp.mlsd.side_effect = fake_mlsd

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
        recursive=True,
    )
    listed = source.list_files()

    # Files from both directory levels are reported, the nested one with its path.
    assert len(listed) == 2
    assert "file1.pdf" in listed
    assert "subdir/file2.pdf" in listed
151
+
152
+
153
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_list_files_fallback_list(mock_ftp_class):
    """list_files still works via the LIST command when mlsd raises."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    fake_ftp.pwd.return_value = "/"
    # Make mlsd unusable so the LIST path has to be taken.
    fake_ftp.mlsd.side_effect = Exception("mlsd not supported")

    list_output = (
        "-rw-r--r-- 1 user group 1024 Jan 01 12:00 file1.pdf",
        "-rw-r--r-- 1 user group 2048 Jan 02 13:00 file2.docx",
        "drwxr-xr-x 2 user group 4096 Jan 03 14:00 subdir",
    )

    def fake_retrlines(cmd, callback):
        # Feed LIST-style lines to the callback; any other command yields nothing.
        if cmd != "LIST":
            return
        for line in list_output:
            callback(line)

    fake_ftp.retrlines.side_effect = fake_retrlines

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )
    listed = source.list_files()

    # The two regular files are returned; the directory line is excluded.
    assert len(listed) == 2
    assert "file1.pdf" in listed
    assert "file2.docx" in listed
    assert "subdir" not in listed
185
+
186
+
187
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_read_file(mock_ftp_class):
    """read_file returns the file bytes together with data-source metadata."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    payload = b'FTP file content'

    def fake_retrbinary(cmd, callback):
        # Deliver the whole file to the callback as a single chunk.
        callback(payload)

    fake_ftp.retrbinary.side_effect = fake_retrbinary

    # Canned sendcmd reply carrying a modification timestamp.
    fake_ftp.sendcmd.return_value = '213 20240128120000'

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )
    content, meta = source.read_file('test.pdf')

    assert content == payload
    assert meta['url'] == 'ftp://ftp.example.com:21/test.pdf'
    locator = meta['record_locator']
    assert locator['protocol'] == 'ftp'
    assert locator['server'] == 'ftp.example.com:21'
    for required_key in ('version', 'date_modified'):
        assert required_key in meta
219
+
220
+
221
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_read_file_no_mdtm(mock_ftp_class):
    """read_file still succeeds when the server rejects the MDTM query."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    payload = b'Content'

    def fake_retrbinary(cmd, callback):
        # Deliver the whole file to the callback as a single chunk.
        callback(payload)

    fake_ftp.retrbinary.side_effect = fake_retrbinary
    # Every sendcmd call fails, simulating a server without MDTM support.
    fake_ftp.sendcmd.side_effect = Exception("MDTM not supported")

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )
    content, meta = source.read_file('test.pdf')

    assert content == payload
    # Even without a modification time, a version value must be present (not None).
    assert meta['version'] is not None
248
+
249
+
250
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_read_file_not_found(mock_ftp_class):
    """Reading a missing file surfaces as SourceError."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    # The download itself fails with a permanent FTP error.
    fake_ftp.retrbinary.side_effect = error_perm("550 File not found")

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )

    with pytest.raises(SourceError) as raised:
        source.read_file('nonexistent.pdf')

    message = str(raised.value)
    assert "读取 FTP 文件失败" in message
270
+
271
+
272
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_connection_error(mock_ftp_class):
    """A failing connect() during construction is reported as SourceError."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    # Simulate an unreachable server.
    fake_ftp.connect.side_effect = OSError("Connection refused")

    connection_args = {
        "host": "invalid-host",
        "port": 21,
        "username": "test",
        "password": "test",
    }
    with pytest.raises(SourceError) as raised:
        FtpSource(**connection_args)

    message = str(raised.value)
    assert "FTP 连接失败" in message
290
+
291
+
292
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_authentication_error(mock_ftp_class):
    """A rejected login during construction is reported as SourceError."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    # Simulate the server refusing the credentials.
    fake_ftp.login.side_effect = error_perm("530 Login incorrect")

    bad_credentials = {
        "host": "ftp.example.com",
        "port": 21,
        "username": "wrong",
        "password": "wrong",
    }
    with pytest.raises(SourceError) as raised:
        FtpSource(**bad_credentials)

    message = str(raised.value)
    assert "FTP 连接失败" in message
310
+
311
+
312
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_context_manager(mock_ftp_class):
    """Leaving the `with` block shuts the FTP connection down."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    managed = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )
    with managed as source:
        assert source is not None

    # On exit the client must have been told to quit exactly once.
    fake_ftp.quit.assert_called_once()
328
+
329
+
330
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_close(mock_ftp_class):
    """An explicit close() sends quit to the FTP client."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    connection_args = {
        "host": "ftp.example.com",
        "port": 21,
        "username": "test",
        "password": "test",
    }
    FtpSource(**connection_args).close()

    # quit must have been issued exactly once.
    fake_ftp.quit.assert_called_once()
347
+
348
+
349
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_close_with_error(mock_ftp_class):
    """close() tolerates a failing quit and falls back to the client's close()."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    # Arrange for the polite shutdown to blow up.
    fake_ftp.quit.side_effect = Exception("quit failed")

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )

    # Must not propagate the quit failure.
    source.close()

    # The fallback close() on the client must have been used exactly once.
    fake_ftp.close.assert_called_once()
369
+
370
+
371
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_custom_pattern_and_recursive(mock_ftp_class):
    """Custom pattern and recursive options are stored on the source."""
    mock_ftp_class.return_value = MagicMock()

    wanted_patterns = ["*.pdf", "*.docx"]
    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
        pattern=wanted_patterns,
        recursive=True,
    )

    assert source.pattern == ["*.pdf", "*.docx"]
    assert source.recursive is True
388
+
389
+
390
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_repr(mock_ftp_class):
    """repr() shows the host and port of the source."""
    mock_ftp_class.return_value = MagicMock()

    connection_args = {
        "host": "ftp.example.com",
        "port": 21,
        "username": "test",
        "password": "test",
    }
    rendered = repr(FtpSource(**connection_args))

    assert rendered == "<FtpSource host=ftp.example.com:21>"
404
+
405
+
406
+ # ============================================================================
407
+ # MLSD / LIST / NLST fallback tests
408
+ # ============================================================================
409
+
410
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_get_dir_items_mlsd_to_list_fallback(mock_ftp_class):
    """_get_dir_items uses LIST output when MLSD raises."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )

    # MLSD is unavailable on this mocked server.
    fake_ftp.mlsd.side_effect = Exception("MLSD not supported")

    list_output = (
        "drwxr-xr-x 2 user group 4096 Jan 28 12:00 dir1",
        "-rw-r--r-- 1 user group 1024 Jan 28 12:00 file1.txt",
    )

    def fake_retrlines(cmd, callback):
        # Replay canned LIST lines through the caller's callback.
        for line in list_output:
            callback(line)

    fake_ftp.retrlines.side_effect = fake_retrlines

    entries = source._get_dir_items()

    # One (name, flag) pair per line: True for the directory, False for the file.
    assert len(entries) == 2
    assert ("dir1", True) in entries
    assert ("file1.txt", False) in entries
440
+
441
+
442
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_get_dir_items_list_to_nlst_fallback(mock_ftp_class):
    """_get_dir_items uses NLST when both MLSD and LIST raise."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )

    # Neither MLSD nor LIST is usable on this mocked server.
    fake_ftp.mlsd.side_effect = Exception("MLSD not supported")
    fake_ftp.retrlines.side_effect = Exception("LIST not supported")

    # NLST answers with bare names, including the "." / ".." entries.
    fake_ftp.nlst.return_value = ["dir1", "file1.txt", ".", ".."]

    def fake_cwd(path):
        # Entering "dir1" or going back via ".." succeeds; anything else
        # is rejected, which marks it as not being a directory.
        if path in ("dir1", ".."):
            return
        raise Exception("Not a directory")

    fake_ftp.cwd.side_effect = fake_cwd

    entries = source._get_dir_items()

    # "." and ".." are filtered out; the rest are (name, flag) pairs.
    assert len(entries) == 2
    assert ("dir1", True) in entries
    assert ("file1.txt", False) in entries
479
+
480
+
481
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_get_dir_items_all_methods_fail(mock_ftp_class):
    """_get_dir_items returns an empty list when MLSD, LIST and NLST all raise."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )

    # Break every listing command the client offers.
    fake_ftp.mlsd.side_effect = Exception("MLSD not supported")
    fake_ftp.retrlines.side_effect = Exception("LIST not supported")
    fake_ftp.nlst.side_effect = Exception("NLST not supported")

    entries = source._get_dir_items()

    # With nothing left to try, the result is an empty list.
    assert entries == []
503
+
504
+
505
@patch('xparse_client.connectors.sources.ftp.ftplib.FTP')
def test_ftp_source_close_quit_error(mock_ftp_class):
    """close() swallows an error raised by quit, after attempting it once."""
    fake_ftp = mock_ftp_class.return_value = MagicMock()

    source = FtpSource(
        host="ftp.example.com",
        port=21,
        username="test",
        password="test",
    )

    # Arrange for quit to fail only after the source has been constructed.
    fake_ftp.quit.side_effect = Exception("Quit error")

    # The failure must be caught inside close().
    source.close()

    # quit was still attempted exactly once.
    fake_ftp.quit.assert_called_once()