PyPI - xparse-client - Versions diffs - 0.2.20__py3-none-any.whl → 0.3.0b1__py3-none-any.whl - Mend

xparse-client 0.2.20__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

example/1_basic_api_usage.py +198 -0
example/2_async_job.py +210 -0
example/3_local_workflow.py +300 -0
example/4_advanced_workflow.py +327 -0
example/README.md +128 -0
example/config_example.json +95 -0
tests/conftest.py +310 -0
tests/unit/__init__.py +1 -0
tests/unit/api/__init__.py +1 -0
tests/unit/api/test_extract.py +232 -0
tests/unit/api/test_local.py +231 -0
tests/unit/api/test_parse.py +374 -0
tests/unit/api/test_pipeline.py +369 -0
tests/unit/api/test_workflows.py +108 -0
tests/unit/connectors/test_ftp.py +525 -0
tests/unit/connectors/test_local_connectors.py +324 -0
tests/unit/connectors/test_milvus.py +368 -0
tests/unit/connectors/test_qdrant.py +399 -0
tests/unit/connectors/test_s3.py +598 -0
tests/unit/connectors/test_smb.py +442 -0
tests/unit/connectors/test_utils.py +335 -0
tests/unit/models/test_local.py +54 -0
tests/unit/models/test_pipeline_stages.py +144 -0
tests/unit/models/test_workflows.py +55 -0
tests/unit/test_base.py +437 -0
tests/unit/test_client.py +110 -0
tests/unit/test_config.py +160 -0
tests/unit/test_exceptions.py +182 -0
tests/unit/test_http.py +562 -0
xparse_client/__init__.py +110 -20
xparse_client/_base.py +179 -0
xparse_client/_client.py +218 -0
xparse_client/_config.py +221 -0
xparse_client/_http.py +350 -0
xparse_client/api/__init__.py +14 -0
xparse_client/api/extract.py +109 -0
xparse_client/api/local.py +185 -0
xparse_client/api/parse.py +209 -0
xparse_client/api/pipeline.py +132 -0
xparse_client/api/workflows.py +204 -0
xparse_client/connectors/__init__.py +45 -0
xparse_client/connectors/_utils.py +138 -0
xparse_client/connectors/destinations/__init__.py +45 -0
xparse_client/connectors/destinations/base.py +116 -0
xparse_client/connectors/destinations/local.py +91 -0
xparse_client/connectors/destinations/milvus.py +229 -0
xparse_client/connectors/destinations/qdrant.py +238 -0
xparse_client/connectors/destinations/s3.py +163 -0
xparse_client/connectors/sources/__init__.py +45 -0
xparse_client/connectors/sources/base.py +74 -0
xparse_client/connectors/sources/ftp.py +278 -0
xparse_client/connectors/sources/local.py +176 -0
xparse_client/connectors/sources/s3.py +232 -0
xparse_client/connectors/sources/smb.py +259 -0
xparse_client/exceptions.py +398 -0
xparse_client/models/__init__.py +60 -0
xparse_client/models/chunk.py +39 -0
xparse_client/models/embed.py +62 -0
xparse_client/models/extract.py +41 -0
xparse_client/models/local.py +38 -0
xparse_client/models/parse.py +136 -0
xparse_client/models/pipeline.py +132 -0
xparse_client/models/workflows.py +74 -0
xparse_client-0.3.0b1.dist-info/METADATA +1075 -0
xparse_client-0.3.0b1.dist-info/RECORD +68 -0
{xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/WHEEL +1 -1
{xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/licenses/LICENSE +1 -1
{xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/top_level.txt +2 -0
xparse_client/pipeline/__init__.py +0 -3
xparse_client/pipeline/config.py +0 -163
xparse_client/pipeline/destinations.py +0 -489
xparse_client/pipeline/pipeline.py +0 -860
xparse_client/pipeline/sources.py +0 -583
xparse_client-0.2.20.dist-info/METADATA +0 -1050
xparse_client-0.2.20.dist-info/RECORD +0 -11

tests/unit/connectors/test_utils.py ADDED Viewed

@@ -0,0 +1,335 @@
+"""Connectors 工具函数测试
+测试 xparse_client.connectors._utils 模块的所有工具函数。
+运行方式:
+    pytest tests/unit/connectors/test_utils.py -v
+"""
+import json
+from datetime import timezone
+from unittest.mock import MagicMock, patch
+import pytest
+from xparse_client.connectors._utils import (
+    flatten_dict,
+    get_current_millis_timestamp,
+    match_file_pattern,
+    normalize_wildcard_patterns,
+    to_millis_timestamp,
+)
+# ============================================================================
+# normalize_wildcard_patterns 测试
+# ============================================================================
+def test_normalize_wildcard_patterns_none():
+    """测试 None 输入返回 None"""
+    result = normalize_wildcard_patterns(None)
+    assert result is None
+def test_normalize_wildcard_patterns_valid_list():
+    """测试有效的模式列表"""
+    result = normalize_wildcard_patterns(["*.pdf", "*.docx"])
+    assert result == ["*.pdf", "*.docx"]
+def test_normalize_wildcard_patterns_with_whitespace():
+    """测试包含空格的模式"""
+    result = normalize_wildcard_patterns([" *.pdf ", "  *.docx  "])
+    assert result == ["*.pdf", "*.docx"]
+def test_normalize_wildcard_patterns_with_empty_strings():
+    """测试包含空字符串的模式列表"""
+    result = normalize_wildcard_patterns(["*.pdf", "", "  ", "*.docx"])
+    assert result == ["*.pdf", "*.docx"]
+def test_normalize_wildcard_patterns_empty_list():
+    """测试空列表返回 None（匹配所有）"""
+    result = normalize_wildcard_patterns([])
+    assert result is None
+def test_normalize_wildcard_patterns_wildcard_all():
+    """测试包含 * 通配符返回 None（匹配所有）"""
+    result = normalize_wildcard_patterns(["*.pdf", "*", "*.docx"])
+    assert result is None
+def test_normalize_wildcard_patterns_only_spaces():
+    """测试只包含空格的列表返回 None"""
+    result = normalize_wildcard_patterns(["  ", "   "])
+    assert result is None
+def test_normalize_wildcard_patterns_invalid_type():
+    """测试无效类型抛出 ValueError"""
+    with pytest.raises(ValueError, match="pattern 必须是列表类型"):
+        normalize_wildcard_patterns("*.pdf")
+    with pytest.raises(ValueError, match="pattern 必须是列表类型"):
+        normalize_wildcard_patterns({"pattern": "*.pdf"})
+# ============================================================================
+# match_file_pattern 测试
+# ============================================================================
+def test_match_file_pattern_none_patterns():
+    """测试 None patterns 匹配所有文件"""
+    assert match_file_pattern("test.pdf", None) is True
+    assert match_file_pattern("any_file.txt", None) is True
+def test_match_file_pattern_basic_match():
+    """测试基本模式匹配"""
+    assert match_file_pattern("document.pdf", ["*.pdf"]) is True
+    assert match_file_pattern("document.docx", ["*.pdf"]) is False
+def test_match_file_pattern_multiple_patterns():
+    """测试多个模式匹配"""
+    patterns = ["*.pdf", "*.docx", "*.txt"]
+    assert match_file_pattern("file.pdf", patterns) is True
+    assert match_file_pattern("file.docx", patterns) is True
+    assert match_file_pattern("file.txt", patterns) is True
+    assert match_file_pattern("file.xlsx", patterns) is False
+def test_match_file_pattern_with_path():
+    """测试带路径的文件匹配（只匹配文件名）"""
+    assert match_file_pattern("dir/subdir/file.pdf", ["*.pdf"]) is True
+    assert match_file_pattern("dir/file.txt", ["*.txt"]) is True
+def test_match_file_pattern_complex_patterns():
+    """测试复杂通配符模式"""
+    assert match_file_pattern("report_2024.pdf", ["report_*.pdf"]) is True
+    assert match_file_pattern("test_file.txt", ["test_*"]) is True
+    assert match_file_pattern("document.pdf", ["doc*"]) is True
+def test_match_file_pattern_case_sensitive():
+    """测试大小写敏感匹配"""
+    # fnmatch 默认是大小写敏感的（在大多数系统上）
+    assert match_file_pattern("file.PDF", ["*.pdf"]) is False
+    assert match_file_pattern("file.PDF", ["*.PDF"]) is True
+# ============================================================================
+# to_millis_timestamp 测试
+# ============================================================================
+def test_to_millis_timestamp_none():
+    """测试 None 输入返回空字符串"""
+    result = to_millis_timestamp(None)
+    assert result == ""
+def test_to_millis_timestamp_seconds():
+    """测试秒级时间戳转换为毫秒"""
+    # 2024-01-28 12:00:00 UTC
+    timestamp = 1706443200.0
+    result = to_millis_timestamp(timestamp)
+    assert result == "1706443200000"
+def test_to_millis_timestamp_millis():
+    """测试毫秒级时间戳直接返回"""
+    # 已经是毫秒
+    timestamp = 1706443200000.0
+    result = to_millis_timestamp(timestamp)
+    assert result == "1706443200000"
+def test_to_millis_timestamp_with_decimals():
+    """测试带小数的时间戳"""
+    timestamp = 1706443200.123
+    result = to_millis_timestamp(timestamp)
+    assert result == "1706443200123"
+def test_to_millis_timestamp_zero():
+    """测试零值时间戳"""
+    result = to_millis_timestamp(0)
+    assert result == "0"
+def test_to_millis_timestamp_boundary():
+    """测试边界值 (1e12)"""
+    # 接近边界但仍是秒
+    result = to_millis_timestamp(1e11)
+    assert result == str(int(1e11 * 1000))
+    # 刚好超过边界，是毫秒
+    result = to_millis_timestamp(1e12 + 1)
+    assert result == str(int(1e12 + 1))
+# ============================================================================
+# get_current_millis_timestamp 测试
+# ============================================================================
+@patch('xparse_client.connectors._utils.datetime')
+def test_get_current_millis_timestamp(mock_datetime):
+    """测试获取当前毫秒时间戳"""
+    # Mock datetime.now() 返回固定时间
+    mock_now = MagicMock()
+    mock_now.timestamp.return_value = 1706443200.0
+    mock_datetime.now.return_value = mock_now
+    mock_datetime.timezone = timezone
+    result = get_current_millis_timestamp()
+    # 验证调用
+    mock_datetime.now.assert_called_once_with(timezone.utc)
+    # 验证结果
+    assert result == "1706443200000"
+def test_get_current_millis_timestamp_real():
+    """测试实际获取当前时间戳（不 mock）"""
+    result = get_current_millis_timestamp()
+    # 验证返回字符串
+    assert isinstance(result, str)
+    # 验证是有效的数字
+    timestamp = int(result)
+    assert timestamp > 0
+    # 验证长度（毫秒时间戳应该是 13 位）
+    assert len(result) == 13
+# ============================================================================
+# flatten_dict 测试
+# ============================================================================
+def test_flatten_dict_simple():
+    """测试简单字典展平"""
+    data = {"a": 1, "b": 2}
+    result = flatten_dict(data)
+    assert result == {"a": 1, "b": 2}
+def test_flatten_dict_nested():
+    """测试嵌套字典展平"""
+    data = {"a": {"b": 1, "c": 2}, "d": 3}
+    result = flatten_dict(data)
+    assert result == {"a_b": 1, "a_c": 2, "d": 3}
+def test_flatten_dict_with_prefix():
+    """测试带前缀的展平"""
+    data = {"a": 1, "b": 2}
+    result = flatten_dict(data, prefix="meta")
+    assert result == {"meta_a": 1, "meta_b": 2}
+def test_flatten_dict_nested_with_prefix():
+    """测试嵌套字典带前缀展平"""
+    data = {"user": {"name": "Alice", "age": 30}}
+    result = flatten_dict(data, prefix="doc")
+    assert result == {"doc_user_name": "Alice", "doc_user_age": 30}
+def test_flatten_dict_with_list():
+    """测试包含列表的字典展平"""
+    data = {"tags": ["python", "test"], "count": 2}
+    result = flatten_dict(data)
+    assert result["count"] == 2
+    assert result["tags"] == json.dumps(["python", "test"], ensure_ascii=False)
+def test_flatten_dict_with_list_chinese():
+    """测试包含中文列表的展平（ensure_ascii=False）"""
+    data = {"tags": ["测试", "文档"]}
+    result = flatten_dict(data)
+    # 验证中文不被转义
+    assert result["tags"] == '["测试", "文档"]'
+    assert "\\u" not in result["tags"]
+def test_flatten_dict_deep_nested():
+    """测试深层嵌套展平"""
+    data = {
+        "level1": {
+            "level2": {
+                "level3": {
+                    "value": 42
+                }
+            }
+        }
+    }
+    result = flatten_dict(data)
+    assert result == {"level1_level2_level3_value": 42}
+def test_flatten_dict_exclude_fields():
+    """测试排除字段"""
+    data = {"a": 1, "b": 2, "c": 3}
+    result = flatten_dict(data, exclude_fields={"b"})
+    assert result == {"a": 1, "c": 3}
+    assert "b" not in result
+def test_flatten_dict_exclude_nested_fields():
+    """测试排除嵌套字段"""
+    data = {"user": {"name": "Alice", "age": 30}, "count": 5}
+    result = flatten_dict(data, exclude_fields={"user_age"})
+    assert result == {"user_name": "Alice", "count": 5}
+    assert "user_age" not in result
+def test_flatten_dict_exclude_with_prefix():
+    """测试带前缀时排除字段"""
+    data = {"a": 1, "b": 2}
+    result = flatten_dict(data, prefix="meta", exclude_fields={"meta_b"})
+    assert result == {"meta_a": 1}
+    assert "meta_b" not in result
+def test_flatten_dict_empty():
+    """测试空字典"""
+    result = flatten_dict({})
+    assert result == {}
+def test_flatten_dict_mixed_types():
+    """测试混合类型"""
+    data = {
+        "string": "text",
+        "number": 42,
+        "float": 3.14,
+        "bool": True,
+        "null": None,
+        "list": [1, 2, 3],
+        "dict": {"nested": "value"}
+    }
+    result = flatten_dict(data)
+    assert result["string"] == "text"
+    assert result["number"] == 42
+    assert result["float"] == 3.14
+    assert result["bool"] is True
+    assert result["null"] is None
+    assert result["list"] == "[1, 2, 3]"
+    assert result["dict_nested"] == "value"
+def test_flatten_dict_exclude_none():
+    """测试 exclude_fields 为 None（默认）"""
+    data = {"a": 1, "b": 2}
+    result = flatten_dict(data, exclude_fields=None)
+    assert result == {"a": 1, "b": 2}

tests/unit/models/test_local.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""测试 Local API 的数据模型"""
+from xparse_client.models.local import FailedFile, WorkflowResult
+def test_failed_file_creation():
+    """测试 FailedFile 创建"""
+    failed = FailedFile(
+        file_path="/path/to/file.pdf",
+        error="Connection timeout",
+        retry_count=3
+    )
+    assert failed.file_path == "/path/to/file.pdf"
+    assert failed.error == "Connection timeout"
+    assert failed.retry_count == 3
+def test_workflow_result_creation():
+    """测试 WorkflowResult 创建"""
+    result = WorkflowResult(
+        total=10,
+        success=8,
+        failed=2,
+        failed_files=[
+            FailedFile(
+                file_path="/path/to/file1.pdf",
+                error="Parse error",
+                retry_count=1
+            )
+        ],
+        duration=125.5
+    )
+    assert result.total == 10
+    assert result.success == 8
+    assert result.failed == 2
+    assert len(result.failed_files) == 1
+    assert result.duration == 125.5
+def test_workflow_result_all_success():
+    """测试全部成功的 WorkflowResult"""
+    result = WorkflowResult(
+        total=5,
+        success=5,
+        failed=0,
+        failed_files=[],
+        duration=60.0
+    )
+    assert result.total == result.success
+    assert result.failed == 0
+    assert len(result.failed_files) == 0

tests/unit/models/test_pipeline_stages.py ADDED Viewed

@@ -0,0 +1,144 @@
+"""测试 PipelineStage Config 类型安全"""
+import pytest
+from pydantic import ValidationError
+from xparse_client.models import (
+    ChunkConfig,
+    ChunkStage,
+    EmbedConfig,
+    EmbedStage,
+    ExtractConfig,
+    ExtractStage,
+    ParseConfig,
+    ParseStage,
+)
+def test_parse_stage_with_config():
+    """测试 ParseStage 使用 ParseConfig"""
+    stage = ParseStage(config=ParseConfig(provider="textin"))
+    assert stage.type == "parse"
+    assert stage.config.provider == "textin"
+def test_parse_stage_with_extra_fields():
+    """测试 ParseConfig 支持额外字段"""
+    config = ParseConfig(provider="textin", custom_field="value")
+    stage = ParseStage(config=config)
+    # 序列化应该包含额外字段
+    dumped = stage.model_dump()
+    assert dumped["config"]["custom_field"] == "value"
+def test_chunk_stage_with_config():
+    """测试 ChunkStage 使用 ChunkConfig"""
+    stage = ChunkStage(
+        config=ChunkConfig(
+            strategy="by_title",
+            max_characters=2048,
+            overlap=100
+        )
+    )
+    assert stage.type == "chunk"
+    assert stage.config.strategy == "by_title"
+    assert stage.config.max_characters == 2048
+    assert stage.config.overlap == 100
+def test_embed_stage_with_config():
+    """测试 EmbedStage 使用 EmbedConfig"""
+    stage = EmbedStage(
+        config=EmbedConfig(
+            provider="qwen",
+            model_name="text-embedding-v3"
+        )
+    )
+    assert stage.type == "embed"
+    assert stage.config.provider == "qwen"
+    assert stage.config.model_name == "text-embedding-v3"
+def test_embed_config_validation():
+    """测试 EmbedConfig 验证 provider 和 model 匹配"""
+    # 正确的组合
+    EmbedConfig(provider="qwen", model_name="text-embedding-v3")
+    EmbedConfig(provider="doubao", model_name="doubao-embedding-large-text-250515")
+    # 错误的组合应该抛出异常
+    with pytest.raises(ValidationError):
+        EmbedConfig(provider="qwen", model_name="doubao-embedding-large-text-250515")
+def test_extract_stage_with_config():
+    """测试 ExtractStage 使用 ExtractConfig"""
+    schema = {
+        "type": "object",
+        "properties": {
+            "name": {"type": "string"},
+            "age": {"type": "number"}
+        }
+    }
+    stage = ExtractStage(
+        config=ExtractConfig(
+            schema=schema,
+            generate_citations=True
+        )
+    )
+    assert stage.type == "extract"
+    assert stage.config.generate_citations is True
+    # 序列化检查
+    dumped = stage.model_dump()
+    assert "schema" in dumped["config"]
+    assert dumped["config"]["schema"] == schema
+def test_default_configs():
+    """测试使用默认配置"""
+    parse_stage = ParseStage()
+    assert parse_stage.config.provider == "textin"
+    chunk_stage = ChunkStage()
+    assert chunk_stage.config.strategy == "basic"
+    assert chunk_stage.config.max_characters == 1024
+    embed_stage = EmbedStage()
+    assert embed_stage.config.provider == "qwen"
+    assert embed_stage.config.model_name == "text-embedding-v3"
+def test_stage_json_serialization():
+    """测试 Stage 的 JSON 序列化"""
+    stage = ParseStage(config=ParseConfig(provider="mineru", custom="value"))
+    # 序列化
+    dumped = stage.model_dump()
+    assert dumped == {
+        "type": "parse",
+        "config": {
+            "provider": "mineru",
+            "custom": "value"
+        }
+    }
+    # 反序列化
+    restored = ParseStage.model_validate(dumped)
+    assert restored.config.provider == "mineru"
+def test_chunk_config_constraints():
+    """测试 ChunkConfig 的约束"""
+    # 正常值
+    ChunkConfig(new_after_n_chars=100, max_characters=500, overlap=10)
+    # 负数应该失败
+    with pytest.raises(ValidationError):
+        ChunkConfig(new_after_n_chars=-1)
+    with pytest.raises(ValidationError):
+        ChunkConfig(max_characters=0)
+    with pytest.raises(ValidationError):
+        ChunkConfig(overlap=-10)

tests/unit/models/test_workflows.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""测试 Workflows API 的数据模型"""
+from xparse_client.models import ParseConfig, ParseStage
+from xparse_client.models.workflows import Schedule, WorkflowInformation, WorkflowState
+def test_schedule_creation():
+    """测试 Schedule 创建"""
+    schedule = Schedule(cron="0 0 * * *")
+    assert schedule.cron == "0 0 * * *"
+def test_workflow_information_creation():
+    """测试 WorkflowInformation 创建"""
+    workflow = WorkflowInformation(
+        workflow_id="wf_123",
+        name="daily-processing",
+        source_id="src_456",
+        destination_id="dst_789",
+        stages=[],
+        schedule=Schedule(cron="0 0 * * *"),
+        state=WorkflowState.ACTIVE,
+        created_at="2026-01-27T10:00:00Z",
+        updated_at="2026-01-27T10:00:00Z"
+    )
+    assert workflow.workflow_id == "wf_123"
+    assert workflow.name == "daily-processing"
+    assert workflow.state == WorkflowState.ACTIVE
+    assert workflow.schedule is not None
+def test_workflow_with_stages():
+    """测试带 stages 的工作流"""
+    workflow = WorkflowInformation(
+        workflow_id="wf_123",
+        name="parse-workflow",
+        source_id="src_456",
+        destination_id="dst_789",
+        stages=[ParseStage(config=ParseConfig(provider="textin"))],
+        schedule=None,
+        state=WorkflowState.ACTIVE,
+        created_at="2026-01-27T10:00:00Z",
+        updated_at="2026-01-27T10:00:00Z"
+    )
+    assert len(workflow.stages) == 1
+    assert workflow.schedule is None
+def test_workflow_states():
+    """测试工作流状态枚举"""
+    assert WorkflowState.ACTIVE == "active"
+    assert WorkflowState.PAUSED == "paused"
+    assert WorkflowState.ARCHIVED == "archived"

xparse-client 0.2.20__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

xparse-client 0.2.20__py3-none-any.whl → 0.3.0b1__py3-none-any.whl