PyPI - xparse-client - Versions diffs - 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl - Mend

xparse-client: xparse_client-0.2.7-py3-none-any.whl → xparse_client-0.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

example/run_pipeline.py +17 -14
example/run_pipeline_test.py +8 -8
xparse_client/pipeline/pipeline.py +133 -0
xparse_client/pipeline/sources.py +83 -24
{xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/METADATA +311 -252
xparse_client-0.2.9.dist-info/RECORD +13 -0
xparse_client-0.2.7.dist-info/RECORD +0 -13
{xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/WHEEL +0 -0
{xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/licenses/LICENSE +0 -0
{xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/top_level.txt +0 -0

example/run_pipeline.py CHANGED Viewed

@@ -5,6 +5,7 @@ Pipeline 运行脚本
 快速启动和运行 Pipeline 的示例
 '''
+import json
 from datetime import datetime, timezone
 from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
@@ -127,14 +128,14 @@ def run_with_manual_setup():
     #     prefix='',
     #     region='cn-east-3'
     # )
-    source = S3Source(
-        endpoint='https://s3.us-east-1.amazonaws.com',
-        access_key='AKIA6QUE3TVZADUWA4PO',
-        secret_key='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',
-        bucket='textin-xparse',
-        prefix='',
-        region='us-east-1'
-    )
+    # source = S3Source(
+    #     endpoint='https://s3.us-east-1.amazonaws.com',
+    #     access_key='AKIA6QUE3TVZADUWA4PO',
+    #     secret_key='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',
+    #     bucket='textin-xparse',
+    #     prefix='',
+    #     region='us-east-1'
+    # )
     # source = S3Source(
     #     endpoint='http://127.0.0.1:9000',
     #     access_key='',
@@ -157,10 +158,11 @@ def run_with_manual_setup():
     #     username='', # 用户名，按照实际填写
     #     password=''  # 密码，按照实际填写
     # )
-    # source = LocalSource(
-    #     directory='/Users/ke_wang/Documents/doc',
-    #     pattern='*.pdf'  # 支持通配符: *.pdf, *.docx, **/*.txt
-    # )
+    source = LocalSource(
+        directory='/Users/ke_wang/Documents/doc',
+        recursive=False,
+        pattern=['*']  # 支持通配符: *.pdf, *.docx, **/*.txt
+    )
     # 创建 Milvus 目的地
     # destination = MilvusDestination(
@@ -175,7 +177,7 @@ def run_with_manual_setup():
     destination = MilvusDestination(
         db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
-        collection_name='textin_test_3', # 数据库collection名称
+        collection_name='textin_test_3_copy', # 数据库collection名称
         dimension=1024,  # 向量维度，需与 embed API 返回一致
         api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46'  # Zilliz Cloud API Key
     )
@@ -193,7 +195,7 @@ def run_with_manual_setup():
     stages = [
         Stage(
             type='parse',
-            config=ParseConfig(provider='textin-lite')
+            config=ParseConfig(provider='textin')
         ),
         Stage(
             type='chunk',
@@ -235,6 +237,7 @@ def run_with_manual_setup():
     )
     # 运行
+    # config = pipeline.get_config()
     pipeline.run()

example/run_pipeline_test.py CHANGED Viewed

@@ -86,14 +86,14 @@ def run_with_manual_setup():
     from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
     # 创建 S3 数据源
-    # source = S3Source(
-    #     endpoint='https://textin-minio-api.ai.intsig.net',
-    #     access_key='IEQspf8C7fVcgmp3AZWl',
-    #     secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
-    #     bucket='textin-test',
-    #     prefix='',
-    #     region='us-east-1'
-    # )
+    source = S3Source(
+        endpoint='https://textin-minio-api.ai.intsig.net',
+        access_key='IEQspf8C7fVcgmp3AZWl',
+        secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
+        bucket='textin-test',
+        prefix='',
+        region='us-east-1'
+    )
     source = S3Source(
         endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
         access_key='LTAI5tBgsaVfkbh9rbPyuB17',

xparse_client/pipeline/pipeline.py CHANGED Viewed

@@ -79,6 +79,139 @@ class Pipeline:
             print(f"  Pipeline Config: 中间结果保存已启用")
         print("=" * 60)
+    def get_config(self) -> Dict[str, Any]:
+        """获取 Pipeline 的完整配置信息，返回字典格式（与 create_pipeline_from_config 的入参格式一致）"""
+        config = {}
+        # Source 配置
+        source_type = type(self.source).__name__.replace('Source', '').lower()
+        config['source'] = {'type': source_type}
+        if isinstance(self.source, S3Source):
+            config['source'].update({
+                'endpoint': self.source.endpoint,
+                'bucket': self.source.bucket,
+                'prefix': self.source.prefix,
+                'pattern': self.source.pattern,
+                'recursive': self.source.recursive
+            })
+            # access_key 和 secret_key 不在对象中保存，无法恢复
+            # region 也不在对象中保存，使用默认值
+            config['source']['region'] = 'us-east-1'  # 默认值
+        elif isinstance(self.source, LocalSource):
+            config['source'].update({
+                'directory': str(self.source.directory),
+                'pattern': self.source.pattern,
+                'recursive': self.source.recursive
+            })
+        elif isinstance(self.source, FtpSource):
+            config['source'].update({
+                'host': self.source.host,
+                'port': self.source.port,
+                'username': self.source.username,
+                'pattern': self.source.pattern,
+                'recursive': self.source.recursive
+            })
+            # password 不在对象中保存，无法恢复
+        elif isinstance(self.source, SmbSource):
+            config['source'].update({
+                'host': self.source.host,
+                'share_name': self.source.share_name,
+                'username': self.source.username,
+                'domain': self.source.domain,
+                'port': self.source.port,
+                'path': self.source.path,
+                'pattern': self.source.pattern,
+                'recursive': self.source.recursive
+            })
+            # password 不在对象中保存，无法恢复
+        # Destination 配置
+        dest_type = type(self.destination).__name__.replace('Destination', '').lower()
+        # MilvusDestination 和 Zilliz 都使用 'milvus' 或 'zilliz' 类型
+        if dest_type == 'milvus':
+            # 判断是本地 Milvus 还是 Zilliz（通过 db_path 判断）
+            if self.destination.db_path.startswith('http'):
+                dest_type = 'zilliz'
+            else:
+                dest_type = 'milvus'
+        config['destination'] = {'type': dest_type}
+        if isinstance(self.destination, MilvusDestination):
+            config['destination'].update({
+                'db_path': self.destination.db_path,
+                'collection_name': self.destination.collection_name,
+                'dimension': self.destination.dimension
+            })
+            # api_key 和 token 不在对象中保存，无法恢复
+        elif isinstance(self.destination, LocalDestination):
+            config['destination'].update({
+                'output_dir': str(self.destination.output_dir)
+            })
+        elif isinstance(self.destination, S3Destination):
+            config['destination'].update({
+                'endpoint': self.destination.endpoint,
+                'bucket': self.destination.bucket,
+                'prefix': self.destination.prefix
+            })
+            # access_key, secret_key, region 不在对象中保存，无法恢复
+            config['destination']['region'] = 'us-east-1'  # 默认值
+        # API 配置
+        config['api_base_url'] = self.api_base_url
+        config['api_headers'] = {}
+        for key, value in self.api_headers.items():
+            config['api_headers'][key] = value
+        # Stages 配置
+        config['stages'] = []
+        for stage in self.stages:
+            stage_dict = {
+                'type': stage.type,
+                'config': {}
+            }
+            if isinstance(stage.config, ParseConfig):
+                stage_dict['config'] = stage.config.to_dict()
+            elif isinstance(stage.config, ChunkConfig):
+                stage_dict['config'] = stage.config.to_dict()
+            elif isinstance(stage.config, EmbedConfig):
+                stage_dict['config'] = stage.config.to_dict()
+            else:
+                # 如果 config 是字典或其他类型，尝试转换
+                if isinstance(stage.config, dict):
+                    stage_dict['config'] = stage.config
+                else:
+                    stage_dict['config'] = str(stage.config)
+            config['stages'].append(stage_dict)
+        # Pipeline Config
+        if self.pipeline_config.include_intermediate_results:
+            config['pipeline_config'] = {
+                'include_intermediate_results': True,
+                'intermediate_results_destination': {}
+            }
+            inter_dest = self.pipeline_config.intermediate_results_destination
+            if inter_dest:
+                inter_dest_type = type(inter_dest).__name__.replace('Destination', '').lower()
+                config['pipeline_config']['intermediate_results_destination']['type'] = inter_dest_type
+                if isinstance(inter_dest, LocalDestination):
+                    config['pipeline_config']['intermediate_results_destination']['output_dir'] = str(inter_dest.output_dir)
+                elif isinstance(inter_dest, S3Destination):
+                    config['pipeline_config']['intermediate_results_destination'].update({
+                        'endpoint': inter_dest.endpoint,
+                        'bucket': inter_dest.bucket,
+                        'prefix': inter_dest.prefix
+                    })
+                    # access_key, secret_key, region 不在对象中保存，无法恢复
+                    config['pipeline_config']['intermediate_results_destination']['region'] = 'us-east-1'  # 默认值
+        return config
     def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         url = f"{self.api_base_url}/pipeline"
         max_retries = 3

xparse_client/pipeline/sources.py CHANGED Viewed

@@ -11,7 +11,7 @@ from datetime import datetime, timezone
 from email.utils import parsedate_to_datetime
 from fnmatch import fnmatch
 from pathlib import Path
-from typing import List, Dict, Any, Tuple
+from typing import List, Dict, Any, Tuple, Optional
 from smb.SMBConnection import SMBConnection
 from botocore.config import Config
@@ -20,6 +20,56 @@ from botocore.config import Config
 logger = logging.getLogger(__name__)
+def _normalize_wildcard_patterns(pattern: Optional[List[str]]) -> Optional[List[str]]:
+    """规范化通配符模式列表
+    Args:
+        pattern: 通配符模式列表，如果为 None 或空列表则返回 None（表示匹配所有文件）
+    Returns:
+        通配符模式列表，如果 pattern 是 None、空列表或包含 "*" 则返回 None（表示匹配所有文件）
+    """
+    if pattern is None or not pattern:
+        return None  # None 表示匹配所有文件
+    if not isinstance(pattern, list):
+        raise ValueError(f"pattern 类型错误: {type(pattern)}")
+    # 过滤空字符串并去除空格
+    normalized = [p.strip() for p in pattern if p and p.strip()]
+    if not normalized:
+        return None
+    # 如果包含 "*"，直接返回 None（匹配所有文件，减少后续开销）
+    if '*' in normalized:
+        return None
+    return normalized
+def _match_file_extension(file_path: str, wildcard_patterns: Optional[List[str]]) -> bool:
+    """检查文件路径是否匹配通配符模式
+    Args:
+        file_path: 文件路径
+        wildcard_patterns: 已规范化的通配符模式列表（如 ['*.pdf', '*.docx']）
+    Returns:
+        如果匹配返回 True，否则返回 False
+    """
+    # 如果 wildcard_patterns 是 None 或空列表，匹配所有文件
+    if wildcard_patterns is None:
+        return True
+    # 检查是否匹配任何一个通配符模式
+    for wildcard_pattern in wildcard_patterns:
+        if fnmatch(file_path, wildcard_pattern):
+            return True
+    return False
 def _to_millis_timestamp_string(timestamp):
     """将时间戳转换为毫秒时间戳字符串
@@ -62,11 +112,11 @@ class S3Source(Source):
     """S3/MinIO 数据源"""
     def __init__(self, endpoint: str, access_key: str, secret_key: str,
-                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: str = '*', recursive: bool = False):
+                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: Optional[List[str]] = None, recursive: bool = False):
         self.endpoint = endpoint
         self.bucket = bucket
         self.prefix = prefix
-        self.pattern = pattern or '*'
+        self.pattern = _normalize_wildcard_patterns(pattern)  # 在初始化时规范化
         self.recursive = recursive
         if self.endpoint == 'https://textin-minio-api.ai.intsig.net':
@@ -108,7 +158,7 @@ class S3Source(Source):
                     key = obj['Key']
                     if key.endswith('/') or key.endswith('empty.tmp'):
                         continue
-                    if fnmatch(key, self.pattern):
+                    if _match_file_extension(key, self.pattern):
                         files.append(key)
             # 非递归模式下，CommonPrefixes 包含子目录，我们忽略它们
@@ -156,9 +206,9 @@ class S3Source(Source):
 class LocalSource(Source):
     """本地文件系统数据源"""
-    def __init__(self, directory: str, pattern: str = '*', recursive: bool = False):
+    def __init__(self, directory: str, pattern: Optional[List[str]] = None, recursive: bool = False):
         self.directory = Path(directory)
-        self.pattern = pattern or '*'
+        self.pattern = _normalize_wildcard_patterns(pattern)  # 在初始化时规范化
         self.recursive = recursive
         if not self.directory.exists():
@@ -168,20 +218,29 @@ class LocalSource(Source):
         logger.info(f"本地目录: {self.directory}")
     def list_files(self) -> List[str]:
+        all_files = []
+        # 匹配所有文件
         if self.recursive:
-            # 递归模式：使用 rglob
-            files = [
+            all_files.extend([
                 str(f.relative_to(self.directory))
-                for f in self.directory.rglob(self.pattern)
+                for f in self.directory.rglob('*')
                 if f.is_file()
-            ]
+            ])
         else:
-            # 非递归模式：只列出根目录下的文件，使用 glob
-            files = [
+            all_files.extend([
                 str(f.relative_to(self.directory))
-                for f in self.directory.glob(self.pattern)
+                for f in self.directory.glob('*')
                 if f.is_file()
-            ]
+            ])
+        files = []
+        if self.pattern is not None:
+            for file in all_files:
+                if _match_file_extension(file, self.pattern):
+                    files.append(file)
+        else:
+            files.extend(all_files)
         print(f"✓ 本地找到 {len(files)} 个文件")
         return files
@@ -217,12 +276,12 @@ class LocalSource(Source):
 class FtpSource(Source):
     """FTP 数据源"""
-    def __init__(self, host: str, port: int, username: str, password: str, pattern: str = '*', recursive: bool = False):
+    def __init__(self, host: str, port: int, username: str, password: str, pattern: Optional[List[str]] = None, recursive: bool = False):
         self.host = host
         self.port = port
         self.username = username
         self.password = password
-        self.pattern = pattern or '*'
+        self.pattern = _normalize_wildcard_patterns(pattern)  # 在初始化时规范化
         self.recursive = recursive
         self.client = ftplib.FTP()
@@ -288,12 +347,12 @@ class FtpSource(Source):
                             except:
                                 # 不是目录，是文件
                                 relative_path = full_path.lstrip('/')
-                                if fnmatch(relative_path, self.pattern):
+                                if _match_file_extension(relative_path, self.pattern):
                                     files.append(relative_path)
                         else:
                             # 是文件
                             relative_path = full_path.lstrip('/')
-                            if fnmatch(relative_path, self.pattern):
+                            if _match_file_extension(relative_path, self.pattern):
                                 files.append(relative_path)
                     # 恢复原始目录
@@ -325,7 +384,7 @@ class FtpSource(Source):
                     item_type = item_info.get('type', 'unknown')
                     # 只添加文件，排除目录
                     if item_type == 'file' or (item_type == 'unknown' and not item_info.get('type', '').startswith('dir')):
-                        if fnmatch(item_name, self.pattern):
+                        if _match_file_extension(item_name, self.pattern):
                             files.append(item_name)
             except:
                 # 如果不支持 MLSD，使用 LIST 命令
@@ -341,7 +400,7 @@ class FtpSource(Source):
                                 continue
                             is_dir = parts[0].startswith('d')
                             # 只添加文件，排除目录
-                            if not is_dir and fnmatch(item_name, self.pattern):
+                            if not is_dir and _match_file_extension(item_name, self.pattern):
                                 files.append(item_name)
                 except:
                     # 最后回退到 nlst，通过尝试切换目录来判断是否为目录
@@ -357,7 +416,7 @@ class FtpSource(Source):
                             continue
                         except:
                             # 不能切换，说明是文件
-                            if fnmatch(item_name, self.pattern):
+                            if _match_file_extension(item_name, self.pattern):
                                 files.append(item_name)
             # 确保回到原始目录
@@ -405,7 +464,7 @@ class SmbSource(Source):
     """SMB/CIFS 数据源"""
     def __init__(self, host: str, share_name: str, username: str, password: str,
-                 domain: str = '', port: int = 445, path: str = '', pattern: str = '*', recursive: bool = False):
+                 domain: str = '', port: int = 445, path: str = '', pattern: Optional[List[str]] = None, recursive: bool = False):
         self.host = host
         self.share_name = share_name
         self.username = username
@@ -413,7 +472,7 @@ class SmbSource(Source):
         self.domain = domain
         self.port = port
         self.path = path.strip('/').strip('\\') if path else ''
-        self.pattern = pattern or '*'
+        self.pattern = _normalize_wildcard_patterns(pattern)  # 在初始化时规范化
         self.recursive = recursive
         self.conn = SMBConnection(
@@ -451,7 +510,7 @@ class SmbSource(Source):
                             _list_recursive(conn, share, item_path)
                         # 非递归模式：忽略子目录
                     else:
-                        if fnmatch(relative_path, self.pattern):
+                        if _match_file_extension(relative_path, self.pattern):
                             files.append(relative_path)
             except Exception as e:
                 logger.warning(f"列出路径失败 {current_path}: {str(e)}")

{xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.2.7
+Version: 0.2.9
 Summary: 面向Agent和RAG的新一代文档处理 AI Infra
 License-Expression: MIT
 Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -173,7 +173,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='us-east-1',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
 请确保配置的访问凭证至少包括以下几项权限：
@@ -193,7 +193,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='cn-shanghai',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
 请确保配置的访问凭证至少包括以下几项权限：
@@ -214,7 +214,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='ap-shanghai',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
@@ -235,7 +235,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='cn-shanghai',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
@@ -257,7 +257,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='cn-east-3',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
@@ -279,7 +279,7 @@ source = S3Source(
     bucket='textin-xparse',
     prefix='',
     region='us-east-1',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
 请确保配置的访问凭证至少包括以下几项权限：
@@ -294,7 +294,7 @@ s3:GetObject
 ```python
 source = LocalSource(
     directory='./input',
-    pattern='*.pdf'  # 支持通配符: *.pdf, *.docx, **/*.txt
+    pattern=['*.pdf', '*.docx']  # 支持多个通配符模式列表
 )
 ```
@@ -306,7 +306,7 @@ source = FtpSource(
     port=21,
     username='', # 用户名，按照实际填写
     password='',  # 密码，按照实际填写
-    pattern='*.pdf'  # 可选，过滤指定类型文件
+    pattern=['*.pdf']  # 可选，通配符模式列表，过滤指定类型文件
 )
 ```
@@ -319,11 +319,11 @@ source = SmbSource(
     username='',  # 用户名，按照实际填写
     password='',  # 密码，按照实际填写
     domain='your-smb-domain',
-    pattern='**/*.pdf'  # 可选，支持多级匹配
+    pattern=['**/*.pdf']  # 可选，通配符模式列表，支持多级匹配
 )
 ```
-> 注 1：所有 Source 均支持 `pattern` 参数，使用 Shell 通配符（`*.pdf`、`**/*.txt` 等）来过滤需要处理的文件；默认为 `*`，即处理全部文件。
+> 注 1：所有 Source 均支持 `pattern` 参数，使用通配符模式列表（如 `['*.pdf', '*.docx']`）来过滤需要处理的文件。支持多个通配符模式，如果列表中包含 `'*'` 则匹配所有文件。默认为 `None`，即处理全部文件。
 > 注 2：所有 Source 均支持 `recursive` 参数，表示是否递归遍历，默认为 `False`。
@@ -517,298 +517,357 @@ Parse 参数中有必填项`Provider`，表示文档解析服务的供应商，
 ## 💡 使用示例
-### 示例 1: 使用 config 字典配置（推荐）
+### 示例 1: 手动创建 Pipeline（推荐）
 ```python
-from xparse_client import create_pipeline_from_config
+from xparse_client import (
+    Pipeline, S3Source, MilvusDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
-# 完整的配置示例
-config = {
-    # S3 数据源配置
-    'source': {
-        'type': 's3',
-        'endpoint': 'https://your-minio.com',
-        'access_key': 'your-access-key',
-        'secret_key': 'your-secret-key',
-        'bucket': 'documents',
-        'prefix': 'pdfs/',
-        'region': 'us-east-1',
-        'pattern': '*.pdf'  # 仅处理匹配的文件
-    },
-    # Milvus 目的地配置
-    'destination': {
-        'type': 'milvus',
-        'db_path': './vectors.db',
-        'collection_name': 'documents',
-        'dimension': 1024
-    },
-    # API 配置
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {
+# 创建数据源
+source = S3Source(
+    endpoint='https://your-minio.com',
+    access_key='your-access-key',
+    secret_key='your-secret-key',
+    bucket='documents',
+    prefix='pdfs/',
+    region='us-east-1',
+    pattern=['*.pdf'],  # 仅处理匹配的文件
+    recursive=False   # 不递归子目录
+)
+# 创建目的地
+destination = MilvusDestination(
+    db_path='./vectors.db',
+    collection_name='documents',
+    dimension=1024
+)
+# 配置处理阶段
+stages = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='by_title',           # 按标题分块
+            include_orig_elements=False,
+            new_after_n_chars=512,
+            max_characters=1024,
+            overlap=50                     # 块之间重叠 50 字符
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
+# 创建并运行 Pipeline
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
         'x-ti-app-id': 'your-app-id',
         'x-ti-secret-code': 'your-secret-code'
     },
-    # Stages 配置
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'by_title',           # 按标题分块
-                'include_orig_elements': False,
-                'new_after_n_chars': 512,
-                'max_characters': 1024,
-                'overlap': 50                    # 块之间重叠 50 字符
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
+    stages=stages
+)
-# 使用配置创建并运行 pipeline
-pipeline = create_pipeline_from_config(config)
 pipeline.run()
 ```
-### 示例 2: 本地到本地（测试）
+### 示例 1.1: 输出配置字典
+手动创建 Pipeline 后，可以使用 `get_config()` 方法获取配置字典：
 ```python
-from datetime import datetime, timezone
-from xparse_client import create_pipeline_from_config
+from xparse_client import (
+    Pipeline, LocalSource, LocalDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
-config = {
-    'source': {
-        'type': 'local',
-        'directory': './test_files',
-        'pattern': '*.pdf'
-    },
-    'destination': {
-        'type': 'local',
-        'output_dir': './test_output'
+# 手动创建 Pipeline
+source = LocalSource(
+    directory='./test_files',
+    pattern=['*.pdf'],
+    recursive=False
+)
+destination = LocalDestination(output_dir='./test_output')
+stages = [
+    Stage(type='parse', config=ParseConfig(provider='textin')),
+    Stage(type='chunk', config=ChunkConfig(strategy='basic', max_characters=1024)),
+    Stage(type='embed', config=EmbedConfig(provider='qwen', model_name='text-embedding-v3'))
+]
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
+        'x-ti-app-id': 'your-app-id',
+        'x-ti-secret-code': 'your-secret-code'
     },
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {
+    stages=stages
+)
+# 获取配置字典（格式与 create_pipeline_from_config 的入参一致）
+config_dict = pipeline.get_config()
+# 可以保存为 JSON 文件
+import json
+with open('pipeline_config.json', 'w', encoding='utf-8') as f:
+    json.dump(config_dict, f, indent=2, ensure_ascii=False)
+# 或者用于创建新的 Pipeline（需要补充敏感信息如 access_key, secret_key 等）
+# from xparse_client import create_pipeline_from_config
+# new_pipeline = create_pipeline_from_config(config_dict)
+```
+### 示例 2: 本地到本地（测试）
+```python
+from xparse_client import (
+    Pipeline, LocalSource, LocalDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
+# 创建本地数据源
+source = LocalSource(
+    directory='./test_files',
+    pattern=['*.pdf'],
+    recursive=False
+)
+# 创建本地输出目的地
+destination = LocalDestination(output_dir='./test_output')
+# 配置处理阶段
+stages = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='basic',
+            max_characters=1024
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
+# 创建并运行 Pipeline
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
         'x-ti-app-id': 'your-app-id',
         'x-ti-secret-code': 'your-secret-code'
     },
-    # Stages 配置
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'basic',
-                'max_characters': 1024
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
+    stages=stages
+)
-pipeline = create_pipeline_from_config(config)
 pipeline.run()
 ```
 ### 示例 3: 不同分块策略的配置
 ```python
-from xparse_client import create_pipeline_from_config
+from xparse_client import (
+    Pipeline, S3Source, MilvusDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
+# 创建数据源和目的地
+source = S3Source(...)
+destination = MilvusDestination(...)
 # 配置 1：按页面分块（适合 PDF 文档）
-config_by_page = {
-    'source': {...},
-    'destination': {...},
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {...},
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'by_page',         # 按页面分块
-                'max_characters': 2048,       # 增大块大小
-                'overlap': 100                # 页面间重叠 100 字符
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v4'  # 使用更高精度的模型
-            }
-        }
-    ]
-}
+stages_by_page = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='by_page',         # 按页面分块
+            max_characters=2048,         # 增大块大小
+            overlap=100                  # 页面间重叠 100 字符
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v4'  # 使用更高精度的模型
+        )
+    )
+]
 # 配置 2：按标题分块（适合结构化文档）
-config_by_title = {
-    'source': {...},
-    'destination': {...},
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {...},
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'by_title',        # 按标题分块
-                'include_orig_elements': True, # 保留原始元素信息
-                'max_characters': 1536
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
+stages_by_title = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='by_title',        # 按标题分块
+            include_orig_elements=True,  # 保留原始元素信息
+            max_characters=1536
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
 # 根据文档类型选择配置
-pipeline = create_pipeline_from_config(config_by_page)
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={...},
+    stages=stages_by_page  # 或 stages_by_title
+)
 pipeline.run()
 ```
 ### 示例 4: FTP 数据源配置
 ```python
-from xparse_client import create_pipeline_from_config
+from xparse_client import (
+    Pipeline, FtpSource, MilvusDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
-config = {
-    # FTP 数据源
-    'source': {
-        'type': 'ftp',
-        'host': 'ftp.example.com',
-        'port': 21,
-        'username': 'user',
-        'password': 'pass'
-    },
-    # Milvus 目的地
-    'destination': {
-        'type': 'milvus',
-        'db_path': './vectors.db',
-        'collection_name': 'ftp_docs',
-        'dimension': 1024
-    },
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {
+# 创建 FTP 数据源
+source = FtpSource(
+    host='ftp.example.com',
+    port=21,
+    username='user',
+    password='pass',
+    pattern=['*.pdf'],
+    recursive=False
+)
+# 创建 Milvus 目的地
+destination = MilvusDestination(
+    db_path='./vectors.db',
+    collection_name='ftp_docs',
+    dimension=1024
+)
+# 配置处理阶段
+stages = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='basic',
+            max_characters=1024
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
+# 创建并运行 Pipeline
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
         'x-ti-app-id': 'app-id',
         'x-ti-secret-code': 'secret'
     },
-    # Stages 配置
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'basic',
-                'max_characters': 1024
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
+    stages=stages
+)
-pipeline = create_pipeline_from_config(config)
 pipeline.run()
 ```
 ### 示例 5: 获取处理统计信息
 ```python
-from xparse_client import create_pipeline_from_config
+from datetime import datetime, timezone
+from xparse_client import (
+    Pipeline, LocalSource, LocalDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
-config = {
-    'source': {
-        'type': 'local',
-        'directory': './docs',
-        'pattern': '*.pdf'
-    },
-    'destination': {
-        'type': 'local',
-        'output_dir': './output'
-    },
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {
+# 创建 Pipeline
+source = LocalSource(
+    directory='./docs',
+    pattern=['*.pdf'],
+    recursive=False
+)
+destination = LocalDestination(output_dir='./output')
+stages = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='basic',
+            max_characters=1024
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
         'x-ti-app-id': 'your-app-id',
         'x-ti-secret-code': 'your-secret-code'
     },
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'basic',
-                'max_characters': 1024
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
-pipeline = create_pipeline_from_config(config)
+    stages=stages
+)
 # 处理单个文件并获取统计信息
 file_bytes, data_source = pipeline.source.read_file('document.pdf')

xparse_client-0.2.9.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+example/run_pipeline.py,sha256=xZ8TLofrK7naEwBe-tiuotcQ8yKWUES_k9iCQcIOIYo,15446
+example/run_pipeline_test.py,sha256=pxsNiq_LmP6M4R7tTuja0u-Lu7fW-wIBU1uBf0-agQI,14845
+xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
+xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
+xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
+xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
+xparse_client/pipeline/pipeline.py,sha256=IRTxN4YUJi9Wrm1G1ysGvcwsPsGh0inbquBH3nWYmAA,26477
+xparse_client/pipeline/sources.py,sha256=D-kLrSQ-qsFFFq7JC4sL3Y3Q3Q87Wcpv9R5K85YkDjE,22144
+xparse_client-0.2.9.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
+xparse_client-0.2.9.dist-info/METADATA,sha256=Faj3fvt9Fc-EW9yFDewhpkqGVo_qSvL5N-tq1aIkkyk,28086
+xparse_client-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+xparse_client-0.2.9.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
+xparse_client-0.2.9.dist-info/RECORD,,

xparse_client-0.2.7.dist-info/RECORD DELETED Viewed

@@ -1,13 +0,0 @@
-example/run_pipeline.py,sha256=ijws5q_vMmV0-bMHuFtOUMrEnxnL1LvOBCtcCD2c8zc,15366
-example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
-xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
-xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
-xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
-xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
-xparse_client/pipeline/pipeline.py,sha256=pHw32eo-bRegzDvkuVUu0CjMXMejJ64dDXH7esGMXjg,20379
-xparse_client/pipeline/sources.py,sha256=UeVbWv6n0wQkIZIBBhrFCiyydQX7cvwmkoMgcf12p9g,19940
-xparse_client-0.2.7.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
-xparse_client-0.2.7.dist-info/METADATA,sha256=qMHiAq2qdH4vfW5zktrYrh9Kj72JxBDERVht8KYerl0,26805
-xparse_client-0.2.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-xparse_client-0.2.7.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
-xparse_client-0.2.7.dist-info/RECORD,,

{xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/WHEEL RENAMED Viewed

File without changes

{xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{xparse_client-0.2.7.dist-info → xparse_client-0.2.9.dist-info}/top_level.txt RENAMED Viewed

File without changes

xparse-client 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

xparse-client 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl