PyPI - xparse-client - Versions diffs - 0.2.7__tar.gz → 0.2.9__tar.gz - Mend

xparse-client 0.2.7.tar.gz → 0.2.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

{xparse_client-0.2.7 → xparse_client-0.2.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.2.7
+Version: 0.2.9
 Summary: 面向Agent和RAG的新一代文档处理 AI Infra
 License-Expression: MIT
 Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -173,7 +173,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='us-east-1',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
 请确保配置的访问凭证至少包括以下几项权限：
@@ -193,7 +193,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='cn-shanghai',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
 请确保配置的访问凭证至少包括以下几项权限：
@@ -214,7 +214,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='ap-shanghai',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
@@ -235,7 +235,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='cn-shanghai',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
@@ -257,7 +257,7 @@ source = S3Source(
     bucket='textin',
     prefix='',
     region='cn-east-3',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
@@ -279,7 +279,7 @@ source = S3Source(
     bucket='textin-xparse',
     prefix='',
     region='us-east-1',
-    pattern='*.pdf'  # 可选，使用 Shell 通配符过滤对象
+    pattern=['*.pdf']  # 可选，通配符模式列表，支持多个扩展名
 )
 ```
 请确保配置的访问凭证至少包括以下几项权限：
@@ -294,7 +294,7 @@ s3:GetObject
 ```python
 source = LocalSource(
     directory='./input',
-    pattern='*.pdf'  # 支持通配符: *.pdf, *.docx, **/*.txt
+    pattern=['*.pdf', '*.docx']  # 支持多个通配符模式列表
 )
 ```
@@ -306,7 +306,7 @@ source = FtpSource(
     port=21,
     username='', # 用户名，按照实际填写
     password='',  # 密码，按照实际填写
-    pattern='*.pdf'  # 可选，过滤指定类型文件
+    pattern=['*.pdf']  # 可选，通配符模式列表，过滤指定类型文件
 )
 ```
@@ -319,11 +319,11 @@ source = SmbSource(
     username='',  # 用户名，按照实际填写
     password='',  # 密码，按照实际填写
     domain='your-smb-domain',
-    pattern='**/*.pdf'  # 可选，支持多级匹配
+    pattern=['**/*.pdf']  # 可选，通配符模式列表，支持多级匹配
 )
 ```
-> 注 1：所有 Source 均支持 `pattern` 参数，使用 Shell 通配符（`*.pdf`、`**/*.txt` 等）来过滤需要处理的文件；默认为 `*`，即处理全部文件。
+> 注 1：所有 Source 均支持 `pattern` 参数，使用通配符模式列表（如 `['*.pdf', '*.docx']`）来过滤需要处理的文件。支持多个通配符模式，如果列表中包含 `'*'` 则匹配所有文件。默认为 `None`，即处理全部文件。
 > 注 2：所有 Source 均支持 `recursive` 参数，表示是否递归遍历，默认为 `False`。
@@ -517,298 +517,357 @@ Parse 参数中有必填项`Provider`，表示文档解析服务的供应商，
 ## 💡 使用示例
-### 示例 1: 使用 config 字典配置（推荐）
+### 示例 1: 手动创建 Pipeline（推荐）
 ```python
-from xparse_client import create_pipeline_from_config
+from xparse_client import (
+    Pipeline, S3Source, MilvusDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
-# 完整的配置示例
-config = {
-    # S3 数据源配置
-    'source': {
-        'type': 's3',
-        'endpoint': 'https://your-minio.com',
-        'access_key': 'your-access-key',
-        'secret_key': 'your-secret-key',
-        'bucket': 'documents',
-        'prefix': 'pdfs/',
-        'region': 'us-east-1',
-        'pattern': '*.pdf'  # 仅处理匹配的文件
-    },
-    # Milvus 目的地配置
-    'destination': {
-        'type': 'milvus',
-        'db_path': './vectors.db',
-        'collection_name': 'documents',
-        'dimension': 1024
-    },
-    # API 配置
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {
+# 创建数据源
+source = S3Source(
+    endpoint='https://your-minio.com',
+    access_key='your-access-key',
+    secret_key='your-secret-key',
+    bucket='documents',
+    prefix='pdfs/',
+    region='us-east-1',
+    pattern=['*.pdf'],  # 仅处理匹配的文件
+    recursive=False   # 不递归子目录
+)
+# 创建目的地
+destination = MilvusDestination(
+    db_path='./vectors.db',
+    collection_name='documents',
+    dimension=1024
+)
+# 配置处理阶段
+stages = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='by_title',           # 按标题分块
+            include_orig_elements=False,
+            new_after_n_chars=512,
+            max_characters=1024,
+            overlap=50                     # 块之间重叠 50 字符
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
+# 创建并运行 Pipeline
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
         'x-ti-app-id': 'your-app-id',
         'x-ti-secret-code': 'your-secret-code'
     },
-    # Stages 配置
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'by_title',           # 按标题分块
-                'include_orig_elements': False,
-                'new_after_n_chars': 512,
-                'max_characters': 1024,
-                'overlap': 50                    # 块之间重叠 50 字符
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
+    stages=stages
+)
-# 使用配置创建并运行 pipeline
-pipeline = create_pipeline_from_config(config)
 pipeline.run()
 ```
-### 示例 2: 本地到本地（测试）
+### 示例 1.1: 输出配置字典
+手动创建 Pipeline 后，可以使用 `get_config()` 方法获取配置字典：
 ```python
-from datetime import datetime, timezone
-from xparse_client import create_pipeline_from_config
+from xparse_client import (
+    Pipeline, LocalSource, LocalDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
-config = {
-    'source': {
-        'type': 'local',
-        'directory': './test_files',
-        'pattern': '*.pdf'
-    },
-    'destination': {
-        'type': 'local',
-        'output_dir': './test_output'
+# 手动创建 Pipeline
+source = LocalSource(
+    directory='./test_files',
+    pattern=['*.pdf'],
+    recursive=False
+)
+destination = LocalDestination(output_dir='./test_output')
+stages = [
+    Stage(type='parse', config=ParseConfig(provider='textin')),
+    Stage(type='chunk', config=ChunkConfig(strategy='basic', max_characters=1024)),
+    Stage(type='embed', config=EmbedConfig(provider='qwen', model_name='text-embedding-v3'))
+]
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
+        'x-ti-app-id': 'your-app-id',
+        'x-ti-secret-code': 'your-secret-code'
     },
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {
+    stages=stages
+)
+# 获取配置字典（格式与 create_pipeline_from_config 的入参一致）
+config_dict = pipeline.get_config()
+# 可以保存为 JSON 文件
+import json
+with open('pipeline_config.json', 'w', encoding='utf-8') as f:
+    json.dump(config_dict, f, indent=2, ensure_ascii=False)
+# 或者用于创建新的 Pipeline（需要补充敏感信息如 access_key, secret_key 等）
+# from xparse_client import create_pipeline_from_config
+# new_pipeline = create_pipeline_from_config(config_dict)
+```
+### 示例 2: 本地到本地（测试）
+```python
+from xparse_client import (
+    Pipeline, LocalSource, LocalDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
+# 创建本地数据源
+source = LocalSource(
+    directory='./test_files',
+    pattern=['*.pdf'],
+    recursive=False
+)
+# 创建本地输出目的地
+destination = LocalDestination(output_dir='./test_output')
+# 配置处理阶段
+stages = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='basic',
+            max_characters=1024
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
+# 创建并运行 Pipeline
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
         'x-ti-app-id': 'your-app-id',
         'x-ti-secret-code': 'your-secret-code'
     },
-    # Stages 配置
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'basic',
-                'max_characters': 1024
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
+    stages=stages
+)
-pipeline = create_pipeline_from_config(config)
 pipeline.run()
 ```
 ### 示例 3: 不同分块策略的配置
 ```python
-from xparse_client import create_pipeline_from_config
+from xparse_client import (
+    Pipeline, S3Source, MilvusDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
+# 创建数据源和目的地
+source = S3Source(...)
+destination = MilvusDestination(...)
 # 配置 1：按页面分块（适合 PDF 文档）
-config_by_page = {
-    'source': {...},
-    'destination': {...},
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {...},
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'by_page',         # 按页面分块
-                'max_characters': 2048,       # 增大块大小
-                'overlap': 100                # 页面间重叠 100 字符
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v4'  # 使用更高精度的模型
-            }
-        }
-    ]
-}
+stages_by_page = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='by_page',         # 按页面分块
+            max_characters=2048,         # 增大块大小
+            overlap=100                  # 页面间重叠 100 字符
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v4'  # 使用更高精度的模型
+        )
+    )
+]
 # 配置 2：按标题分块（适合结构化文档）
-config_by_title = {
-    'source': {...},
-    'destination': {...},
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {...},
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'by_title',        # 按标题分块
-                'include_orig_elements': True, # 保留原始元素信息
-                'max_characters': 1536
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
+stages_by_title = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='by_title',        # 按标题分块
+            include_orig_elements=True,  # 保留原始元素信息
+            max_characters=1536
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
 # 根据文档类型选择配置
-pipeline = create_pipeline_from_config(config_by_page)
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={...},
+    stages=stages_by_page  # 或 stages_by_title
+)
 pipeline.run()
 ```
 ### 示例 4: FTP 数据源配置
 ```python
-from xparse_client import create_pipeline_from_config
+from xparse_client import (
+    Pipeline, FtpSource, MilvusDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
-config = {
-    # FTP 数据源
-    'source': {
-        'type': 'ftp',
-        'host': 'ftp.example.com',
-        'port': 21,
-        'username': 'user',
-        'password': 'pass'
-    },
-    # Milvus 目的地
-    'destination': {
-        'type': 'milvus',
-        'db_path': './vectors.db',
-        'collection_name': 'ftp_docs',
-        'dimension': 1024
-    },
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {
+# 创建 FTP 数据源
+source = FtpSource(
+    host='ftp.example.com',
+    port=21,
+    username='user',
+    password='pass',
+    pattern=['*.pdf'],
+    recursive=False
+)
+# 创建 Milvus 目的地
+destination = MilvusDestination(
+    db_path='./vectors.db',
+    collection_name='ftp_docs',
+    dimension=1024
+)
+# 配置处理阶段
+stages = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='basic',
+            max_characters=1024
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
+# 创建并运行 Pipeline
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
         'x-ti-app-id': 'app-id',
         'x-ti-secret-code': 'secret'
     },
-    # Stages 配置
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'basic',
-                'max_characters': 1024
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
+    stages=stages
+)
-pipeline = create_pipeline_from_config(config)
 pipeline.run()
 ```
 ### 示例 5: 获取处理统计信息
 ```python
-from xparse_client import create_pipeline_from_config
+from datetime import datetime, timezone
+from xparse_client import (
+    Pipeline, LocalSource, LocalDestination,
+    ParseConfig, ChunkConfig, EmbedConfig, Stage
+)
-config = {
-    'source': {
-        'type': 'local',
-        'directory': './docs',
-        'pattern': '*.pdf'
-    },
-    'destination': {
-        'type': 'local',
-        'output_dir': './output'
-    },
-    'api_base_url': 'https://api.textin.com/api/xparse',
-    'api_headers': {
+# 创建 Pipeline
+source = LocalSource(
+    directory='./docs',
+    pattern=['*.pdf'],
+    recursive=False
+)
+destination = LocalDestination(output_dir='./output')
+stages = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='basic',
+            max_characters=1024
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v3'
+        )
+    )
+]
+pipeline = Pipeline(
+    source=source,
+    destination=destination,
+    api_base_url='https://api.textin.com/api/xparse',
+    api_headers={
         'x-ti-app-id': 'your-app-id',
         'x-ti-secret-code': 'your-secret-code'
     },
-    'stages': [
-        {
-            'type': 'parse',
-            'config': {
-                'provider': 'textin'
-            }
-        },
-        {
-            'type': 'chunk',
-            'config': {
-                'strategy': 'basic',
-                'max_characters': 1024
-            }
-        },
-        {
-            'type': 'embed',
-            'config': {
-                'provider': 'qwen',
-                'model_name': 'text-embedding-v3'
-            }
-        }
-    ]
-}
-pipeline = create_pipeline_from_config(config)
+    stages=stages
+)
 # 处理单个文件并获取统计信息
 file_bytes, data_source = pipeline.source.read_file('document.pdf')

xparse-client 0.2.7.tar.gz → 0.2.9.tar.gz

xparse-client 0.2.7.tar.gz → 0.2.9.tar.gz