PyPI - xparse-client - Versions diffs - 0.2.1__tar.gz → 0.2.2__tar.gz - Mend

xparse-client 0.2.1.tar.gz → 0.2.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

{xparse_client-0.2.1/xparse_client.egg-info → xparse_client-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.2.1
+Version: 0.2.2
 Summary: 面向Agent和RAG的新一代文档处理 AI Infra
 License-Expression: MIT
 Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -69,25 +69,32 @@ pip install --upgrade xparse-client
 #### 代码配置
 ```python
-from xparse_client import ParseConfig, ChunkConfig, EmbedConfig, Pipeline, S3Source, MilvusDestination
-# 创建配置对象
-parse_config = ParseConfig(
-    provider='textin'
-)
-chunk_config = ChunkConfig(
-    strategy='by_title',
-    include_orig_elements=False,
-    new_after_n_chars=512,
-    max_characters=1024,
-    overlap=50
-)
-embed_config = EmbedConfig(
-    provider='qwen',
-    model_name='text-embedding-v4'
-)
+from xparse_client import ParseConfig, ChunkConfig, EmbedConfig, Stage, Pipeline, S3Source, MilvusDestination
+# 使用新的 stages 格式创建配置
+stages = [
+    Stage(
+        type='parse',
+        config=ParseConfig(provider='textin')
+    ),
+    Stage(
+        type='chunk',
+        config=ChunkConfig(
+            strategy='by_title',
+            include_orig_elements=False,
+            new_after_n_chars=512,
+            max_characters=1024,
+            overlap=50
+        )
+    ),
+    Stage(
+        type='embed',
+        config=EmbedConfig(
+            provider='qwen',
+            model_name='text-embedding-v4'
+        )
+    )
+]
 # 创建 Pipeline
 source = S3Source(...)
@@ -98,9 +105,7 @@ pipeline = Pipeline(
     destination=destination,
     api_base_url='https://api.textin.com/api/xparse',
     api_headers={...},
-    parse_config=parse_config,
-    chunk_config=chunk_config,
-    embed_config=embed_config
+    stages=stages
 )
 pipeline.run()
@@ -115,25 +120,32 @@ config = {
     'api_base_url': 'https://api.textin.com/api/xparse',
     'api_headers': {...},
-    # Parse 配置（可选）
-    'parse_config': {
-        'provider': 'textin' # 当前支持textin文档解析，未来可扩展
-    },
-    # Chunk 配置（可选）
-    'chunk_config': {
-        'strategy': 'basic',             # 分块策略: 'basic' | 'by_title' | 'by_page'
-        'include_orig_elements': False,  # 是否包含原始元素
-        'new_after_n_chars': 512,        # 多少字符后创建新块
-        'max_characters': 1024,          # 最大字符数
-        'overlap': 0                     # 重叠字符数
-    },
-    # Embed 配置（可选）
-    'embed_config': {
-        'provider': 'qwen',                # 向量化供应商: 'qwen'/'doubao'
-        'model_name': 'text-embedding-v3'  # 模型名称
-    }
+    # Stages 配置
+    'stages': [
+        {
+            'type': 'parse',
+            'config': {
+                'provider': 'textin'  # 当前支持textin文档解析，未来可扩展
+            }
+        },
+        {
+            'type': 'chunk',
+            'config': {
+                'strategy': 'basic',             # 分块策略: 'basic' | 'by_title' | 'by_page'
+                'include_orig_elements': False,  # 是否包含原始元素
+                'new_after_n_chars': 512,        # 多少字符后创建新块
+                'max_characters': 1024,          # 最大字符数
+                'overlap': 0                     # 重叠字符数
+            }
+        },
+        {
+            'type': 'embed',
+            'config': {
+                'provider': 'qwen',                # 向量化供应商: 'qwen'/'doubao'
+                'model_name': 'text-embedding-v3'  # 模型名称
+            }
+        }
+    ]
 }
 # 使用配置创建 pipeline
@@ -426,8 +438,12 @@ Parse 参数中有必填项`Provider`，表示文档解析服务的供应商，
 - textin: 合合信息提供的文档解析服务，在速度、准确性上均为行业领先
   - 支持的文档解析参数参考 [TextIn 文档解析官方API文档](https://docs.textin.com/api-reference/endpoint/parse)
   - 接口调用将按照 `TextIn 通用文档解析` 服务的计费标准进行计费
-- mineru: 敬请期待
-- paddle: 敬请期待
+- textin-lite：
+  - 接口调用将按照 `TextIn 通用表格识别` 服务的计费标准进行计费
+- mineru:
+  - 接口调用将按照 `TextIn 通用文档解析` 服务的计费标准进行计费
+- paddle:
+  - 接口调用将按照 `TextIn 通用文档解析` 服务的计费标准进行计费
 2. **Chunk Stage** (`type: "chunk"`)
@@ -473,21 +489,21 @@ Parse 参数中有必填项`Provider`，表示文档解析服务的供应商，
     "stats": {
       "original_elements": 10,   // 原始解析的元素数量
       "chunked_elements": 15,    // 分块后的元素数量
-      "embedded_elements": 15,   // 向量化后的元素数量
-      "parse_config": {       // 使用的 parse 配置
-        "provider": "textin"
-      },
-      "chunk_config": {          // 使用的 chunk 配置
-        "strategy": "basic",
-        "include_orig_elements": false,
-        "new_after_n_chars": 512,
-        "max_characters": 1024,
-        "overlap": 0
-      },
-      "embed_config": {          // 使用的 embed 配置
-        "provider": "qwen",
-        "model_name": "text-embedding-v3"
-      }
+      "embedded_elements": 15,    // 向量化后的元素数量
+      "stages": [
+        {
+          "type": "parse",
+          "config": {
+            "provider": "textin-lite"
+          }
+        },
+        {
+          "type": "chunk",
+          "config": {
+            "strategy": "by_title"
+          }
+        }
+      ]
     }
   }
 }
@@ -529,25 +545,32 @@ config = {
         'x-ti-secret-code': 'your-secret-code'
     },
-    # Parse 配置（可选）
-    'parse_config': {
-        'provider': 'textin'
-    },
-    # Chunk 配置（可选）
-    'chunk_config': {
-        'strategy': 'by_title',           # 按标题分块
-        'include_orig_elements': False,
-        'new_after_n_chars': 512,
-        'max_characters': 1024,
-        'overlap': 50                    # 块之间重叠 50 字符
-    },
-    # Embed 配置（可选）
-    'embed_config': {
-        'provider': 'qwen',
-        'model_name': 'text-embedding-v3'
-    }
+    # Stages 配置
+    'stages': [
+        {
+            'type': 'parse',
+            'config': {
+                'provider': 'textin'
+            }
+        },
+        {
+            'type': 'chunk',
+            'config': {
+                'strategy': 'by_title',           # 按标题分块
+                'include_orig_elements': False,
+                'new_after_n_chars': 512,
+                'max_characters': 1024,
+                'overlap': 50                    # 块之间重叠 50 字符
+            }
+        },
+        {
+            'type': 'embed',
+            'config': {
+                'provider': 'qwen',
+                'model_name': 'text-embedding-v3'
+            }
+        }
+    ]
 }
 # 使用配置创建并运行 pipeline
@@ -572,15 +595,33 @@ config = {
         'output_dir': './test_output'
     },
     'api_base_url': 'https://api.textin.com/api/xparse',
-    # 使用默认的 chunk 和 embed 配置
-    'chunk_config': {
-        'strategy': 'basic',
-        'max_characters': 1024
+    'api_headers': {
+        'x-ti-app-id': 'your-app-id',
+        'x-ti-secret-code': 'your-secret-code'
     },
-    'embed_config': {
-        'provider': 'qwen',
-        'model_name': 'text-embedding-v3'
-    }
+    # Stages 配置
+    'stages': [
+        {
+            'type': 'parse',
+            'config': {
+                'provider': 'textin'
+            }
+        },
+        {
+            'type': 'chunk',
+            'config': {
+                'strategy': 'basic',
+                'max_characters': 1024
+            }
+        },
+        {
+            'type': 'embed',
+            'config': {
+                'provider': 'qwen',
+                'model_name': 'text-embedding-v3'
+            }
+        }
+    ]
 }
 pipeline = create_pipeline_from_config(config)
@@ -598,14 +639,29 @@ config_by_page = {
     'destination': {...},
     'api_base_url': 'https://api.textin.com/api/xparse',
     'api_headers': {...},
-    'chunk_config': {
-        'strategy': 'by_page',         # 按页面分块
-        'max_characters': 2048,       # 增大块大小
-        'overlap': 100                # 页面间重叠 100 字符
-    },
-    'embed_config': {
-        'model_name': 'text-embedding-v4'  # 使用更高精度的模型
-    }
+    'stages': [
+        {
+            'type': 'parse',
+            'config': {
+                'provider': 'textin'
+            }
+        },
+        {
+            'type': 'chunk',
+            'config': {
+                'strategy': 'by_page',         # 按页面分块
+                'max_characters': 2048,       # 增大块大小
+                'overlap': 100                # 页面间重叠 100 字符
+            }
+        },
+        {
+            'type': 'embed',
+            'config': {
+                'provider': 'qwen',
+                'model_name': 'text-embedding-v4'  # 使用更高精度的模型
+            }
+        }
+    ]
 }
 # 配置 2：按标题分块（适合结构化文档）
@@ -614,15 +670,29 @@ config_by_title = {
     'destination': {...},
     'api_base_url': 'https://api.textin.com/api/xparse',
     'api_headers': {...},
-    'chunk_config': {
-        'strategy': 'by_title',        # 按标题分块
-        'include_orig_elements': True, # 保留原始元素信息
-        'max_characters': 1536
-    },
-    'embed_config': {
-        'provider': 'qwen',
-        'model_name': 'text-embedding-v3'
-    }
+    'stages': [
+        {
+            'type': 'parse',
+            'config': {
+                'provider': 'textin'
+            }
+        },
+        {
+            'type': 'chunk',
+            'config': {
+                'strategy': 'by_title',        # 按标题分块
+                'include_orig_elements': True, # 保留原始元素信息
+                'max_characters': 1536
+            }
+        },
+        {
+            'type': 'embed',
+            'config': {
+                'provider': 'qwen',
+                'model_name': 'text-embedding-v3'
+            }
+        }
+    ]
 }
 # 根据文档类型选择配置
@@ -659,15 +729,29 @@ config = {
         'x-ti-secret-code': 'secret'
     },
-    # 配置处理参数
-    'chunk_config': {
-        'strategy': 'basic',
-        'max_characters': 1024
-    },
-    'embed_config': {
-        'provider': 'qwen',
-        'model_name': 'text-embedding-v3'
-    }
+    # Stages 配置
+    'stages': [
+        {
+            'type': 'parse',
+            'config': {
+                'provider': 'textin'
+            }
+        },
+        {
+            'type': 'chunk',
+            'config': {
+                'strategy': 'basic',
+                'max_characters': 1024
+            }
+        },
+        {
+            'type': 'embed',
+            'config': {
+                'provider': 'qwen',
+                'model_name': 'text-embedding-v3'
+            }
+        }
+    ]
 }
 pipeline = create_pipeline_from_config(config)
@@ -690,14 +774,32 @@ config = {
         'output_dir': './output'
     },
     'api_base_url': 'https://api.textin.com/api/xparse',
-    'chunk_config': {
-        'strategy': 'basic',
-        'max_characters': 1024
+    'api_headers': {
+        'x-ti-app-id': 'your-app-id',
+        'x-ti-secret-code': 'your-secret-code'
     },
-    'embed_config': {
-        'provider': 'qwen',
-        'model_name': 'text-embedding-v3'
-    }
+    'stages': [
+        {
+            'type': 'parse',
+            'config': {
+                'provider': 'textin'
+            }
+        },
+        {
+            'type': 'chunk',
+            'config': {
+                'strategy': 'basic',
+                'max_characters': 1024
+            }
+        },
+        {
+            'type': 'embed',
+            'config': {
+                'provider': 'qwen',
+                'model_name': 'text-embedding-v3'
+            }
+        }
+    ]
 }
 pipeline = create_pipeline_from_config(config)
@@ -712,9 +814,7 @@ if result:
     print(f"原始元素: {stats.original_elements}")
     print(f"分块后: {stats.chunked_elements}")
     print(f"向量化: {stats.embedded_elements}")
-    print(f"使用配置:")
-    print(f"  - 分块策略: {stats.chunk_config.strategy}")
-    print(f"  - 向量模型: {stats.embed_config.model_name}")
+    print(f"执行的 stages: {[s.type for s in stats.stages]}")
     # 写入目的地
     metadata = {
@@ -733,9 +833,7 @@ Pipeline 接口会返回详细的处理统计信息：
 | `original_elements` | int | 原始解析的元素数量 |
 | `chunked_elements` | int | 分块后的元素数量 |
 | `embedded_elements` | int | 向量化后的元素数量 |
-| `parse_config` | ParseConfig | 使用的解析配置 |
-| `chunk_config` | ChunkConfig | 使用的分块配置 |
-| `embed_config` | EmbedConfig | 使用的向量化配置 |
+| `stages` | List[Stage] | 实际执行的 stages 配置 |
 **示例输出：**
 ```

xparse-client 0.2.1__tar.gz → 0.2.2__tar.gz

xparse-client 0.2.1.tar.gz → 0.2.2.tar.gz