PyPI - xparse-client - Versions diffs - 0.2.11__py3-none-any.whl → 0.2.19__py3-none-any.whl - Mend

xparse-client 0.2.11 (py3-none-any.whl) → 0.2.19 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

xparse_client/pipeline/destinations.py CHANGED Viewed

@@ -240,6 +240,8 @@ class S3Destination(Destination):
             config = Config(signature_version='s3v4')
         elif self.endpoint.endswith('aliyuncs.com'):
             config = Config(signature_version='s3', s3={'addressing_style': 'virtual'})
+        elif self.endpoint.endswith('myhuaweicloud.com'):
+            config = Config(signature_version='s3', s3={'addressing_style': 'virtual'})
         else:
             config = Config(signature_version='s3v4', s3={'addressing_style': 'virtual'})

xparse_client/pipeline/pipeline.py CHANGED Viewed

@@ -3,6 +3,7 @@
 import json
 import logging
+import re
 import time
 from datetime import datetime, timezone
 from pathlib import Path
@@ -220,6 +221,67 @@ class Pipeline:
         return config
+    def _extract_error_message(self, response: requests.Response) -> Tuple[str, str]:
+        """
+        从响应中提取规范化的错误信息
+        Returns:
+            Tuple[str, str]: (error_msg, x_request_id)
+        """
+        # 首先尝试从响应头中提取 x-request-id（requests的headers大小写不敏感）
+        x_request_id = response.headers.get('x-request-id', '')
+        error_msg = ''
+        # 获取Content-Type
+        content_type = response.headers.get('Content-Type', '').lower()
+        # 尝试解析JSON响应
+        if 'application/json' in content_type:
+            try:
+                result = response.json()
+                # 如果响应头中没有x-request-id，尝试从响应体中获取
+                if not x_request_id:
+                    x_request_id = result.get('x_request_id', '')
+                error_msg = result.get('message', result.get('msg', f'HTTP {response.status_code}'))
+                return error_msg, x_request_id
+            except:
+                pass
+        # 处理HTML响应
+        if 'text/html' in content_type or response.text.strip().startswith('<'):
+            try:
+                # 从HTML中提取标题（通常包含状态码和状态文本）
+                title_match = re.search(r'<title>(.*?)</title>', response.text, re.IGNORECASE)
+                if title_match:
+                    error_msg = title_match.group(1).strip()
+                else:
+                    # 如果没有title，尝试提取h1标签
+                    h1_match = re.search(r'<h1>(.*?)</h1>', response.text, re.IGNORECASE)
+                    if h1_match:
+                        error_msg = h1_match.group(1).strip()
+                    else:
+                        error_msg = f'HTTP {response.status_code}'
+            except:
+                error_msg = f'HTTP {response.status_code}'
+        # 处理纯文本响应
+        elif 'text/plain' in content_type:
+            error_msg = response.text[:200].strip() if response.text else f'HTTP {response.status_code}'
+        # 其他情况
+        else:
+            if response.text:
+                # 尝试截取前200字符，但去除换行和多余空格
+                text = response.text[:200].strip()
+                # 如果包含多行，只取第一行
+                if '\n' in text:
+                    text = text.split('\n')[0].strip()
+                error_msg = text if text else f'HTTP {response.status_code}'
+            else:
+                error_msg = f'HTTP {response.status_code}'
+        return error_msg, x_request_id
     def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         url = f"{self.api_base_url}/pipeline"
         max_retries = 3
@@ -231,19 +293,24 @@ class Pipeline:
                 # 将 stages 转换为 API 格式
                 stages_data = [stage.to_dict() for stage in self.stages]
-                form_data['stages'] = json.dumps(stages_data)
-                form_data['data_source'] = json.dumps(data_source, ensure_ascii=False)
-                # 如果启用了中间结果保存，在请求中添加参数
-                if self.pipeline_config:
-                    form_data['config'] = json.dumps(self.pipeline_config.to_dict(), ensure_ascii=False)
+                try:
+                    form_data['stages'] = json.dumps(stages_data)
+                    form_data['data_source'] = json.dumps(data_source, ensure_ascii=False)
+                    # 如果启用了中间结果保存，在请求中添加参数
+                    if self.pipeline_config:
+                        form_data['config'] = json.dumps(self.pipeline_config.to_dict(), ensure_ascii=False)
+                except Exception as e:
+                    print(f"  ✗ 入参处理失败，请检查配置: {e}")
+                    logger.error(f"入参处理失败，请检查配置: {e}")
+                    return None
                 response = requests.post(
                     url,
                     files=files,
                     data=form_data,
                     headers=self.api_headers,
-                    timeout=120
+                    timeout=630
                 )
                 if response.status_code == 200:
@@ -258,15 +325,8 @@ class Pipeline:
                     logger.error(f"Pipeline 接口返回错误: code={result.get('code')}, message={error_msg}, x_request_id={x_request_id}")
                     return None
                 else:
-                    # 尝试解析响应获取 x_request_id 和错误信息
-                    x_request_id = ''
-                    error_msg = ''
-                    try:
-                        result = response.json()
-                        x_request_id = result.get('x_request_id', '')
-                        error_msg = result.get('message', result.get('msg', response.text[:200]))
-                    except:
-                        error_msg = response.text[:200] if response.text else f'HTTP {response.status_code}'
+                    # 使用规范化函数提取错误信息
+                    error_msg, x_request_id = self._extract_error_message(response)
                     print(f"  ✗ API 错误 {response.status_code}: {error_msg}, x_request_id={x_request_id}, 重试 {try_count + 1}/{max_retries}")
                     logger.warning(f"API 错误 {response.status_code}: {error_msg}, x_request_id={x_request_id}, 重试 {try_count + 1}/{max_retries}")
@@ -369,6 +429,14 @@ class Pipeline:
             print(f"  → 读取文件...")
             file_bytes, data_source = self.source.read_file(file_path)
             data_source = data_source or {}
+            # 检查文件大小，超过 100MB 则报错
+            MAX_FILE_SIZE = 100 * 1024 * 1024  # 100MB
+            file_size = len(file_bytes)
+            if file_size > MAX_FILE_SIZE:
+                file_size_mb = file_size / (1024 * 1024)
+                raise ValueError(f"文件大小过大: {file_size_mb:.2f}MB，超过100MB限制")
             # 转换为毫秒时间戳字符串
             timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
             data_source['date_processed'] = str(timestamp_ms)
@@ -469,13 +537,13 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
             bucket=source_config['bucket'],
             prefix=source_config.get('prefix', ''),
             region=source_config.get('region', 'us-east-1'),
-            pattern=source_config.get('pattern', '*'),
+            pattern=source_config.get('pattern', None),
             recursive=source_config.get('recursive', False)
         )
     elif source_config['type'] == 'local':
         source = LocalSource(
             directory=source_config['directory'],
-            pattern=source_config.get('pattern', '*'),
+            pattern=source_config.get('pattern', None),
             recursive=source_config.get('recursive', False)
         )
     elif source_config['type'] == 'ftp':
@@ -484,7 +552,7 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
             port=source_config['port'],
             username=source_config['username'],
             password=source_config['password'],
-            pattern=source_config.get('pattern', '*'),
+            pattern=source_config.get('pattern', None),
             recursive=source_config.get('recursive', False)
         )
     elif source_config['type'] == 'smb':
@@ -496,7 +564,7 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
             domain=source_config.get('domain', ''),
             port=source_config.get('port', 445),
             path=source_config.get('path', ''),
-            pattern=source_config.get('pattern', '*'),
+            pattern=source_config.get('pattern', None),
             recursive=source_config.get('recursive', False)
         )
     else:

xparse_client/pipeline/sources.py CHANGED Viewed

@@ -121,8 +121,6 @@ class S3Source(Source):
         if self.endpoint == 'https://textin-minio-api.ai.intsig.net':
             config = Config(signature_version='s3v4')
-        elif self.endpoint.endswith('aliyuncs.com'):
-            config = Config(signature_version='s3', s3={'addressing_style': 'virtual'})
         else:
             config = Config(signature_version='s3v4', s3={'addressing_style': 'virtual'})

{xparse_client-0.2.11.dist-info → xparse_client-0.2.19.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.2.11
+Version: 0.2.19
 Summary: 面向Agent和RAG的新一代文档处理 AI Infra
 License-Expression: MIT
 Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -11,7 +11,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: boto3
 Requires-Dist: pymilvus
-Requires-Dist: milvus-lite
+Requires-Dist: milvus-lite; sys_platform != "win32"
 Requires-Dist: requests
 Requires-Dist: pysmb
 Requires-Dist: qdrant-client

xparse_client-0.2.19.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+xparse_client/__init__.py,sha256=C2XLxkCoONl6_B1FmDhWRw84TqOL4pZF20br-K26SSY,1721
+xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
+xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
+xparse_client/pipeline/destinations.py,sha256=QKlNGcpXIqkZS3rlBlhLDoRqIWA21Jgn3GiGhhfE8Rc,20921
+xparse_client/pipeline/pipeline.py,sha256=Haaz0yBxdnVD3QW2CGJPfwhY7P1nF0uZ-7Oc3Vsqx2U,30200
+xparse_client/pipeline/sources.py,sha256=D-kLrSQ-qsFFFq7JC4sL3Y3Q3Q87Wcpv9R5K85YkDjE,22144
+xparse_client-0.2.19.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
+xparse_client-0.2.19.dist-info/METADATA,sha256=W9zMbFzrVOk20p-chC9zTq_heZ8P57xcTN1jrCXdZLs,28824
+xparse_client-0.2.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+xparse_client-0.2.19.dist-info/top_level.txt,sha256=W5PeQwOyfo_Od3d26-gcOtan7rHYk1q3SP1phYedat4,14
+xparse_client-0.2.19.dist-info/RECORD,,

{xparse_client-0.2.11.dist-info → xparse_client-0.2.19.dist-info}/top_level.txt RENAMED Viewed

@@ -1,2 +1 @@
-example
 xparse_client

example/run_pipeline.py DELETED Viewed

@@ -1,506 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-'''
-Pipeline 运行脚本
-快速启动和运行 Pipeline 的示例
-'''
-import json
-from datetime import datetime, timezone
-from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
-# ============================================================================
-# 常量配置
-# ============================================================================
-# API 请求头配置
-API_HEADERS = {
-    'x-ti-app-id': '4c0032d9e4d93b0ad674cac0d75256e7',
-    'x-ti-secret-code': '7104f599ad02b8468fc619f7605d2d8d'
-}
-# ============================================================================
-# 方式 1: 使用配置字典
-# ============================================================================
-def run_with_config():
-    """使用配置字典运行 pipeline"""
-    config = {
-        'source': {
-            'type': 's3',
-            'endpoint': 'https://textin-minio-api.ai.intsig.net',
-            'access_key': 'IEQspf8C7fVcgmp3AZWl',
-            'secret_key': 'kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
-            'bucket': 'textin-test',
-            'prefix': '',  # 留空处理所有文件，或指定如 'milvus/'
-            'region': 'us-east-1'
-        },
-        'destination': {
-            'type': 'milvus',
-            'db_path': './milvus_pipeline.db',
-            'collection_name': 'pipeline_collection',
-            'dimension': 1024
-        },
-        'api_base_url': 'https://api.textin.com/api/xparse',
-        'api_headers': API_HEADERS,
-        # Stages 配置
-        'stages': [
-            {
-                'type': 'parse',
-                'config': {
-                    'provider': 'textin'
-                }
-            },
-            {
-                'type': 'chunk',
-                'config': {
-                    'strategy': 'basic',              # 分块策略: 'basic' | 'by_title' | 'by_page'
-                    'include_orig_elements': False,   # 是否包含原始元素
-                    'new_after_n_chars': 512,         # 多少字符后创建新块
-                    'max_characters': 1024,           # 最大字符数
-                    'overlap': 0                      # 重叠字符数
-                }
-            },
-            {
-                'type': 'embed',
-                'config': {
-                    'provider': 'qwen',                    # 向量化供应商: 'qwen'
-                    'model_name': 'text-embedding-v3'      # 模型名称: 'text-embedding-v3' | 'text-embedding-v4'
-                }
-            }
-        ]
-    }
-    pipeline = create_pipeline_from_config(config)
-    pipeline.run()
-# ============================================================================
-# 方式 2: 手动创建组件
-# ============================================================================
-def run_with_manual_setup():
-    """手动创建 Source、Destination 和 Pipeline"""
-    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage, PipelineConfig, LocalDestination, QdrantDestination
-    # 创建 S3 数据源
-    # source = S3Source(
-    #     endpoint='https://textin-minio-api.ai.intsig.net',
-    #     access_key='IEQspf8C7fVcgmp3AZWl',
-    #     secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
-    #     bucket='textin-test',
-    #     prefix='',
-    #     region='us-east-1'
-    # )
-    # source = S3Source(
-    #     endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
-    #     access_key='LTAI5tBgsaVfkbh9rbPyuB17',
-    #     secret_key='JFIIaTGiXelv7DgBYNIBSStofF0S98',
-    #     bucket='textin',
-    #     prefix='',
-    #     region='cn-shanghai'
-    # )
-    # source=S3Source(
-    #     endpoint='https://S3.oss-cn-shanghai.aliyuncs.com',
-    #     access_key='LTAI5t6ZnqTra8oLmJEfvcr7',
-    #     secret_key='SEbz4oJ4KNJIOTMfphuVGOWmRpGGUG',
-    #     bucket='textin-test-aliyun',
-    #     prefix='',
-    #     region='cn-shanghai'
-    # )
-    # source = S3Source(
-    #     endpoint='https://cos.ap-shanghai.myqcloud.com',
-    #     access_key='',
-    #     secret_key='',
-    #     bucket='textin-1300705866',
-    #     prefix='',
-    #     region='ap-shanghai'
-    # )
-    # source = S3Source(
-    #     endpoint='https://tos-s3-cn-shanghai.volces.com',
-    #     access_key='AKLTMzNkZjk1OGM3MzBjNGQ1ZjhkMGQ4MThlNjBjYjZjYzA',
-    #     secret_key='TnpWaE0yRTVaamRqTmpSbU5EY3pObUZrTTJVNE5qUm1NR0ppWkRrMFlqVQ==',
-    #     bucket='textin',
-    #     prefix='',
-    #     region='cn-shanghai'
-    # )
-    # source = S3Source(
-    #     endpoint='https://obs.cn-east-3.myhuaweicloud.com',
-    #     access_key='',
-    #     secret_key='',
-    #     bucket='textin',
-    #     prefix='',
-    #     region='cn-east-3'
-    # )
-    # source = S3Source(
-    #     endpoint='https://s3.us-east-1.amazonaws.com',
-    #     access_key='AKIA6QUE3TVZADUWA4PO',
-    #     secret_key='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',
-    #     bucket='textin-test',
-    #     prefix='',
-    #     region='us-east-1'
-    # )
-    # source = S3Source(
-    #     endpoint='http://127.0.0.1:9000',
-    #     access_key='',
-    #     secret_key='',
-    #     bucket='textin',
-    #     prefix='',
-    #     region='us-east-1'
-    # )
-    # source = SmbSource(
-    #     host='internal-storage.intsig.net',
-    #     share_name='ke_wang',
-    #     username='ke_wang',
-    #     password='',
-    #     domain='INTSIG.COM'
-    # )
-    # source = FtpSource(
-    #     host='127.0.0.1',
-    #     port=21,
-    #     # recursive=True,
-    #     username='', # 用户名，按照实际填写
-    #     password=''  # 密码，按照实际填写
-    # )
-    source = LocalSource(
-        directory='/Users/ke_wang/Documents/doc',
-        pattern=['*.pdf'],
-        recursive=True,
-    )
-    # source=S3Source(
-    #     endpoint='https://obs.cn-north-4.myhuaweicloud.com',
-    #     access_key='HPUAFT3D1Q6O6UUN1RWQ',
-    #     secret_key='4zIk8x37nZiDS9P585BTFCWsOSo5G7ok1yRWtEA1',
-    #     bucket='textin-test-ywj',
-    #     prefix='',
-    #     region='cn-north-4'
-    # )# 华为云
-    # 创建 Milvus 目的地
-    # destination = MilvusDestination(
-    #     db_path='./milvus_pipeline1.db',
-    #     collection_name='pipeline_collection',
-    #     dimension=1024
-    # )
-    destination = LocalDestination(
-        output_dir='./result'
-    )
-    # destination = MilvusDestination(
-    #     db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
-    #     collection_name='textin_test_3_copy', # 数据库collection名称
-    #     dimension=1024,  # 向量维度，需与 embed API 返回一致
-    #     api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46'  # Zilliz Cloud API Key
-    # )
-    # destination = S3Destination(
-    #     endpoint='https://cos.ap-shanghai.myqcloud.com',
-    #     access_key='',
-    #     secret_key='',
-    #     bucket='textin-1300705866',
-    #     prefix='result',
-    #     region='ap-shanghai'
-    # )
-    # destination = QdrantDestination(
-    #     url='https://1325db22-7dd8-4fc9-930b-f969d4963b3d.us-east-1-1.aws.cloud.qdrant.io:6333',
-    #     collection_name='textin1',
-    #     dimension=1024,
-    #     api_key='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.TGnFB1pAD7c7IqSOvTpgCPpHXSnnoKhWEQ5pQ8DrBnI',
-    # )
-    # 使用新的 stages 格式创建配置
-    stages = [
-        Stage(
-            type='parse',
-            config=ParseConfig(provider='textin', page_ranges='3')
-        ),
-        Stage(
-            type='chunk',
-            config=ChunkConfig(
-                strategy='by_title',           # 按标题分块
-                include_orig_elements=False,
-                new_after_n_chars=512,
-                max_characters=1024,
-                overlap=50                     # 块之间重叠 50 字符
-            )
-        ),
-        Stage(
-            type='embed',
-            config=EmbedConfig(
-                provider='qwen',
-                model_name='text-embedding-v3'
-            )
-        )
-    ]
-    # 配置中间结果保存
-    intermediate_results_destination = LocalDestination(
-        output_dir='./intermediate_results'
-    )
-    pipeline_config = PipelineConfig(
-        include_intermediate_results=True,
-        intermediate_results_destination=intermediate_results_destination
-    )
-    # 创建 Pipeline
-    pipeline = Pipeline(
-        source=source,
-        destination=destination,
-        api_base_url='https://textin-api-go-pre.ai.intsig.net/api/xparse',
-        api_headers=API_HEADERS,
-        stages=stages,
-        pipeline_config=pipeline_config
-    )
-    # 运行
-    # config = pipeline.get_config()
-    pipeline.run()
-# ============================================================================
-# 方式 3: 本地测试（本地文件 -> 本地输出）
-# ============================================================================
-def run_local_test():
-    """使用本地文件进行测试"""
-    config = {
-        'source': {
-            'type': 'local',
-            'directory': '/Users/ke_wang/Documents/doc',
-            'pattern': '*.pdf'
-        },
-        'destination': {
-            'type': 's3',
-            'endpoint': 'https://textin-minio-api.ai.intsig.net',
-            'access_key': '',
-            'secret_key': '',
-            'bucket': 'textin-test',
-            'prefix': '',
-            'region': 'us-east-1'
-        },
-        'api_base_url': 'https://api.textin.com/api/xparse',
-        'api_headers': API_HEADERS,
-        # Stages 配置
-        'stages': [
-            {
-                'type': 'parse',
-                'config': {
-                    'provider': 'textin'
-                }
-            },
-            {
-                'type': 'embed',
-                'config': {
-                    'provider': 'qwen',
-                    'model_name': 'text-embedding-v3'
-                }
-            }
-        ]
-    }
-    pipeline = create_pipeline_from_config(config)
-    pipeline.run()
-# ============================================================================
-# 方式 4: 处理单个文件
-# ============================================================================
-def run_single_file():
-    """只处理单个文件"""
-    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
-    # 创建 pipeline
-    source = LocalSource(directory='/Users/ke_wang/Documents/doc', pattern='*.pdf')
-    destination = LocalDestination(output_dir='./output')
-    # 使用新的 stages 格式创建配置
-    stages = [
-        Stage(
-            type='parse',
-            config=ParseConfig(provider='textin')
-        ),
-        Stage(
-            type='chunk',
-            config=ChunkConfig(
-                strategy='by_page',            # 按页面分块
-                max_characters=2048,           # 增大块大小
-                overlap=100
-            )
-        ),
-        Stage(
-            type='embed',
-            config=EmbedConfig(
-                provider='qwen',
-                model_name='text-embedding-v4'  # 使用更高精度的模型
-            )
-        )
-    ]
-    pipeline = Pipeline(
-        source=source,
-        destination=destination,
-        api_base_url='https://api.textin.com/api/xparse',
-        api_headers=API_HEADERS,
-        stages=stages
-    )
-    # 只处理指定文件
-    file_path = '4e3250f00210431fb29ca0c808.pdf'  # 相对于 source directory 的路径
-    success = pipeline.process_file(file_path)
-    if success:
-        print(f"\n✅ 文件 {file_path} 处理成功！")
-    else:
-        print(f"\n❌ 文件 {file_path} 处理失败！")
-# ============================================================================
-# 方式 5: 自定义处理流程
-# ============================================================================
-def run_custom_flow():
-    """自定义处理流程，手动控制文件处理"""
-    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
-    # 创建组件
-    source = S3Source(
-        endpoint='https://textin-minio-api.ai.intsig.net',
-        access_key='',
-        secret_key='',
-        bucket='textin-test',
-        prefix='',
-        region='us-east-1',
-        pattern='*.pdf'
-    )
-    destination = MilvusDestination(
-        db_path='./milvus_custom.db',
-        collection_name='custom_collection',
-        dimension=1024
-    )
-    # 使用新的 stages 格式创建配置
-    stages = [
-        Stage(
-            type='parse',
-            config=ParseConfig(provider='textin')
-        ),
-        Stage(
-            type='chunk',
-            config=ChunkConfig(
-                strategy='by_title',
-                include_orig_elements=True,
-                max_characters=1536,
-                overlap=80
-            )
-        ),
-        Stage(
-            type='embed',
-            config=EmbedConfig(
-                provider='qwen',
-                model_name='text-embedding-v4'
-            )
-        )
-    ]
-    pipeline = Pipeline(
-        source=source,
-        destination=destination,
-        api_base_url='https://api.textin.com/api/xparse',
-        api_headers=API_HEADERS,
-        stages=stages
-    )
-    # 手动控制文件处理
-    files = source.list_files()
-    for file_path in files[:2]:  # 只处理前2个文件
-        print(f"\n处理: {file_path}")
-        file_bytes, data_source = source.read_file(file_path)
-        data_source['date_processed'] = datetime.now(timezone.utc).timestamp()
-        # 使用 pipeline 接口处理
-        result = pipeline.process_with_pipeline(file_bytes, file_path, data_source)
-        if result:
-            embedded, stats = result
-            print(f"  - 原始元素: {stats.original_elements}")
-            print(f"  - 分块后: {stats.chunked_elements}")
-            print(f"  - 向量化: {stats.embedded_elements}")
-            # 写入
-            metadata = {
-                'file_name': file_path,
-                'data_source': data_source,
-                'stats': {
-                    'original_elements': stats.original_elements,
-                    'chunked_elements': stats.chunked_elements,
-                    'embedded_elements': stats.embedded_elements
-                }
-            }
-            destination.write(embedded, metadata)
-            print(f"✓ 完成: {file_path}")
-        else:
-            print(f"✗ 失败: {file_path}")
-# ============================================================================
-# 主函数
-# ============================================================================
-def main():
-    """主函数 - 选择运行方式"""
-    print("=" * 60)
-    print("Pipeline 运行脚本")
-    print("=" * 60)
-    print("\n请选择运行方式：")
-    print("1. 使用配置字典 (S3 -> Milvus) [基础配置]")
-    print("2. 手动创建组件 (S3 -> Milvus) [按标题分块 + 自定义配置]")
-    print("3. 本地测试 (本地文件 -> 本地输出) [基础配置]")
-    print("4. 处理单个文件 [按页面分块 + V4模型]")
-    print("5. 自定义处理流程 [手动控制 + 统计信息]")
-    print()
-    try:
-        choice = input("请输入选项 (1-5) [默认: 1]: ").strip() or '1'
-        if choice == '1':
-            print("\n使用配置字典运行...")
-            run_with_config()
-        elif choice == '2':
-            print("\n手动创建组件运行...")
-            run_with_manual_setup()
-        elif choice == '3':
-            print("\n本地测试模式...")
-            run_local_test()
-        elif choice == '4':
-            print("\n处理单个文件...")
-            run_single_file()
-        elif choice == '5':
-            print("\n自定义处理流程...")
-            run_custom_flow()
-        else:
-            print("无效的选项，使用默认方式运行...")
-            run_with_config()
-    except KeyboardInterrupt:
-        print("\n\n用户中断执行")
-    except Exception as e:
-        print(f"\n程序异常: {str(e)}")
-        import traceback
-        traceback.print_exc()
-if __name__ == '__main__':
-    main()

example/run_pipeline_test.py DELETED Viewed

@@ -1,458 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-'''
-Pipeline 运行脚本
-快速启动和运行 Pipeline 的示例
-'''
-from datetime import datetime, timezone
-from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
-# ============================================================================
-# 常量配置
-# ============================================================================
-# API 请求头配置
-API_HEADERS = {
-    'x-ti-app-id': '4c0032d9e4d93b0ad674cac0d75256e7',
-    'x-ti-secret-code': '7104f599ad02b8468fc619f7605d2d8d'
-}
-# ============================================================================
-# 方式 1: 使用配置字典
-# ============================================================================
-def run_with_config():
-    """使用配置字典运行 pipeline"""
-    config = {
-        'source': {
-            'type': 's3',
-            'endpoint': 'https://textin-minio-api.ai.intsig.net',
-            'access_key': 'IEQspf8C7fVcgmp3AZWl',
-            'secret_key': 'kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
-            'bucket': 'textin-test',
-            'prefix': '',  # 留空处理所有文件，或指定如 'milvus/'
-            'region': 'us-east-1'
-        },
-        'destination': {
-            'type': 'milvus',
-            'db_path': './milvus_pipeline.db',
-            'collection_name': 'pipeline_collection',
-            'dimension': 1024
-        },
-        'api_base_url': 'https://api.textin.com/api/xparse',
-        'api_headers': API_HEADERS,
-        # Stages 配置
-        'stages': [
-            {
-                'type': 'parse',
-                'config': {
-                    'provider': 'textin'
-                }
-            },
-            {
-                'type': 'chunk',
-                'config': {
-                    'strategy': 'basic',              # 分块策略: 'basic' | 'by_title' | 'by_page'
-                    'include_orig_elements': False,   # 是否包含原始元素
-                    'new_after_n_chars': 512,         # 多少字符后创建新块
-                    'max_characters': 1024,           # 最大字符数
-                    'overlap': 0                      # 重叠字符数
-                }
-            },
-            {
-                'type': 'embed',
-                'config': {
-                    'provider': 'qwen',                    # 向量化供应商: 'qwen'
-                    'model_name': 'text-embedding-v3'      # 模型名称: 'text-embedding-v3' | 'text-embedding-v4'
-                }
-            }
-        ]
-    }
-    pipeline = create_pipeline_from_config(config)
-    pipeline.run()
-# ============================================================================
-# 方式 2: 手动创建组件
-# ============================================================================
-def run_with_manual_setup():
-    """手动创建 Source、Destination 和 Pipeline"""
-    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
-    # 创建 S3 数据源
-    source = S3Source(
-        endpoint='https://textin-minio-api.ai.intsig.net',
-        access_key='IEQspf8C7fVcgmp3AZWl',
-        secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
-        bucket='textin-test',
-        prefix='',
-        region='us-east-1'
-    )
-    source = S3Source(
-        endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
-        access_key='LTAI5tBgsaVfkbh9rbPyuB17',
-        secret_key='JFIIaTGiXelv7DgBYNIBSStofF0S98',
-        bucket='textin',
-        prefix='',
-        region='cn-shanghai',
-        pattern='*.png'
-    )
-    # source = S3Source(
-    #     endpoint='https://cos.ap-shanghai.myqcloud.com',
-    #     access_key='AKIDRnwsa4JLAl8GBspcAVcU9anlUzHLAmAJ',
-    #     secret_key='we7KJ4buxlLhogJm0zkFUUKxWu3yeDZi',
-    #     bucket='textin-1300705866',
-    #     prefix='',
-    #     region='ap-shanghai'
-    # )
-    # source = S3Source(
-    #     endpoint='https://tos-s3-cn-shanghai.volces.com',
-    #     access_key='AKLTMzNkZjk1OGM3MzBjNGQ1ZjhkMGQ4MThlNjBjYjZjYzA',
-    #     secret_key='TnpWaE0yRTVaamRqTmpSbU5EY3pObUZrTTJVNE5qUm1NR0ppWkRrMFlqVQ==',
-    #     bucket='textin',
-    #     prefix='',
-    #     region='cn-shanghai'
-    # )
-    # source = S3Source(
-    #     endpoint='https://obs.cn-east-3.myhuaweicloud.com',
-    #     access_key='HPUAL646UCQ1YAT7JMWY',
-    #     secret_key='z9cm95UXCw0R4J3AEig9siqGpZNbwDYz8PVoBGDI',
-    #     bucket='textin',
-    #     prefix='',
-    #     region='cn-east-3'
-    # )
-    # source = S3Source(
-    #     endpoint='https://s3.us-east-1.amazonaws.com',
-    #     access_key='AKIA6QUE3TVZADUWA4PO',
-    #     secret_key='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',
-    #     bucket='textin-xparse',
-    #     prefix='',
-    #     region='us-east-1'
-    # )
-    # source = S3Source(
-    #     endpoint='http://127.0.0.1:9000',
-    #     access_key='ldvOkKVZrsHW8ruqhwVG',
-    #     secret_key='sH665Q2DKgQxyLOpObXTb088SD2hvP0Rtg1dGTiT',
-    #     bucket='textin',
-    #     prefix='',
-    #     region='us-east-1'
-    # )
-    # source = SmbSource(
-    #     host='internal-storage.intsig.net',
-    #     share_name='ke_wang',
-    #     username='ke_wang',
-    #     password='Hhxxblj!4',
-    #     domain='INTSIG.COM'
-    # )
-    # source = FtpSource(
-    #     host='127.0.0.1',
-    #     port=21,
-    #     username='', # 用户名，按照实际填写
-    #     password=''  # 密码，按照实际填写
-    # )
-    # source = LocalSource(
-    #     directory='/Users/ke_wang/Documents/doc',
-    #     pattern='*.pdf'  # 支持通配符: *.pdf, *.docx, **/*.txt
-    # )
-    # 创建 Milvus 目的地
-    destination = MilvusDestination(
-        db_path='./milvus_pipeline1.db',
-        collection_name='pipeline_collection',
-        dimension=1024
-    )
-    # destination = S3Destination(
-    #     endpoint='https://cos.ap-shanghai.myqcloud.com',
-    #     access_key='AKIDRnwsa4JLAl8GBspcAVcU9anlUzHLAmAJ',
-    #     secret_key='we7KJ4buxlLhogJm0zkFUUKxWu3yeDZi',
-    #     bucket='textin-1300705866',
-    #     prefix='result',
-    #     region='ap-shanghai'
-    # )
-    # 使用新的 stages 格式创建配置
-    stages = [
-        Stage(
-            type='parse',
-            config=ParseConfig(provider='textin')
-        ),
-        Stage(
-            type='chunk',
-            config=ChunkConfig(
-                strategy='by_title',           # 按标题分块
-                include_orig_elements=False,
-                new_after_n_chars=512,
-                max_characters=1024,
-                overlap=50                     # 块之间重叠 50 字符
-            )
-        )
-        # 如果需要 embed，取消下面的注释
-        # Stage(
-        #     type='embed',
-        #     config=EmbedConfig(
-        #         provider='qwen',
-        #         model_name='text-embedding-v3'
-        #     )
-        # )
-    ]
-    # 创建 Pipeline
-    pipeline = Pipeline(
-        source=source,
-        destination=destination,
-        api_base_url='https://textin-api-go-pre.ai.intsig.net/api/xparse',
-        api_headers=API_HEADERS,
-        stages=stages
-    )
-    # 运行
-    pipeline.run()
-# ============================================================================
-# 方式 3: 本地测试（本地文件 -> 本地输出）
-# ============================================================================
-def run_local_test():
-    """使用本地文件进行测试"""
-    config = {
-        'source': {
-            'type': 'local',
-            'directory': '/Users/ke_wang/Documents/doc',
-            'pattern': '*.pdf'
-        },
-        'destination': {
-            'type': 's3',
-            'endpoint': 'https://textin-minio-api.ai.intsig.net',
-            'access_key': 'IEQspf8C7fVcgmp3AZWl',
-            'secret_key': 'kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
-            'bucket': 'textin-test',
-            'prefix': '',
-            'region': 'us-east-1'
-        },
-        'api_base_url': 'https://api.textin.com/api/xparse',
-        'api_headers': API_HEADERS,
-        # Stages 配置
-        'stages': [
-            {
-                'type': 'parse',
-                'config': {
-                    'provider': 'textin'
-                }
-            },
-            {
-                'type': 'embed',
-                'config': {
-                    'provider': 'qwen',
-                    'model_name': 'text-embedding-v3'
-                }
-            }
-        ]
-    }
-    pipeline = create_pipeline_from_config(config)
-    pipeline.run()
-# ============================================================================
-# 方式 4: 处理单个文件
-# ============================================================================
-def run_single_file():
-    """只处理单个文件"""
-    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
-    # 创建 pipeline
-    source = LocalSource(directory='/Users/ke_wang/Documents/doc', pattern='*.pdf')
-    destination = LocalDestination(output_dir='./output')
-    # 使用新的 stages 格式创建配置
-    stages = [
-        Stage(
-            type='parse',
-            config=ParseConfig(provider='textin')
-        ),
-        Stage(
-            type='chunk',
-            config=ChunkConfig(
-                strategy='by_page',            # 按页面分块
-                max_characters=2048,           # 增大块大小
-                overlap=100
-            )
-        ),
-        Stage(
-            type='embed',
-            config=EmbedConfig(
-                provider='qwen',
-                model_name='text-embedding-v4'  # 使用更高精度的模型
-            )
-        )
-    ]
-    pipeline = Pipeline(
-        source=source,
-        destination=destination,
-        api_base_url='https://api.textin.com/api/xparse',
-        api_headers=API_HEADERS,
-        stages=stages
-    )
-    # 只处理指定文件
-    file_path = '4e3250f00210431fb29ca0c808.pdf'  # 相对于 source directory 的路径
-    success = pipeline.process_file(file_path)
-    if success:
-        print(f"\n✅ 文件 {file_path} 处理成功！")
-    else:
-        print(f"\n❌ 文件 {file_path} 处理失败！")
-# ============================================================================
-# 方式 5: 自定义处理流程
-# ============================================================================
-def run_custom_flow():
-    """自定义处理流程，手动控制文件处理"""
-    from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
-    # 创建组件
-    source = S3Source(
-        endpoint='https://textin-minio-api.ai.intsig.net',
-        access_key='IEQspf8C7fVcgmp3AZWl',
-        secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
-        bucket='textin-test',
-        prefix='',
-        region='us-east-1',
-        pattern='*.pdf'
-    )
-    destination = MilvusDestination(
-        db_path='./milvus_custom.db',
-        collection_name='custom_collection',
-        dimension=1024
-    )
-    # 使用新的 stages 格式创建配置
-    stages = [
-        Stage(
-            type='parse',
-            config=ParseConfig(provider='textin')
-        ),
-        Stage(
-            type='chunk',
-            config=ChunkConfig(
-                strategy='by_title',
-                include_orig_elements=True,
-                max_characters=1536,
-                overlap=80
-            )
-        ),
-        Stage(
-            type='embed',
-            config=EmbedConfig(
-                provider='qwen',
-                model_name='text-embedding-v4'
-            )
-        )
-    ]
-    pipeline = Pipeline(
-        source=source,
-        destination=destination,
-        api_base_url='https://api.textin.com/api/xparse',
-        api_headers=API_HEADERS,
-        stages=stages
-    )
-    # 手动控制文件处理
-    files = source.list_files()
-    for file_path in files[:2]:  # 只处理前2个文件
-        print(f"\n处理: {file_path}")
-        file_bytes, data_source = source.read_file(file_path)
-        data_source['date_processed'] = datetime.now(timezone.utc).timestamp()
-        # 使用 pipeline 接口处理
-        result = pipeline.process_with_pipeline(file_bytes, file_path, data_source)
-        if result:
-            embedded, stats = result
-            print(f"  - 原始元素: {stats.original_elements}")
-            print(f"  - 分块后: {stats.chunked_elements}")
-            print(f"  - 向量化: {stats.embedded_elements}")
-            # 写入
-            metadata = {
-                'file_name': file_path,
-                'data_source': data_source,
-                'stats': {
-                    'original_elements': stats.original_elements,
-                    'chunked_elements': stats.chunked_elements,
-                    'embedded_elements': stats.embedded_elements
-                }
-            }
-            destination.write(embedded, metadata)
-            print(f"✓ 完成: {file_path}")
-        else:
-            print(f"✗ 失败: {file_path}")
-# ============================================================================
-# 主函数
-# ============================================================================
-def main():
-    """主函数 - 选择运行方式"""
-    print("=" * 60)
-    print("Pipeline 运行脚本")
-    print("=" * 60)
-    print("\n请选择运行方式：")
-    print("1. 使用配置字典 (S3 -> Milvus) [基础配置]")
-    print("2. 手动创建组件 (S3 -> Milvus) [按标题分块 + 自定义配置]")
-    print("3. 本地测试 (本地文件 -> 本地输出) [基础配置]")
-    print("4. 处理单个文件 [按页面分块 + V4模型]")
-    print("5. 自定义处理流程 [手动控制 + 统计信息]")
-    print()
-    try:
-        choice = input("请输入选项 (1-5) [默认: 1]: ").strip() or '1'
-        if choice == '1':
-            print("\n使用配置字典运行...")
-            run_with_config()
-        elif choice == '2':
-            print("\n手动创建组件运行...")
-            run_with_manual_setup()
-        elif choice == '3':
-            print("\n本地测试模式...")
-            run_local_test()
-        elif choice == '4':
-            print("\n处理单个文件...")
-            run_single_file()
-        elif choice == '5':
-            print("\n自定义处理流程...")
-            run_custom_flow()
-        else:
-            print("无效的选项，使用默认方式运行...")
-            run_with_config()
-    except KeyboardInterrupt:
-        print("\n\n用户中断执行")
-    except Exception as e:
-        print(f"\n程序异常: {str(e)}")
-        import traceback
-        traceback.print_exc()
-if __name__ == '__main__':
-    main()

xparse_client-0.2.11.dist-info/RECORD DELETED Viewed

@@ -1,13 +0,0 @@
-example/run_pipeline.py,sha256=xM79ebGJVordKH_SlNc8qWb1UNR4XXbeY3xlAYuThXU,16342
-example/run_pipeline_test.py,sha256=pxsNiq_LmP6M4R7tTuja0u-Lu7fW-wIBU1uBf0-agQI,14845
-xparse_client/__init__.py,sha256=C2XLxkCoONl6_B1FmDhWRw84TqOL4pZF20br-K26SSY,1721
-xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
-xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
-xparse_client/pipeline/destinations.py,sha256=9UyZ8Ygjoe4yAq6-VZNZBoNYRbb3mahify3c1AdOHMY,20775
-xparse_client/pipeline/pipeline.py,sha256=ZspagUjiL5wnzGJq6A7riOU8qGXJMtg1fqPm9H09mkk,27272
-xparse_client/pipeline/sources.py,sha256=pzJ5FjP-kZi-6Cphhm9rOPXETmHw5Qpf7EaxrQPgSxs,22285
-xparse_client-0.2.11.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
-xparse_client-0.2.11.dist-info/METADATA,sha256=8KMalNVqmo54zVvZlpGib0CaY77ORfVJxrj1okqh0rw,28799
-xparse_client-0.2.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-xparse_client-0.2.11.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
-xparse_client-0.2.11.dist-info/RECORD,,

{xparse_client-0.2.11.dist-info → xparse_client-0.2.19.dist-info}/WHEEL RENAMED Viewed

File without changes

{xparse_client-0.2.11.dist-info → xparse_client-0.2.19.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

xparse-client 0.2.11__py3-none-any.whl → 0.2.19__py3-none-any.whl

xparse-client 0.2.11py3-none-any.whl → 0.2.19py3-none-any.whl