xparse-client 0.2.4__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xparse_client-0.2.4 → xparse_client-0.2.6}/PKG-INFO +16 -10
- {xparse_client-0.2.4 → xparse_client-0.2.6}/README.md +15 -9
- {xparse_client-0.2.4 → xparse_client-0.2.6}/example/run_pipeline.py +18 -17
- {xparse_client-0.2.4 → xparse_client-0.2.6}/pyproject.toml +1 -1
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client/pipeline/config.py +1 -0
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client/pipeline/destinations.py +73 -18
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client/pipeline/pipeline.py +67 -33
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client/pipeline/sources.py +206 -23
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client.egg-info/PKG-INFO +16 -10
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client.egg-info/top_level.txt +0 -1
- {xparse_client-0.2.4 → xparse_client-0.2.6}/LICENSE +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.6}/example/run_pipeline_test.py +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.6}/setup.cfg +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client/__init__.py +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client/pipeline/__init__.py +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client.egg-info/SOURCES.txt +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client.egg-info/dependency_links.txt +0 -0
- {xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client.egg-info/requires.txt +0 -0
{xparse_client-0.2.4 → xparse_client-0.2.6}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.2.4
+Version: 0.2.6
 Summary: Next-generation document-processing AI Infra for Agent and RAG
 License-Expression: MIT
 Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -323,28 +323,34 @@ source = SmbSource(
 )
 ```
 
->
+> Note 1: every Source supports a `pattern` parameter that filters the files to process with shell wildcards (`*.pdf`, `**/*.txt`, etc.); the default is `*`, which processes all files.
+
+> Note 2: every Source supports a `recursive` parameter that controls whether directories are traversed recursively; the default is `False`.
 
 ### Destination configuration
 
 #### Local Milvus vector store
 
+The collection must contain at least the four fields `element_id`, `text`, `embeddings`, and `record_id`.
+
 ```python
 destination = MilvusDestination(
-    db_path
-    collection_name
-    dimension
+    db_path='./milvus_pipeline.db',   # local database file
+    collection_name='my_collection',  # collection name
+    dimension=1024                    # vector dimension; must match what the embed API returns
 )
 ```
 
 #### Zilliz vector store
 
+The collection must contain at least the four fields `element_id`, `text`, `embeddings`, and `record_id`.
+
 ```python
 destination = MilvusDestination(
-    db_path
-    collection_name
-    dimension
-    api_key
+    db_path='https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn',  # Zilliz endpoint
+    collection_name='my_collection',  # collection name
+    dimension=1024,                   # vector dimension; must match what the embed API returns
+    api_key='your-api-key'            # Zilliz Cloud API Key
 )
 ```
@@ -354,7 +360,7 @@ destination = MilvusDestination(
 
 ```python
 destination = LocalDestination(
-    output_dir
+    output_dir='./output'
 )
 ```
 
{xparse_client-0.2.4 → xparse_client-0.2.6}/README.md

@@ -306,28 +306,34 @@ source = SmbSource(
 )
 ```
 
->
+> Note 1: every Source supports a `pattern` parameter that filters the files to process with shell wildcards (`*.pdf`, `**/*.txt`, etc.); the default is `*`, which processes all files.
+
+> Note 2: every Source supports a `recursive` parameter that controls whether directories are traversed recursively; the default is `False`.
 
 ### Destination configuration
 
 #### Local Milvus vector store
 
+The collection must contain at least the four fields `element_id`, `text`, `embeddings`, and `record_id`.
+
 ```python
 destination = MilvusDestination(
-    db_path
-    collection_name
-    dimension
+    db_path='./milvus_pipeline.db',   # local database file
+    collection_name='my_collection',  # collection name
+    dimension=1024                    # vector dimension; must match what the embed API returns
 )
 ```
 
 #### Zilliz vector store
 
+The collection must contain at least the four fields `element_id`, `text`, `embeddings`, and `record_id`.
+
 ```python
 destination = MilvusDestination(
-    db_path
-    collection_name
-    dimension
-    api_key
+    db_path='https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn',  # Zilliz endpoint
+    collection_name='my_collection',  # collection name
+    dimension=1024,                   # vector dimension; must match what the embed API returns
+    api_key='your-api-key'            # Zilliz Cloud API Key
 )
 ```
@@ -337,7 +343,7 @@ destination = MilvusDestination(
 
 ```python
 destination = LocalDestination(
-    output_dir
+    output_dir='./output'
 )
 ```
 
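Taken together, the README changes above describe the 0.2.6 source/destination surface. A minimal wiring sketch of those options, assuming the import paths implied by the package layout in the file list (the directory and collection values are invented):

```python
from xparse_client.pipeline.sources import LocalSource
from xparse_client.pipeline.destinations import MilvusDestination

# pick up every PDF under ./docs, subdirectories included (Note 1 and Note 2 above)
source = LocalSource(
    directory='./docs',  # hypothetical directory
    pattern='*.pdf',     # shell wildcard filter
    recursive=True       # new in 0.2.6; defaults to False
)

destination = MilvusDestination(
    db_path='./milvus_pipeline.db',
    collection_name='my_collection',
    dimension=1024
)

for relative_path in source.list_files():
    print(relative_path)
```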
{xparse_client-0.2.4 → xparse_client-0.2.6}/example/run_pipeline.py

@@ -96,8 +96,8 @@ def run_with_manual_setup():
     # )
     # source = S3Source(
     #     endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
-    #     access_key='',
-    #     secret_key='',
+    #     access_key='LTAI5tBgsaVfkbh9rbPyuB17',
+    #     secret_key='JFIIaTGiXelv7DgBYNIBSStofF0S98',
     #     bucket='textin',
     #     prefix='',
     #     region='cn-shanghai',
@@ -113,8 +113,8 @@ def run_with_manual_setup():
     # )
     # source = S3Source(
     #     endpoint='https://tos-s3-cn-shanghai.volces.com',
-    #     access_key='',
-    #     secret_key='',
+    #     access_key='AKLTMzNkZjk1OGM3MzBjNGQ1ZjhkMGQ4MThlNjBjYjZjYzA',
+    #     secret_key='TnpWaE0yRTVaamRqTmpSbU5EY3pObUZrTTJVNE5qUm1NR0ppWkRrMFlqVQ==',
     #     bucket='textin',
     #     prefix='',
     #     region='cn-shanghai'
@@ -127,14 +127,14 @@ def run_with_manual_setup():
     #     prefix='',
     #     region='cn-east-3'
     # )
-
-
-
-
-
-
-
-
+    source = S3Source(
+        endpoint='https://s3.us-east-1.amazonaws.com',
+        access_key='AKIA6QUE3TVZADUWA4PO',
+        secret_key='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',
+        bucket='textin-xparse',
+        prefix='',
+        region='us-east-1'
+    )
     # source = S3Source(
     #     endpoint='http://127.0.0.1:9000',
     #     access_key='',
@@ -153,13 +153,14 @@ def run_with_manual_setup():
     # source = FtpSource(
     #     host='127.0.0.1',
     #     port=21,
+    #     # recursive=True,
     #     username='',  # username; fill in the real value
     #     password=''   # password; fill in the real value
     # )
-    source = LocalSource(
-
-
-    )
+    # source = LocalSource(
+    #     directory='/Users/ke_wang/Documents/doc',
+    #     pattern='*.pdf'  # wildcards supported: *.pdf, *.docx, **/*.txt
+    # )
 
     # create the Milvus destination
     # destination = MilvusDestination(
@@ -174,7 +175,7 @@ def run_with_manual_setup():
 
     destination = MilvusDestination(
         db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn',  # Zilliz endpoint
-        collection_name='
+        collection_name='textin_test_3',  # collection name
         dimension=1024,  # vector dimension; must match what the embed API returns
         api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46'  # Zilliz Cloud API Key
     )
{xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client/pipeline/destinations.py

@@ -18,6 +18,41 @@ from pymilvus import MilvusClient
 logger = logging.getLogger(__name__)
 
 
+def _flatten_dict(data: Dict[str, Any], prefix: str = '', fixed_fields: set = None) -> Dict[str, Any]:
+    """Recursively flatten a nested dict.
+
+    Args:
+        data: the dict to flatten
+        prefix: prefix for the generated keys
+        fixed_fields: set of field names to exclude
+
+    Returns:
+        the flattened dict
+    """
+    if fixed_fields is None:
+        fixed_fields = set()
+
+    result = {}
+    for key, value in data.items():
+        flat_key = f'{prefix}_{key}' if prefix else key
+
+        if flat_key in fixed_fields:
+            continue
+
+        if isinstance(value, dict):
+            # recurse into nested dicts
+            nested = _flatten_dict(value, flat_key, fixed_fields)
+            result.update(nested)
+        elif isinstance(value, list):
+            # serialize lists to JSON strings
+            result[flat_key] = json.dumps(value, ensure_ascii=False)
+        else:
+            # keep other types as-is
+            result[flat_key] = value
+
+    return result
+
+
 class Destination(ABC):
     """Abstract base class for data destinations"""
 
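To make the new helper concrete, here is what it produces for a nested `data_source`-style dict (values invented for illustration):

```python
src = {
    'url': 's3://textin/a.pdf',
    'record_locator': {'protocol': 's3', 'server': 'aws'},
    'tags': ['a', 'b'],
}
_flatten_dict(src, 'data_source')
# => {'data_source_url': 's3://textin/a.pdf',
#     'data_source_record_locator_protocol': 's3',
#     'data_source_record_locator_server': 'aws',
#     'data_source_tags': '["a", "b"]'}   # lists become JSON strings
```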
@@ -54,8 +89,7 @@ class MilvusDestination(Destination):
         schema.add_field(field_name="element_id", datatype=DataType.VARCHAR, max_length=128, is_primary=True)
         schema.add_field(field_name="embeddings", datatype=DataType.FLOAT_VECTOR, dim=dimension)
         schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65535)
-        schema.add_field(field_name="record_id", datatype=DataType.VARCHAR, max_length=
-        schema.add_field(field_name="metadata", datatype=DataType.JSON)
+        schema.add_field(field_name="record_id", datatype=DataType.VARCHAR, max_length=200)
 
         index_params = self.client.prepare_index_params()
         index_params.add_index(
@@ -77,6 +111,32 @@ class MilvusDestination(Destination):
 
     def write(self, data: List[Dict[str, Any]], metadata: Dict[str, Any]) -> bool:
         try:
+            # if metadata carries a record_id, first delete existing records with the same record_id
+            record_id = metadata.get('record_id')
+            if record_id:
+                try:
+                    # delete every record with the same record_id;
+                    # MilvusClient.delete returns the number of deleted records (int or dict)
+                    result = self.client.delete(
+                        collection_name=self.collection_name,
+                        filter=f'record_id == "{record_id}"'
+                    )
+                    # the return value may be a number or a dict
+                    deleted_count = result if isinstance(result, int) else result.get('delete_count', 0) if isinstance(result, dict) else 0
+                    if deleted_count > 0:
+                        print(f"  ✓ deleted existing records: record_id={record_id}, {deleted_count} rows")
+                        logger.info(f"deleted existing Milvus records: record_id={record_id}, {deleted_count} rows")
+                    else:
+                        print(f"  → no existing records found: record_id={record_id}")
+                except Exception as e:
+                    print(f"  ! failed to delete existing records: {str(e)}")
+                    logger.warning(f"failed to delete existing Milvus records: record_id={record_id}, {str(e)}")
+                    # keep writing; do not abort because the delete failed
+            else:
+                print(f"  → no record_id")
+                logger.warning(f"no record_id")
+                return
+
             insert_data = []
             for item in data:
                 # fetch the element-level metadata
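The effect of this block is to make `write` idempotent per `record_id`: re-ingesting a file first drops the rows a previous run inserted for the same record. The filter is a plain Milvus boolean expression; a standalone equivalent (collection name and record value invented):

```python
from pymilvus import MilvusClient

client = MilvusClient('./milvus_pipeline.db')
# removes every row whose record_id equals "doc-42" before a fresh insert
client.delete(collection_name='my_collection', filter='record_id == "doc-42"')
```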
@@ -90,8 +150,7 @@ class MilvusDestination(Destination):
                     'embeddings': item['embeddings'],
                     'text': item.get('text', ''),
                     'element_id': element_id,
-                    'record_id':
-                    'created_at': datetime.now().isoformat()
+                    'record_id': record_id
                 }
 
                 # merge file-level and element-level metadata
@@ -103,17 +162,13 @@ class MilvusDestination(Destination):
                 fixed_fields = {'embeddings', 'text', 'element_id', 'record_id', 'created_at', 'metadata'}
                 for key, value in merged_metadata.items():
                     if key not in fixed_fields:
-                        # special-case data_source
+                        # special-case the data_source field: flatten it recursively when it is a dict
                         if key == 'data_source' and isinstance(value, dict):
-                            #
-
-
-
-
-                            if isinstance(sub_value, (dict, list)):
-                                insert_item[flat_key] = json.dumps(sub_value, ensure_ascii=False)
-                            else:
-                                insert_item[flat_key] = sub_value
+                            # flatten the data_source dict recursively, nested dicts included
+                            flattened = _flatten_dict(value, 'data_source', fixed_fields)
+                            insert_item.update(flattened)
+                        elif key == 'coordinates' and isinstance(value, list):
+                            insert_item[key] = value
                         elif isinstance(value, (dict, list)):
                             continue
                         else:
@@ -149,8 +204,8 @@ class LocalDestination(Destination):
 
     def write(self, data: List[Dict[str, Any]], metadata: Dict[str, Any]) -> bool:
         try:
-
-            base_name = Path(
+            filename = metadata.get('filename', 'output')
+            base_name = Path(filename).stem
             stage = metadata.get('stage')  # used to tell intermediate-result stages apart
 
             # if this is an intermediate result, tag the filename with the stage
@@ -218,8 +273,8 @@ class S3Destination(Destination):
 
     def write(self, data: List[Dict[str, Any]], metadata: Dict[str, Any]) -> bool:
         try:
-
-            base_name = Path(
+            filename = metadata.get('filename', 'output')
+            base_name = Path(filename).stem
             object_key = f"{self.prefix}/{base_name}.json" if self.prefix else f"{base_name}.json"
 
             json_data = json.dumps(data, ensure_ascii=False, indent=2)
{xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client/pipeline/pipeline.py

@@ -79,13 +79,13 @@ class Pipeline:
             print(f"  Pipeline Config: intermediate-result saving enabled")
         print("=" * 60)
 
-    def _call_pipeline_api(self, file_bytes: bytes,
+    def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         url = f"{self.api_base_url}/pipeline"
         max_retries = 3
 
         for try_count in range(max_retries):
             try:
-                files = {'file': (
+                files = {'file': (filename or 'file', file_bytes)}
                 form_data = {}
 
                 # convert stages to the API format
@@ -107,26 +107,55 @@ class Pipeline:
 
                 if response.status_code == 200:
                     result = response.json()
-
+                    x_request_id = result.get('x_request_id', '')
+                    print(f"  ✓ pipeline API returned x_request_id: {x_request_id}")
                     if result.get('code') == 200 and 'data' in result:
                         return result.get('data')
+                    # code is not 200: report the error
+                    error_msg = result.get('message', result.get('msg', 'unknown error'))
+                    print(f"  ✗ pipeline API returned an error: code={result.get('code')}, message={error_msg}, x_request_id={x_request_id}")
+                    logger.error(f"pipeline API returned an error: code={result.get('code')}, message={error_msg}, x_request_id={x_request_id}")
                     return None
                 else:
-
-
+                    # try to parse the response for x_request_id and an error message
+                    x_request_id = ''
+                    error_msg = ''
+                    try:
+                        result = response.json()
+                        x_request_id = result.get('x_request_id', '')
+                        error_msg = result.get('message', result.get('msg', response.text[:200]))
+                    except:
+                        error_msg = response.text[:200] if response.text else f'HTTP {response.status_code}'
+
+                    print(f"  ✗ API error {response.status_code}: {error_msg}, x_request_id={x_request_id}, retry {try_count + 1}/{max_retries}")
+                    logger.warning(f"API error {response.status_code}: {error_msg}, x_request_id={x_request_id}, retry {try_count + 1}/{max_retries}")
 
             except Exception as e:
-
-
+                # for requests exceptions, try to pull x_request_id out of the attached response
+                x_request_id = ''
+                error_msg = str(e)
+                try:
+                    if hasattr(e, 'response') and e.response is not None:
+                        try:
+                            result = e.response.json()
+                            x_request_id = result.get('x_request_id', '')
+                            error_msg = result.get('message', result.get('msg', error_msg))
+                        except:
+                            pass
+                except:
+                    pass
+
+                print(f"  ✗ request failed: {error_msg}, x_request_id={x_request_id}, retry {try_count + 1}/{max_retries}")
+                logger.error(f"pipeline API request failed: {error_msg}, x_request_id={x_request_id}")
 
             if try_count < max_retries - 1:
                 time.sleep(2)
 
         return None
 
-    def process_with_pipeline(self, file_bytes: bytes,
-        print(f"  → calling the pipeline API: {
-        result = self._call_pipeline_api(file_bytes,
+    def process_with_pipeline(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Tuple[List[Dict[str, Any]], PipelineStats]]:
+        print(f"  → calling the pipeline API: {filename}")
+        result = self._call_pipeline_api(file_bytes, filename, data_source)
 
         if result and 'elements' in result and 'stats' in result:
             elements = result['elements']
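The exception branch leans on the fact that `requests` attaches the failed response to its HTTP exceptions, which is what the `hasattr(e, 'response')` probe checks. A minimal illustration (endpoint invented):

```python
import requests

try:
    r = requests.post('https://api.example.com/pipeline', timeout=30)
    r.raise_for_status()
except requests.HTTPError as e:
    # e.response is the full failed response, so x_request_id can still be recovered
    print(e.response.status_code, e.response.json().get('x_request_id', ''))
```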
@@ -136,31 +165,32 @@ class Pipeline:
                 original_elements=stats_data.get('original_elements', 0),
                 chunked_elements=stats_data.get('chunked_elements', 0),
                 embedded_elements=stats_data.get('embedded_elements', 0),
-                stages=self.stages  # use the stages that actually ran
+                stages=self.stages,  # use the stages that actually ran
+                record_id=stats_data.get('record_id')  # taken from the API response
             )
 
             # if intermediate-result saving is enabled, handle the intermediate results
             if self.pipeline_config.include_intermediate_results and 'intermediate_results' in result:
-                self._save_intermediate_results(result['intermediate_results'],
+                self._save_intermediate_results(result['intermediate_results'], filename, data_source)
 
             print(f"  ✓ pipeline finished:")
             print(f"    - original elements: {stats.original_elements}")
             print(f"    - after chunking: {stats.chunked_elements}")
             print(f"    - embedded: {stats.embedded_elements}")
-            logger.info(f"pipeline finished: {
+            logger.info(f"pipeline finished: {filename}, {stats.embedded_elements} vectors")
 
             return elements, stats
         else:
             print(f"  ✗ pipeline failed")
-            logger.error(f"pipeline failed: {
+            logger.error(f"pipeline failed: {filename}")
             return None
 
-    def _save_intermediate_results(self, intermediate_results: List[Dict[str, Any]],
+    def _save_intermediate_results(self, intermediate_results: List[Dict[str, Any]], filename: str, data_source: Dict[str, Any]) -> None:
         """Save intermediate results.
 
         Args:
             intermediate_results: array of intermediate results; each item carries `stage` and `elements` fields
-
+            filename: the file name
             data_source: data-source information
         """
         try:
@@ -174,7 +204,7 @@ class Pipeline:
                 elements = result_item['elements']
 
                 metadata = {
-                    '
+                    'filename': filename,
                     'stage': stage,
                     'total_elements': len(elements),
                     'processed_at': datetime.now().isoformat(),
@@ -183,11 +213,11 @@ class Pipeline:
 
                 self.pipeline_config.intermediate_results_destination.write(elements, metadata)
                 print(f"  ✓ saved {stage.upper()} intermediate results: {len(elements)} elements")
-                logger.info(f"saved {stage.upper()} intermediate results: {
+                logger.info(f"saved {stage.upper()} intermediate results: {filename}")
 
         except Exception as e:
             print(f"  ✗ failed to save intermediate results: {str(e)}")
-            logger.error(f"failed to save intermediate results: {
+            logger.error(f"failed to save intermediate results: {filename}, {str(e)}")
 
     def process_file(self, file_path: str) -> bool:
         print(f"\n{'=' * 60}")
@@ -198,7 +228,10 @@ class Pipeline:
         print(f"  → reading file...")
         file_bytes, data_source = self.source.read_file(file_path)
         data_source = data_source or {}
-        data_source
+        print(f"  data_source: {data_source}")
+        # convert to a millisecond-timestamp string
+        timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
+        data_source['date_processed'] = str(timestamp_ms)
         print(f"  ✓ file read: {len(file_bytes)} bytes")
 
         result = self.process_with_pipeline(file_bytes, file_path, data_source)
@@ -209,16 +242,13 @@ class Pipeline:
 
             print(f"  → writing to destination...")
             metadata = {
-                '
-                '
-                'processed_at': datetime.now().isoformat(),
-                'data_source': data_source,
-                'stats': {
-                    'original_elements': stats.original_elements,
-                    'chunked_elements': stats.chunked_elements,
-                    'embedded_elements': stats.embedded_elements
-                }
+                'filename': file_path,
+                'processed_at': str(timestamp_ms),
             }
+
+            # if stats carries a record_id, add it to the metadata
+            if stats.record_id:
+                metadata['record_id'] = stats.record_id
 
             success = self.destination.write(embedded_data, metadata)
 
@@ -299,12 +329,14 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
             bucket=source_config['bucket'],
             prefix=source_config.get('prefix', ''),
             region=source_config.get('region', 'us-east-1'),
-            pattern=source_config.get('pattern', '*')
+            pattern=source_config.get('pattern', '*'),
+            recursive=source_config.get('recursive', False)
         )
     elif source_config['type'] == 'local':
         source = LocalSource(
             directory=source_config['directory'],
-            pattern=source_config.get('pattern', '*')
+            pattern=source_config.get('pattern', '*'),
+            recursive=source_config.get('recursive', False)
         )
     elif source_config['type'] == 'ftp':
         source = FtpSource(
@@ -312,7 +344,8 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
             port=source_config['port'],
             username=source_config['username'],
             password=source_config['password'],
-            pattern=source_config.get('pattern', '*')
+            pattern=source_config.get('pattern', '*'),
+            recursive=source_config.get('recursive', False)
         )
     elif source_config['type'] == 'smb':
         source = SmbSource(
@@ -323,7 +356,8 @@ def create_pipeline_from_config(config: Dict[str, Any]) -> Pipeline:
             domain=source_config.get('domain', ''),
             port=source_config.get('port', 445),
             path=source_config.get('path', ''),
-            pattern=source_config.get('pattern', '*')
+            pattern=source_config.get('pattern', '*'),
+            recursive=source_config.get('recursive', False)
         )
     else:
         raise ValueError(f"unknown source type: {source_config['type']}")
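With these changes, `recursive` can also be supplied through the config-dict entry point; a hypothetical source fragment (the path is invented, and the destination/stage settings are elided):

```python
config = {
    'source': {
        'type': 'local',
        'directory': './docs',  # invented path
        'pattern': '*.pdf',
        'recursive': True,      # new in 0.2.6; False when omitted
    },
    # ... destination / stages config elided ...
}
pipeline = create_pipeline_from_config(config)
```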
{xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client/pipeline/sources.py

@@ -20,6 +20,30 @@ from botocore.config import Config
 logger = logging.getLogger(__name__)
 
 
+def _to_millis_timestamp_string(timestamp):
+    """Convert a timestamp to a millisecond-timestamp string.
+
+    Args:
+        timestamp: timestamp in seconds or milliseconds; may be int, float or None
+
+    Returns:
+        str: millisecond-timestamp string, or the empty string when the input is None
+    """
+    if timestamp is None:
+        return ""
+
+    # if it is already a millisecond timestamp (greater than 1e12), convert directly
+    if isinstance(timestamp, (int, float)):
+        if timestamp > 1e12:
+            # already in milliseconds
+            return str(int(timestamp))
+        else:
+            # a seconds timestamp; convert to milliseconds
+            return str(int(timestamp * 1000))
+
+    return str(timestamp)
+
+
 class Source(ABC):
     """Abstract base class for data sources"""
 
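The `1e12` cutoff works because current epoch-seconds values sit around `1.7e9` while epoch-milliseconds are already above `1.6e12`, so any number larger than `1e12` can safely be treated as milliseconds:

```python
assert _to_millis_timestamp_string(None) == ""
assert _to_millis_timestamp_string(1700000000) == "1700000000000"     # seconds -> ms
assert _to_millis_timestamp_string(1700000000.5) == "1700000000500"   # float seconds -> ms
assert _to_millis_timestamp_string(1700000000000) == "1700000000000"  # already ms
```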
@@ -38,11 +62,12 @@ class S3Source(Source):
     """S3/MinIO data source"""
 
     def __init__(self, endpoint: str, access_key: str, secret_key: str,
-                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: str = '*'):
+                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: str = '*', recursive: bool = False):
         self.endpoint = endpoint
         self.bucket = bucket
         self.prefix = prefix
         self.pattern = pattern or '*'
+        self.recursive = recursive
 
         if self.endpoint == 'https://textin-minio-api.ai.intsig.net':
             config = Config(signature_version='s3v4')
@@ -73,8 +98,12 @@ class S3Source(Source):
         params = {'Bucket': self.bucket}
         if self.prefix:
             params['Prefix'] = self.prefix
+        if not self.recursive:
+            # non-recursive mode: use Delimiter so only files in the current directory are listed
+            params['Delimiter'] = '/'
 
         for page in paginator.paginate(**params):
+            print(page)
             if 'Contents' in page:
                 for obj in page['Contents']:
                     key = obj['Key']
@@ -82,6 +111,11 @@ class S3Source(Source):
                         continue
                     if fnmatch(key, self.pattern):
                         files.append(key)
+
+            # in non-recursive mode, CommonPrefixes holds the subdirectories; we ignore them
+            if not self.recursive and 'CommonPrefixes' in page:
+                # these are subdirectories; skipped when not recursing
+                pass
 
         print(f"✓ S3: found {len(files)} files")
         return files
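`Delimiter='/'` is what makes the non-recursive branch work: S3 then reports only the objects directly under the prefix in `Contents` and folds everything deeper into `CommonPrefixes`. A standalone sketch (bucket and prefix invented):

```python
import boto3

s3 = boto3.client('s3')
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket='textin', Prefix='reports/', Delimiter='/'):
    for obj in page.get('Contents', []):       # files directly under reports/
        print('file:', obj['Key'])
    for cp in page.get('CommonPrefixes', []):  # sub-"directories", ignored above
        print('dir: ', cp['Prefix'])
```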
@@ -91,7 +125,9 @@ class S3Source(Source):
         file_bytes = response['Body'].read()
 
         headers = response.get('ResponseMetadata', {}).get('HTTPHeaders', {})
-        version =
+        version = headers.get('etag') or ""
+        if version.startswith('"') and version.endswith('"'):
+            version = version[1:-1]
         last_modified = headers.get('last-modified')
         server = headers.get('server') or "unknown"
         date_modified = None
@@ -106,8 +142,8 @@ class S3Source(Source):
         data_source = {
             'url': f"s3://{self.bucket}/{normalized_key}",
             'version': version,
-            'date_created': date_modified,
-            'date_modified': date_modified,
+            'date_created': _to_millis_timestamp_string(date_modified),
+            'date_modified': _to_millis_timestamp_string(date_modified),
             'record_locator': {
                 'server': server,
                 'protocol': 's3',
@@ -121,9 +157,10 @@ class S3Source(Source):
 class LocalSource(Source):
     """Local-filesystem data source"""
 
-    def __init__(self, directory: str, pattern: str = '*'):
+    def __init__(self, directory: str, pattern: str = '*', recursive: bool = False):
         self.directory = Path(directory)
         self.pattern = pattern or '*'
+        self.recursive = recursive
 
         if not self.directory.exists():
             raise ValueError(f"directory does not exist: {directory}")
@@ -132,11 +169,20 @@ class LocalSource(Source):
         logger.info(f"local directory: {self.directory}")
 
     def list_files(self) -> List[str]:
-
-
-
-
-
+        if self.recursive:
+            # recursive mode: use rglob
+            files = [
+                str(f.relative_to(self.directory))
+                for f in self.directory.rglob(self.pattern)
+                if f.is_file()
+            ]
+        else:
+            # non-recursive mode: use glob to list only files directly under the root directory
+            files = [
+                str(f.relative_to(self.directory))
+                for f in self.directory.glob(self.pattern)
+                if f.is_file()
+            ]
         print(f"✓ local: found {len(files)} files")
         return files
 
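The two branches differ only in the globbing call: `Path.glob` applies the pattern at the top level, while `Path.rglob` applies it at every depth. For a layout like `docs/a.pdf` and `docs/sub/b.pdf` (paths invented):

```python
from pathlib import Path

d = Path('docs')
[str(p.relative_to(d)) for p in d.glob('*.pdf')]   # ['a.pdf']
[str(p.relative_to(d)) for p in d.rglob('*.pdf')]  # ['a.pdf', 'sub/b.pdf']
```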
@@ -159,8 +205,8 @@ class LocalSource(Source):
         data_source = {
             'url': full_path.as_uri(),
             'version': version,
-            'date_created': date_created,
-            'date_modified': date_modified,
+            'date_created': _to_millis_timestamp_string(date_created),
+            'date_modified': _to_millis_timestamp_string(date_modified),
             'record_locator': {
                 'protocol': 'file',
                 'remote_file_path': str(full_path)
@@ -172,12 +218,13 @@ class FtpSource(Source):
 class FtpSource(Source):
     """FTP data source"""
 
-    def __init__(self, host: str, port: int, username: str, password: str, pattern: str = '*'):
+    def __init__(self, host: str, port: int, username: str, password: str, pattern: str = '*', recursive: bool = False):
         self.host = host
         self.port = port
         self.username = username
         self.password = password
         self.pattern = pattern or '*'
+        self.recursive = recursive
 
         self.client = ftplib.FTP()
         self.client.connect(self.host, self.port)
@@ -187,8 +234,139 @@ class FtpSource(Source):
         logger.info(f"FTP connected successfully: {self.host}:{self.port}")
 
     def list_files(self) -> List[str]:
-
-
+        if self.recursive:
+            # recursive mode: list every file recursively
+            files = []
+            current_dir = self.client.pwd()
+
+            def _list_recursive(path=''):
+                try:
+                    # remember the current directory
+                    original_dir = self.client.pwd()
+                    if path:
+                        try:
+                            self.client.cwd(path)
+                        except:
+                            return
+
+                    items = []
+                    try:
+                        # try the MLSD command first (more reliable)
+                        items = []
+                        for item in self.client.mlsd():
+                            items.append(item)
+                    except:
+                        # if MLSD is unsupported, use the LIST command
+                        try:
+                            lines = []
+                            self.client.retrlines('LIST', lines.append)
+                            for line in lines:
+                                parts = line.split()
+                                if len(parts) >= 9:
+                                    # parse the LIST output; the first character marks the entry type
+                                    item_name = ' '.join(parts[8:])
+                                    is_dir = parts[0].startswith('d')
+                                    items.append((item_name, {'type': 'dir' if is_dir else 'file'}))
+                        except:
+                            # finally fall back to nlst, which cannot tell files from directories
+                            for item_name in self.client.nlst():
+                                items.append((item_name, {'type': 'unknown'}))
+
+                    for item_name, item_info in items:
+                        if item_name in ['.', '..']:
+                            continue
+
+                        item_type = item_info.get('type', 'unknown')
+                        full_path = f"{path}/{item_name}" if path else item_name
+
+                        if item_type == 'dir' or item_type == 'unknown':
+                            # probe with cwd to find out whether this is a directory
+                            try:
+                                self.client.cwd(item_name)
+                                self.client.cwd('..')
+                                # it is a directory: recurse
+                                _list_recursive(full_path)
+                            except:
+                                # not a directory, so it is a file
+                                relative_path = full_path.lstrip('/')
+                                if fnmatch(relative_path, self.pattern):
+                                    files.append(relative_path)
+                        else:
+                            # it is a file
+                            relative_path = full_path.lstrip('/')
+                            if fnmatch(relative_path, self.pattern):
+                                files.append(relative_path)
+
+                    # restore the original directory
+                    self.client.cwd(original_dir)
+                except Exception as e:
+                    logger.warning(f"FTP failed to list path {path}: {str(e)}")
+                    try:
+                        self.client.cwd(current_dir)
+                    except:
+                        pass
+
+            _list_recursive()
+            # make sure we are back in the original directory
+            try:
+                self.client.cwd(current_dir)
+            except:
+                pass
+        else:
+            # non-recursive mode: list only the files in the current directory (directories excluded)
+            files = []
+            current_dir = self.client.pwd()
+
+            try:
+                # try the MLSD command first (more reliable)
+                items = []
+                for item_name, item_info in self.client.mlsd():
+                    if item_name in ['.', '..']:
+                        continue
+                    item_type = item_info.get('type', 'unknown')
+                    # only add files, never directories
+                    if item_type == 'file' or (item_type == 'unknown' and not item_info.get('type', '').startswith('dir')):
+                        if fnmatch(item_name, self.pattern):
+                            files.append(item_name)
+            except:
+                # if MLSD is unsupported, use the LIST command
+                try:
+                    lines = []
+                    self.client.retrlines('LIST', lines.append)
+                    for line in lines:
+                        parts = line.split()
+                        if len(parts) >= 9:
+                            # parse the LIST output; the first character marks the entry type
+                            item_name = ' '.join(parts[8:])
+                            if item_name in ['.', '..']:
+                                continue
+                            is_dir = parts[0].startswith('d')
+                            # only add files, never directories
+                            if not is_dir and fnmatch(item_name, self.pattern):
+                                files.append(item_name)
+                except:
+                    # finally fall back to nlst, probing with cwd to detect directories
+                    raw_items = self.client.nlst()
+                    for item_name in raw_items:
+                        if item_name in ['.', '..']:
+                            continue
+                        # probe with cwd to find out whether this is a directory
+                        try:
+                            self.client.cwd(item_name)
+                            self.client.cwd('..')
+                            # cwd succeeded, so it is a directory: skip it
+                            continue
+                        except:
+                            # cwd failed, so it is a file
+                            if fnmatch(item_name, self.pattern):
+                                files.append(item_name)
+
+            # make sure we are back in the original directory
+            try:
+                self.client.cwd(current_dir)
+            except:
+                pass
+
         print(f"✓ FTP: found {len(files)} files (matching pattern)")
         return files
 
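The walk prefers `MLSD` (RFC 3659), which yields `(name, facts)` pairs with a dependable `type` fact, and only falls back to parsing `LIST` lines or probing directories with `cwd`. The primary path in isolation (host and credentials invented):

```python
import ftplib

ftp = ftplib.FTP()
ftp.connect('127.0.0.1', 21)
ftp.login('user', 'password')
for name, facts in ftp.mlsd():
    # facts['type'] is 'file', 'dir', or 'cdir'/'pdir' for '.' and '..'
    print(name, facts.get('type'))
```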
@@ -208,11 +386,12 @@ class FtpSource(Source):
             logger.debug(f"FTP failed to fetch file times for {file_path}: {exc}")
 
         normalized_path = file_path.lstrip('/')
+        version = _to_millis_timestamp_string(date_modified)
         data_source = {
             'url': f"ftp://{self.host}:{self.port}/{normalized_path}",
-            'version':
-            'date_created':
-            'date_modified':
+            'version': version,
+            'date_created': version,
+            'date_modified': version,
             'record_locator': {
                 'server': f"{self.host}:{self.port}",
                 'protocol': 'ftp',
@@ -227,7 +406,7 @@ class SmbSource(Source):
     """SMB/CIFS data source"""
 
     def __init__(self, host: str, share_name: str, username: str, password: str,
-                 domain: str = '', port: int = 445, path: str = '', pattern: str = '*'):
+                 domain: str = '', port: int = 445, path: str = '', pattern: str = '*', recursive: bool = False):
         self.host = host
         self.share_name = share_name
         self.username = username
@@ -236,6 +415,7 @@ class SmbSource(Source):
         self.port = port
         self.path = path.strip('/').strip('\\') if path else ''
         self.pattern = pattern or '*'
+        self.recursive = recursive
 
         self.conn = SMBConnection(
             username,
@@ -267,7 +447,10 @@ class SmbSource(Source):
                 item_path = f"{current_path.rstrip('/')}/{item.filename}" if current_path != '/' else f"/{item.filename}"
                 relative_path = item_path[len(base_path):].lstrip('/')
                 if item.isDirectory:
-
+                    if self.recursive:
+                        # recursive mode: descend into the subdirectory
+                        _list_recursive(conn, share, item_path)
+                    # non-recursive mode: ignore subdirectories
                 else:
                     if fnmatch(relative_path, self.pattern):
                         files.append(relative_path)
@@ -310,9 +493,9 @@ class SmbSource(Source):
         smb_url = f"smb://{self.host}/{self.share_name}{full_path}"
         data_source = {
             'url': smb_url,
-            'version':
-            'date_created': date_created,
-            'date_modified': date_modified,
+            'version': _to_millis_timestamp_string(date_modified),
+            'date_created': _to_millis_timestamp_string(date_created),
+            'date_modified': _to_millis_timestamp_string(date_modified),
             'record_locator': {
                 'server': self.host,
                 'share': self.share_name,
{xparse_client-0.2.4 → xparse_client-0.2.6}/xparse_client.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xparse-client
-Version: 0.2.4
+Version: 0.2.6
 Summary: Next-generation document-processing AI Infra for Agent and RAG
 License-Expression: MIT
 Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline

@@ -323,28 +323,34 @@ source = SmbSource(
 )
 ```
 
->
+> Note 1: every Source supports a `pattern` parameter that filters the files to process with shell wildcards (`*.pdf`, `**/*.txt`, etc.); the default is `*`, which processes all files.
+
+> Note 2: every Source supports a `recursive` parameter that controls whether directories are traversed recursively; the default is `False`.
 
 ### Destination configuration
 
 #### Local Milvus vector store
 
+The collection must contain at least the four fields `element_id`, `text`, `embeddings`, and `record_id`.
+
 ```python
 destination = MilvusDestination(
-    db_path
-    collection_name
-    dimension
+    db_path='./milvus_pipeline.db',   # local database file
+    collection_name='my_collection',  # collection name
+    dimension=1024                    # vector dimension; must match what the embed API returns
 )
 ```
 
 #### Zilliz vector store
 
+The collection must contain at least the four fields `element_id`, `text`, `embeddings`, and `record_id`.
+
 ```python
 destination = MilvusDestination(
-    db_path
-    collection_name
-    dimension
-    api_key
+    db_path='https://xxxxxxx.serverless.xxxxxxx.cloud.zilliz.com.cn',  # Zilliz endpoint
+    collection_name='my_collection',  # collection name
+    dimension=1024,                   # vector dimension; must match what the embed API returns
+    api_key='your-api-key'            # Zilliz Cloud API Key
 )
 ```

@@ -354,7 +360,7 @@ destination = MilvusDestination(
 
 ```python
 destination = LocalDestination(
-    output_dir
+    output_dir='./output'
 )
 ```
 
File without changes: LICENSE
File without changes: example/run_pipeline_test.py
File without changes: setup.cfg
File without changes: xparse_client/__init__.py
File without changes: xparse_client/pipeline/__init__.py
File without changes: xparse_client.egg-info/SOURCES.txt
File without changes: xparse_client.egg-info/dependency_links.txt
File without changes: xparse_client.egg-info/requires.txt