maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/utils/tar.py
ADDED
@@ -0,0 +1,260 @@

#!/usr/bin/env python
# coding: utf-8
"""
Optimized parallel tar with stdin piping, compression support, and timing.
"""

import os
import subprocess
import argparse
import shutil
import sys
import time  # used to time the overall run
from concurrent.futures import ThreadPoolExecutor, as_completed


def check_command(cmd):
    """Check whether the given command exists on the system PATH."""
    if shutil.which(cmd) is None:
        print(f"Error: required command '{cmd}' not found.", file=sys.stderr)
        print(f"Please make sure '{cmd}' is installed and on your PATH.", file=sys.stderr)
        sys.exit(1)


def parse_size_to_bytes(size_str):
    """Parse a size string with a unit suffix (e.g. '5G', '100M') into bytes."""
    size_str = size_str.strip().upper()
    units = {"K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4}
    try:
        if size_str and size_str[-1] in units:
            num = float(size_str[:-1])
            unit = size_str[-1]
            return int(num * units[unit])
        else:
            return int(size_str)
    except (ValueError, TypeError):
        print(f"Error: cannot parse size string '{size_str}'.", file=sys.stderr)
        sys.exit(1)


def run_tar_job(job_info):
    """
    Run the tar command for a single chunk of files.

    This version pipes the file list to tar's stdin, avoiding temporary list files.
    """
    file_list, archive_path, source_dir, job_id, compression_args = job_info

    # Base command: -c (create), --no-recursion (do not recurse into listed dirs),
    # -f (archive file), -C (change directory), -T - (read the file list from stdin)
    tar_cmd = (
        ["tar"]
        + compression_args
        + ["-cf", archive_path, "-C", source_dir, "--no-recursion", "-T", "-"]
    )

    # Convert the Python list into the newline-separated format tar -T - expects.
    # It must be encoded to bytes before being written to stdin.
    files_to_pipe = "\n".join(file_list).encode("utf-8")

    try:
        # Use Popen so we can write to stdin.
        process = subprocess.Popen(
            tar_cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Write the file list to stdin, close it, then wait for the process.
        stdout, stderr = process.communicate(input=files_to_pipe)

        if process.returncode != 0:
            # Turn a failed tar run into a formatted error.
            raise subprocess.CalledProcessError(
                returncode=process.returncode, cmd=tar_cmd, stderr=stderr
            )

        return f"Job {job_id} ({os.path.basename(archive_path)}) succeeded."
    except subprocess.CalledProcessError as e:
        error_message = (
            f"Error: job {job_id} ({os.path.basename(archive_path)}) failed.\n"
        )
        # stderr is bytes and needs to be decoded.
        error_message += (
            f"Details: {e.stderr.decode('utf-8', errors='ignore').strip()}"
        )
        return error_message
    except Exception as e:
        return f"Error: job {job_id} raised an unexpected exception: {e}"


def archive_by_size(
    source_dir, dest_dir, max_size_str, parallel_jobs, compression, compression_level
):
    """
    Archive the source directory into size-bounded chunks, using a thread pool
    to control how many tar processes run in parallel.
    """
    # --- Added: record the start time ---
    start_time = time.time()

    # --- 1. Checks and preparation ---
    print("--- Step 1: checks and preparation ---")
    check_command("tar")

    source_dir = os.path.abspath(source_dir)
    dest_dir = os.path.abspath(dest_dir)
    max_size_bytes = parse_size_to_bytes(max_size_str)

    if not os.path.isdir(source_dir):
        print(f"Error: source directory '{source_dir}' does not exist.", file=sys.stderr)
        return

    os.makedirs(dest_dir, exist_ok=True)

    # Map the compression choice to a file extension and tar arguments.
    # Note: the original passed "--zstd" together with "--compress-program=...";
    # GNU tar's spelling is --use-compress-program (-I), and combining it with
    # --zstd conflicts, so -I alone is used here.
    compression_map = {
        "none": (".tar", []),
        "gzip": (".tar.gz", ["-z"]),
        "zstd": (".tar.zst", ["-I", f"zstd -{compression_level}"]),
    }
    if compression not in compression_map:
        print(f"Error: unsupported compression format '{compression}'.", file=sys.stderr)
        return

    archive_ext, compression_args = compression_map[compression]

    # --- 2. Scan files and group them by size ---
    print(f"--- Step 2: scanning files and grouping into chunks of at most {max_size_str} each ---")
    job_definitions = []
    current_chunk_files = []
    current_chunk_size = 0
    part_num = 1

    # Walk the directory tree.
    for root, _, files in os.walk(source_dir):
        for filename in files:
            full_path = os.path.join(root, filename)
            relative_path = os.path.relpath(full_path, source_dir)

            try:
                if not os.path.islink(full_path):
                    file_size = os.path.getsize(full_path)
                else:
                    continue
            except OSError as e:
                print(f"Warning: cannot access file {full_path}, skipping. Error: {e}")
                continue

            if current_chunk_files and (
                current_chunk_size + file_size > max_size_bytes
            ):
                archive_name = f"archive_part_{part_num:04d}{archive_ext}"
                archive_path = os.path.join(dest_dir, archive_name)
                # Attach the file list to the job definition directly,
                # instead of writing it to a file.
                job_definitions.append(
                    (
                        list(current_chunk_files),
                        archive_path,
                        source_dir,
                        part_num,
                        compression_args,
                    )
                )

                part_num += 1
                current_chunk_files = []
                current_chunk_size = 0

            current_chunk_files.append(relative_path)
            current_chunk_size += file_size

    if current_chunk_files:
        archive_name = f"archive_part_{part_num:04d}{archive_ext}"
        archive_path = os.path.join(dest_dir, archive_name)
        job_definitions.append(
            (
                list(current_chunk_files),
                archive_path,
                source_dir,
                part_num,
                compression_args,
            )
        )

    if not job_definitions:
        print("Warning: no files to archive in the source directory.")
        return

    print(f"Grouping done; {len(job_definitions)} archive(s) will be created in total.")

    # --- 3. Run the tar jobs in parallel with a thread pool ---
    print(f"\n--- Step 3: launching archive jobs in parallel (at most {parallel_jobs} at a time) ---")
    with ThreadPoolExecutor(max_workers=parallel_jobs) as executor:
        futures = [executor.submit(run_tar_job, job) for job in job_definitions]
        for future in as_completed(futures):
            result = future.result()
            print(f" -> {result}")

    print("\n----------------------------------------")
    print("All archive jobs have finished!")
    print(f"Archives are in: {dest_dir}")

    # --- Added: compute and print the total elapsed time ---
    end_time = time.time()
    duration = end_time - start_time
    print(f"Total time: {duration:.2f} seconds.")
    print("----------------------------------------")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=(
            "Archive a directory quickly and in parallel, split into chunks of a given\n"
            "maximum size. This version pipes the file list to tar for performance,\n"
            "and supports compression."
        ),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("source_dir", type=str, help="Source directory to archive.")
    parser.add_argument("dest_dir", type=str, help="Destination directory for the archives.")
    parser.add_argument(
        "-s",
        "--max-size",
        type=str,
        default="5G",
        help="Maximum estimated content size per archive.\n"
        "Supported units: K, M, G, T (e.g. '2G', '500M'). Defaults to '5G'.",
    )
    parser.add_argument(
        "-j",
        "--jobs",
        type=int,
        default=os.cpu_count() or 8,
        help="Number of parallel jobs (i.e. how many tar processes run at once).\n"
        "Defaults to the number of CPU cores.",
    )
    parser.add_argument(
        "-c",
        "--compression",
        type=str,
        default="none",
        choices=["none", "gzip", "zstd"],
        help="Compression algorithm.\n"
        "  - none: no compression; fastest, largest files (.tar)\n"
        "  - gzip: general-purpose compression with wide compatibility (.tar.gz)\n"
        "  - zstd: modern, efficient compression; good speed and ratio (.tar.zst)\n"
        "Defaults to 'none'.",
    )
    parser.add_argument(
        "--level",
        dest="compression_level",
        type=int,
        default=3,
        help="Compression level (zstd only). Range 1-19.\n"
        "Lower levels are faster with less compression. Defaults to 3.",
    )

    args = parser.parse_args()
    archive_by_size(
        args.source_dir,
        args.dest_dir,
        args.max_size,
        args.jobs,
        args.compression,
        args.compression_level,
    )
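
Since the script exposes `archive_by_size` as a plain function, the chunk-and-pipe flow can be driven from Python as well as from the command line. Below is a minimal usage sketch, assuming the module is importable as `maque.utils.tar` and using hypothetical paths:

from maque.utils.tar import archive_by_size

# Hypothetical paths; replace with real directories.
archive_by_size(
    source_dir="/data/dataset",    # tree to split into archives
    dest_dir="/data/archives",     # receives archive_part_0001.tar.zst, ...
    max_size_str="2G",             # cap on the uncompressed content per chunk
    parallel_jobs=4,               # at most four concurrent tar processes
    compression="zstd",
    compression_level=3,
)

Threads (rather than processes) are sufficient here because each worker spends its time blocked in `communicate()` while an external tar process does the actual I/O and compression.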
maque/utils/untar.py
ADDED
@@ -0,0 +1,129 @@

#!/usr/bin/env python
# coding: utf-8
"""
parallel_untar.py

Unpacks various tar archives (.tar, .tar.gz, .tar.zst) in parallel.
"""

import os
import subprocess
import argparse
import shutil
import sys
import glob
import time
from concurrent.futures import ThreadPoolExecutor, as_completed


def check_command(cmd):
    """Check whether the given command exists on the system PATH."""
    if shutil.which(cmd) is None:
        print(f"Error: required command '{cmd}' not found.", file=sys.stderr)
        print(f"Please make sure '{cmd}' is installed and on your PATH.", file=sys.stderr)
        sys.exit(1)


def run_untar_job(archive_path, dest_dir):
    """
    Run 'tar -xf' for a single archive.

    Modern tar auto-detects the compression format (gzip, zstd, etc.).
    """
    job_name = os.path.basename(archive_path)
    # -x: extract
    # -f: archive file
    # -C: change to the destination directory
    tar_cmd = ["tar", "-xf", archive_path, "-C", dest_dir]

    try:
        subprocess.run(
            tar_cmd, check=True, capture_output=True, text=True, encoding="utf-8"
        )
        return f"Archive '{job_name}' extracted successfully."
    except subprocess.CalledProcessError as e:
        error_message = f"Error: extracting '{job_name}' failed.\n"
        error_message += f"Details: {e.stderr.strip()}"
        return error_message
    except Exception as e:
        return f"Error: unexpected exception while extracting '{job_name}': {e}"


def parallel_unpack(source_dir, dest_dir, parallel_jobs):
    """
    Extract all .tar, .tar.gz, and .tar.zst files in the source directory
    into the destination directory, in parallel.
    """
    # --- Added: record the start time ---
    start_time = time.time()

    # --- 1. Checks and preparation ---
    print("--- Step 1: checks and preparation ---")
    check_command("tar")

    source_dir = os.path.abspath(source_dir)
    dest_dir = os.path.abspath(dest_dir)

    if not os.path.isdir(source_dir):
        print(f"Error: source directory '{source_dir}' does not exist or is not a directory.", file=sys.stderr)
        return

    # Create the destination directory if it does not exist.
    os.makedirs(dest_dir, exist_ok=True)

    # --- 2. Find the archives to extract ---
    print(f"--- Step 2: looking for archives in '{source_dir}' ---")

    # --- Changed: match several compression formats ---
    patterns = ["*.tar", "*.tar.gz", "*.tar.zst"]
    archive_files = []
    for pattern in patterns:
        archive_files.extend(glob.glob(os.path.join(source_dir, pattern)))

    if not archive_files:
        print(f"No supported archives ({', '.join(patterns)}) found in '{source_dir}'.")
        return

    print(f"Found {len(archive_files)} archive(s) to extract.")

    # --- 3. Run the extraction jobs in parallel with a thread pool ---
    print(f"\n--- Step 3: launching extraction jobs in parallel (at most {parallel_jobs} at a time) ---")

    with ThreadPoolExecutor(max_workers=parallel_jobs) as executor:
        # Submit all extraction jobs.
        futures = {
            executor.submit(run_untar_job, archive, dest_dir): archive
            for archive in archive_files
        }

        # Wait for the jobs to finish and print their results.
        for future in as_completed(futures):
            result = future.result()
            print(f" -> {result}")

    print("\n----------------------------------------")
    print("All extraction jobs have finished!")
    print(f"Files extracted to: {dest_dir}")

    # --- Added: compute and print the total elapsed time ---
    end_time = time.time()
    duration = end_time - start_time
    print(f"Total time: {duration:.2f} seconds.")
    print("----------------------------------------")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extract all .tar, .tar.gz, and .tar.zst archives in a directory to a destination, in parallel.",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("source_dir", type=str, help="Source directory containing the archives.")
    parser.add_argument("dest_dir", type=str, help="Destination directory for the extracted files.")
    parser.add_argument(
        "-j",
        "--jobs",
        type=int,
        default=os.cpu_count() or 4,
        help="Number of parallel extraction jobs (i.e. how many tar processes run at once).\n"
        "Defaults to the number of CPU cores.",
    )

    args = parser.parse_args()
    parallel_unpack(args.source_dir, args.dest_dir, args.jobs)
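
A matching extraction call, again as a minimal sketch with hypothetical paths and assuming the module is importable as `maque.utils.untar`:

from maque.utils.untar import parallel_unpack

# Extract every archive produced by the sketch above back into one tree.
parallel_unpack(
    source_dir="/data/archives",   # directory holding archive_part_*.tar.zst
    dest_dir="/data/restored",     # created if missing
    parallel_jobs=4,
)

Because each chunk was created with paths relative to the source directory (via `-C source_dir`), extracting all parts into a single destination reconstructs the original directory layout.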
maque/web/__init__.py
ADDED
File without changes