maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/cli/script.py
ADDED
@@ -0,0 +1,52 @@
import os
from rich import print
import subprocess
from pathlib import Path


def source_and_run(filename):
    command = f'source {filename}; env'
    pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True, executable="/bin/bash")
    output = pipe.communicate()[0]
    env = dict(line.decode().split('=', 1) for line in output.splitlines() if b'=' in line)  # skip lines without '='
    os.environ.update(env)

def install_node_with_nvm(version='16'):
    args = (f"curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.5/install.sh | bash &&"
            f"source ~/.nvm/nvm.sh &&"
            f"nvm install {version}")
    os.system(args)
    print(f"Node.js installed successfully; run `nvm use {version}` to use Node.js v{version}")


def install_nvim(version="0.9.2"):
    nvim_dir = Path("~/software/nvim/").expanduser()
    if not nvim_dir.exists():
        nvim_dir.mkdir(parents=True)

    args = (
        f"wget https://github.com/neovim/neovim/releases/download/v{version}/nvim.appimage &&"
        f"sudo mv nvim.appimage ~/software/nvim/ &&"
        f"chmod u+x ~/software/nvim/nvim.appimage &&"
        f"sudo ln -s ~/software/nvim/nvim.appimage /usr/local/bin/nvim &&"
        f"sudo ln -s ~/software/nvim/nvim.appimage /usr/bin/nvim"  # If your system does not have FUSE you can extract the appimage: https://github.com/AppImage/AppImageKit/wiki/FUSE#install-fuse
    )
    os.system(args)
    print(f"neovim {version} installed successfully.")

def uninstall_nvim():
    args = (
        "sudo rm -f /usr/local/bin/nvim &&"
        "sudo rm -f /usr/bin/nvim &&"
        "rm -f ~/software/nvim/nvim.appimage"
    )
    os.system(args)
    print("neovim uninstalled successfully.")


def install_make():
    os.system("sudo apt install build-essential")


def install_cargo():
    os.system("curl https://sh.rustup.rs -sSf | sh")
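
A minimal usage sketch of these helpers (hedged: it assumes maque is installed so the module is importable as maque.cli.script; the shell profile path and Node.js version below are illustrative, not part of the package):

from maque.cli.script import source_and_run, install_node_with_nvm

# Pull the environment variables exported by a shell profile into this process,
# then run the nvm-based Node.js installer for a chosen version.
source_and_run("~/.bashrc")
install_node_with_nvm(version="18")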
maque/cli/tree.py
ADDED
@@ -0,0 +1,49 @@
import pathlib
from rich import print
from rich.filesize import decimal
from rich.markup import escape
from rich.text import Text
from rich.tree import Tree


def walk_directory(directory: pathlib.Path, tree: Tree) -> None:
    """Recursively build a Tree with directory contents."""
    # Sort dirs first then by filename
    paths = sorted(
        pathlib.Path(directory).iterdir(),
        key=lambda path: (path.is_file(), path.name.lower()),
    )
    for path in paths:
        # Remove hidden files
        if path.name.startswith("."):
            continue
        if path.is_dir():
            style = "dim" if path.name.startswith("__") else ""
            branch = tree.add(
                f"[bold magenta]:open_file_folder: [link file://{path}]{escape(path.name)}",
                style=style,
                guide_style=style,
            )
            walk_directory(path, branch)
        else:
            text_filename = Text(path.name, "green")
            text_filename.highlight_regex(r"\..*$", "bold red")
            text_filename.stylize(f"link file://{path}")
            file_size = path.stat().st_size
            text_filename.append(f" ({decimal(file_size)})", "blue")
            icon = "🐍 " if path.suffix == ".py" else "📄 "
            tree.add(Text(icon) + text_filename)


def print_directory_tree(directory="."):
    try:
        directory = pathlib.Path(directory).absolute()
    except IndexError:
        print("[b]Usage:[/] python tree.py <DIRECTORY>")
    else:
        tree = Tree(
            f":open_file_folder: [link file://{directory}]{directory}",
            guide_style="bold bright_blue",
        )
        walk_directory(pathlib.Path(directory), tree)
        print(tree)
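
A quick usage sketch (assuming maque is installed; the directory argument is illustrative):

from maque.cli.tree import print_directory_tree

# Render the current working directory as a rich Tree with file sizes and icons.
print_directory_tree(".")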
maque/clustering/__init__.py
ADDED
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
"""
Clustering analysis module

General-purpose clustering analysis tools supporting multiple algorithms and visualization methods.

Clustering algorithms:
- K-Means: classic algorithm, requires the number of clusters
- HDBSCAN: determines the number of clusters automatically, robust to noise
- DBSCAN: density-based, determines the number of clusters automatically
- Agglomerative: hierarchical clustering

Visualization methods:
- UMAP: fast, preserves global structure (recommended)
- t-SNE: preserves local structure
- PCA: fastest, linear dimensionality reduction
"""

from .clusterers import (
    BaseClusterer,
    ClusteringResult,
    ClusterMetrics,
    KMeansClusterer,
    HDBSCANClusterer,
    DBSCANClusterer,
    AgglomerativeClusterer,
    create_clusterer,
)
from .sampler import CentroidSampler, ClusterSample
from .visualizer import ClusterVisualizer, DimReductionMethod
from .analyzer import ClusterAnalyzer, ClusterResult

__all__ = [
    # Clusterers
    "BaseClusterer",
    "ClusteringResult",
    "ClusterMetrics",
    "KMeansClusterer",
    "HDBSCANClusterer",
    "DBSCANClusterer",
    "AgglomerativeClusterer",
    "create_clusterer",
    # Sampler
    "CentroidSampler",
    "ClusterSample",
    # Visualization
    "ClusterVisualizer",
    "DimReductionMethod",
    # Analyzer
    "ClusterAnalyzer",
    "ClusterResult",
]
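
A hedged sketch of the lower-level clusterer API, inferred from how ClusterAnalyzer (analyzer.py, below) constructs and calls it; the constructor arguments mirror those that ClusterAnalyzer forwards, and the embeddings are synthetic stand-ins:

import numpy as np
from maque.clustering import KMeansClusterer

embeddings = np.random.rand(1000, 384)  # stand-in for real text embeddings
clusterer = KMeansClusterer(n_clusters=8, k_range=(2, 30), random_state=42)
result = clusterer.fit(embeddings)      # ClusteringResult with labels and quality metrics
print(result.n_clusters, result.silhouette)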
maque/clustering/analyzer.py
ADDED
@@ -0,0 +1,347 @@
# -*- coding: utf-8 -*-
"""
Clustering analyzer

A high-level analyzer that combines clustering, sampling, and visualization.
"""

import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union

import numpy as np
from loguru import logger

from .clusterers import (
    BaseClusterer,
    ClusteringResult,
    KMeansClusterer,
    HDBSCANClusterer,
    DBSCANClusterer,
    AgglomerativeClusterer,
    create_clusterer,
)
from .sampler import CentroidSampler, ClusterSample
from .visualizer import ClusterVisualizer, DimReductionMethod


AlgorithmType = Literal["kmeans", "hdbscan", "dbscan", "agglomerative"]


@dataclass
class ClusterResult:
    """Clustering analysis result."""
    n_samples: int
    embedding_dim: int
    n_clusters: int
    algorithm: str
    silhouette: Optional[float]
    calinski_harabasz: Optional[float]
    davies_bouldin: Optional[float]
    n_noise: int
    clusters: List[ClusterSample]
    algorithm_info: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Convert to a dictionary."""
        return {
            "n_samples": self.n_samples,
            "embedding_dim": self.embedding_dim,
            "n_clusters": self.n_clusters,
            "algorithm": self.algorithm,
            "metrics": {
                "silhouette": self.silhouette,
                "calinski_harabasz": self.calinski_harabasz,
                "davies_bouldin": self.davies_bouldin,
            },
            "n_noise": self.n_noise,
            "algorithm_info": self.algorithm_info,
            "clusters": {
                str(c.label): {
                    "count": c.count,
                    "sample_ids": c.sample_ids,
                    "sample_docs": c.sample_docs,
                }
                for c in self.clusters
            },
        }

    def save(self, path: Union[str, Path]) -> None:
        """Save the result to a JSON file."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, ensure_ascii=False, indent=2)
        logger.info(f"Result saved to: {path}")


class ClusterAnalyzer:
    """
    Clustering analyzer

    Supports multiple clustering algorithms and visualization methods.

    Example:
        >>> # K-Means clustering + UMAP visualization
        >>> analyzer = ClusterAnalyzer(algorithm="kmeans", n_clusters=20, viz_method="umap")
        >>> result = analyzer.analyze(embeddings, ids=ids, documents=docs)

        >>> # HDBSCAN automatic clustering
        >>> analyzer = ClusterAnalyzer(algorithm="hdbscan", min_cluster_size=50)
        >>> result = analyzer.analyze(embeddings)

        >>> # Analyze data from ChromaDB
        >>> result = analyzer.analyze_chroma(
        ...     persist_dir="./chroma_db",
        ...     collection_name="my_collection",
        ... )
    """

    def __init__(
        self,
        algorithm: AlgorithmType = "kmeans",
        viz_method: DimReductionMethod = "umap",
        n_samples: int = 5,
        max_doc_length: int = 500,
        random_state: int = 42,
        # K-Means / Agglomerative parameters
        n_clusters: Union[int, Literal["auto"]] = "auto",
        k_range: Tuple[int, int] = (2, 30),
        # HDBSCAN parameters
        min_cluster_size: int = 15,
        reduce_dim: Union[int, Literal["auto"], None] = "auto",
        # DBSCAN parameters
        eps: Union[float, Literal["auto"]] = "auto",
        min_samples: int = 5,
        # Agglomerative parameters
        linkage: Literal["ward", "complete", "average", "single"] = "ward",
    ):
        """
        Args:
            algorithm: clustering algorithm ("kmeans", "hdbscan", "dbscan", "agglomerative")
            viz_method: dimensionality-reduction method for visualization ("umap", "tsne", "pca")
            n_samples: number of samples drawn from each cluster
            max_doc_length: document truncation length
            random_state: random seed
            n_clusters: number of clusters (kmeans/agglomerative)
            k_range: range for automatic k selection (kmeans/agglomerative)
            min_cluster_size: minimum cluster size (hdbscan)
            reduce_dim: pre-reduction dimensionality for HDBSCAN ("auto"/int/None)
            eps: neighborhood radius (dbscan)
            min_samples: minimum number of samples (dbscan/hdbscan)
            linkage: linkage method (agglomerative)
        """
        self.algorithm = algorithm
        self.random_state = random_state

        # Create the clusterer
        if algorithm == "kmeans":
            self.clusterer = KMeansClusterer(
                n_clusters=n_clusters,
                k_range=k_range,
                random_state=random_state,
            )
        elif algorithm == "hdbscan":
            self.clusterer = HDBSCANClusterer(
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                reduce_dim=reduce_dim,
            )
        elif algorithm == "dbscan":
            self.clusterer = DBSCANClusterer(
                eps=eps,
                min_samples=min_samples,
            )
        elif algorithm == "agglomerative":
            self.clusterer = AgglomerativeClusterer(
                n_clusters=n_clusters,
                linkage=linkage,
                k_range=k_range,
            )
        else:
            raise ValueError(f"Unknown algorithm: {algorithm}")

        self.sampler = CentroidSampler(
            n_samples=n_samples,
            max_doc_length=max_doc_length,
            metric="cosine",  # cosine distance is recommended for text embeddings
        )
        self.visualizer = ClusterVisualizer(
            method=viz_method,
            random_state=random_state,
        )

    def analyze(
        self,
        embeddings: np.ndarray,
        ids: Optional[List[str]] = None,
        documents: Optional[List[str]] = None,
        output_dir: Union[str, Path, None] = None,
        name: str = "cluster",
    ) -> ClusterResult:
        """
        Run the clustering analysis.

        Args:
            embeddings: embedding matrix (n_samples, n_features)
            ids: list of sample IDs
            documents: list of document contents
            output_dir: output directory (optional)
            name: prefix for result files

        Returns:
            ClusterResult: clustering analysis result
        """
        n_samples, embedding_dim = embeddings.shape
        logger.info(f"Starting clustering analysis: {n_samples} samples, {embedding_dim}-dim embeddings, algorithm={self.algorithm}")

        # 1. Cluster
        cluster_result = self.clusterer.fit(embeddings)

        # 2. Sample
        logger.info("Sampling representative samples from each cluster...")
        samples = self.sampler.sample(
            embeddings, cluster_result.labels,
            ids=ids, documents=documents,
        )

        # 3. Log cluster-size statistics
        self._log_cluster_stats(samples)

        # 4. Build the result
        result = ClusterResult(
            n_samples=n_samples,
            embedding_dim=embedding_dim,
            n_clusters=cluster_result.n_clusters,
            algorithm=cluster_result.algorithm,
            silhouette=cluster_result.silhouette,
            calinski_harabasz=cluster_result.calinski_harabasz,
            davies_bouldin=cluster_result.davies_bouldin,
            n_noise=cluster_result.n_noise,
            clusters=samples,
            algorithm_info=cluster_result.extra_info,
        )

        # 5. Save the results
        if output_dir:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

            # Save JSON
            result.save(output_dir / f"{name}_{self.algorithm}.json")

            # Save the visualization
            self.visualizer.plot(
                embeddings, cluster_result.labels,
                output_dir / f"{name}_{self.algorithm}.png",
                title=f"{name} ({self.algorithm}, n={cluster_result.n_clusters})",
            )

        logger.info("Clustering analysis finished")
        return result

    def analyze_chroma(
        self,
        persist_dir: Union[str, Path],
        collection_name: str,
        output_dir: Union[str, Path, None] = None,
        name: Optional[str] = None,
        sample_size: Optional[int] = None,
        where: Optional[dict] = None,
        where_document: Optional[dict] = None,
    ) -> ClusterResult:
        """
        Load data from ChromaDB and run the clustering analysis.

        Args:
            persist_dir: ChromaDB persistence directory
            collection_name: collection name
            output_dir: output directory (optional)
            name: prefix for result files (defaults to collection_name)
            sample_size: limit on the number of samples to load (optional, for large datasets)
            where: metadata filter (optional), e.g. {"category": "tech"}
            where_document: document-content filter (optional), e.g. {"$contains": "python"}

        Returns:
            ClusterResult: clustering analysis result

        Example:
            >>> # Analyze the full dataset
            >>> analyzer = ClusterAnalyzer(algorithm="hdbscan")
            >>> result = analyzer.analyze_chroma(
            ...     persist_dir="./chroma_db",
            ...     collection_name="my_data",
            ... )

            >>> # Sampled analysis (for large datasets)
            >>> result = analyzer.analyze_chroma(
            ...     persist_dir="./chroma_db",
            ...     collection_name="my_data",
            ...     sample_size=10000,
            ... )

            >>> # Filter data of a specific category
            >>> result = analyzer.analyze_chroma(
            ...     persist_dir="./chroma_db",
            ...     collection_name="my_data",
            ...     where={"category": "tech"},
            ... )
        """
        try:
            import chromadb
        except ImportError:
            raise ImportError("Please install chromadb: pip install chromadb")

        logger.info(f"Loading data from ChromaDB: {collection_name}")

        client = chromadb.PersistentClient(path=str(persist_dir))
        collection = client.get_collection(collection_name)

        # Build the query parameters
        query_kwargs = {
            "include": ["embeddings", "documents", "metadatas"]
        }

        if sample_size is not None:
            query_kwargs["limit"] = sample_size
            logger.info(f"Limiting load to the first {sample_size} records")

        if where is not None:
            query_kwargs["where"] = where
            logger.info(f"Applying metadata filter: {where}")

        if where_document is not None:
            query_kwargs["where_document"] = where_document
            logger.info(f"Applying document filter: {where_document}")

        # Load the data
        results = collection.get(**query_kwargs)

        ids = results["ids"]
        embeddings = np.array(results["embeddings"])
        documents = results["documents"]

        logger.info(f"Loaded {len(embeddings)} embeddings, dim={embeddings.shape[1]}")

        return self.analyze(
            embeddings,
            ids=ids,
            documents=documents,
            output_dir=output_dir,
            name=name or collection_name,
        )

    def _log_cluster_stats(self, samples: List[ClusterSample]) -> None:
        """Log cluster-size statistics."""
        total = sum(s.count for s in samples)
        sorted_samples = sorted(samples, key=lambda s: -s.count)

        logger.info("Cluster size distribution:")
        for s in sorted_samples[:10]:
            pct = s.count / total * 100
            label = "Noise" if s.label == "noise" else f"Cluster {s.label}"
            logger.info(f"  {label}: {s.count} ({pct:.1f}%)")

        if len(sorted_samples) > 10:
            logger.info(f"  ... plus {len(sorted_samples) - 10} more clusters")