maque-0.2.1-py3-none-any.whl
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/clustering/clusterers.py (new file, +464 lines)
@@ -0,0 +1,464 @@

# -*- coding: utf-8 -*-
"""
Collection of clustering algorithms.

Supported algorithms:
- K-Means: classic algorithm; the number of clusters must be specified
- HDBSCAN: determines the number of clusters automatically; robust to noise
- DBSCAN: density-based; determines the number of clusters automatically
- Agglomerative: hierarchical clustering
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Dict, Literal, Optional, Tuple, Union

import numpy as np
from loguru import logger
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

@dataclass
class ClusteringResult:
    """Clustering result."""
    labels: np.ndarray
    n_clusters: int
    algorithm: str
    silhouette: Optional[float] = None
    calinski_harabasz: Optional[float] = None
    davies_bouldin: Optional[float] = None
    n_noise: int = 0
    extra_info: dict = field(default_factory=dict)


@dataclass
class ClusterMetrics:
    """Clustering quality metrics."""
    silhouette: Optional[float] = None
    calinski_harabasz: Optional[float] = None
    davies_bouldin: Optional[float] = None


class BaseClusterer(ABC):
    """Base class for clusterers."""

    @abstractmethod
    def fit(self, embeddings: np.ndarray) -> ClusteringResult:
        """Run clustering."""
        pass

    def _compute_metrics(self, embeddings: np.ndarray, labels: np.ndarray) -> ClusterMetrics:
        """
        Compute clustering quality metrics (noise points excluded).

        Metric notes:
        - silhouette: silhouette coefficient in [-1, 1]; higher is better
        - calinski_harabasz: CH index; higher is better
        - davies_bouldin: DB index; lower is better
        """
        valid_mask = labels != -1
        if valid_mask.sum() < 2:
            return ClusterMetrics()

        unique_labels = set(labels[valid_mask])
        if len(unique_labels) < 2:
            return ClusterMetrics()

        valid_embeddings = embeddings[valid_mask]
        valid_labels = labels[valid_mask]

        try:
            silhouette = silhouette_score(valid_embeddings, valid_labels)
            calinski = calinski_harabasz_score(valid_embeddings, valid_labels)
            davies = davies_bouldin_score(valid_embeddings, valid_labels)
            return ClusterMetrics(
                silhouette=round(silhouette, 4),
                calinski_harabasz=round(calinski, 2),
                davies_bouldin=round(davies, 4),
            )
        except Exception as e:
            logger.warning(f"Failed to compute clustering metrics: {e}")
            return ClusterMetrics()


class KMeansClusterer(BaseClusterer):
    """
    K-Means clusterer.

    Supports automatic selection of the optimal number of clusters
    (silhouette method).

    Example:
        >>> clusterer = KMeansClusterer(n_clusters=10)
        >>> result = clusterer.fit(embeddings)
    """

    def __init__(
        self,
        n_clusters: Union[int, Literal["auto"]] = "auto",
        k_range: Tuple[int, int] = (2, 30),
        random_state: int = 42,
        n_init: int = 10,
    ):
        self.n_clusters = n_clusters
        self.k_range = k_range
        self.random_state = random_state
        self.n_init = n_init

    def fit(self, embeddings: np.ndarray) -> ClusteringResult:
        extra_info = {}

        if self.n_clusters == "auto":
            n_clusters, k_scores = self._auto_select_k(embeddings)
            extra_info["k_selection_method"] = "silhouette"
            extra_info["k_scores"] = {str(k): round(v, 4) for k, v in k_scores.items()}
            logger.info(f"Automatically selected k={n_clusters}")
        else:
            n_clusters = self.n_clusters

        logger.info(f"Running K-Means clustering (k={n_clusters})...")
        self._model = KMeans(
            n_clusters=n_clusters,
            random_state=self.random_state,
            n_init=self.n_init,
        )
        labels = self._model.fit_predict(embeddings)
        extra_info["inertia"] = round(self._model.inertia_, 2)

        metrics = self._compute_metrics(embeddings, labels)
        if metrics.silhouette:
            logger.info(f"Silhouette: {metrics.silhouette:.4f}, CH: {metrics.calinski_harabasz:.2f}, DB: {metrics.davies_bouldin:.4f}")

        return ClusteringResult(
            labels=labels,
            n_clusters=n_clusters,
            algorithm="kmeans",
            silhouette=metrics.silhouette,
            calinski_harabasz=metrics.calinski_harabasz,
            davies_bouldin=metrics.davies_bouldin,
            extra_info=extra_info,
        )

    def predict(self, embeddings: np.ndarray) -> np.ndarray:
        """Predict cluster labels for new samples."""
        if not hasattr(self, '_model'):
            raise RuntimeError("Call fit() first")
        return self._model.predict(embeddings)

    def _auto_select_k(self, embeddings: np.ndarray) -> Tuple[int, Dict[int, float]]:
        n_samples = len(embeddings)
        k_min, k_max = self.k_range
        k_max = min(k_max, n_samples - 1)

        if k_min >= k_max:
            return k_min, {}

        logger.info(f"Searching for the optimal k (range: {k_min}-{k_max})...")
        scores = {}

        for k in range(k_min, k_max + 1):
            kmeans = KMeans(n_clusters=k, random_state=self.random_state, n_init=self.n_init)
            labels = kmeans.fit_predict(embeddings)
            scores[k] = silhouette_score(embeddings, labels)

        best_k = max(scores, key=scores.get)
        return best_k, scores


class HDBSCANClusterer(BaseClusterer):
    """
    HDBSCAN clusterer.

    Determines the number of clusters automatically and is robust to noise.
    Well suited to discovering clusters of arbitrary shape.

    Note: HDBSCAN performs poorly in high-dimensional spaces (e.g. embeddings
    with >50 dimensions); enabling the reduce_dim parameter to first reduce
    dimensionality with UMAP before clustering is recommended.

    Example:
        >>> # Pre-reduction is recommended for high-dimensional embeddings
        >>> clusterer = HDBSCANClusterer(min_cluster_size=15, reduce_dim=20)
        >>> result = clusterer.fit(embeddings)
    """

    def __init__(
        self,
        min_cluster_size: int = 15,
        min_samples: Optional[int] = None,
        cluster_selection_epsilon: float = 0.0,
        metric: str = "euclidean",
        reduce_dim: Union[int, Literal["auto"], None] = None,
        reduce_dim_threshold: int = 50,
    ):
        """
        Args:
            min_cluster_size: minimum cluster size
            min_samples: minimum number of neighbors for a core point
            cluster_selection_epsilon: cluster selection threshold
            metric: distance metric
            reduce_dim: target dimensionality for pre-reduction
                - None: no reduction (default; suitable for low-dimensional data)
                - "auto": reduce to 20 dims when dimensionality > reduce_dim_threshold
                - int: reduce to the given target dimensionality
            reduce_dim_threshold: dimensionality threshold for automatic reduction (default 50)
        """
        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.cluster_selection_epsilon = cluster_selection_epsilon
        self.metric = metric
        self.reduce_dim = reduce_dim
        self.reduce_dim_threshold = reduce_dim_threshold

    def _reduce_with_umap(self, embeddings: np.ndarray, n_components: int) -> np.ndarray:
        """Reduce dimensionality with UMAP."""
        try:
            import umap
        except ImportError:
            raise ImportError("Pre-reduction requires umap-learn: pip install umap-learn")

        logger.info(f"UMAP pre-reduction: {embeddings.shape[1]} dims -> {n_components} dims...")
        reducer = umap.UMAP(
            n_components=n_components,
            n_neighbors=15,
            min_dist=0.0,  # for clustering use, 0 preserves local structure
            metric="cosine",
            random_state=42,
        )
        return reducer.fit_transform(embeddings)

    def fit(self, embeddings: np.ndarray) -> ClusteringResult:
        try:
            import hdbscan
        except ImportError:
            raise ImportError("HDBSCAN requires: pip install hdbscan")

        original_dim = embeddings.shape[1]
        reduced_dim = None
        extra_info = {
            "min_cluster_size": self.min_cluster_size,
            "min_samples": self.min_samples,
        }

        # Decide whether pre-reduction is needed
        if self.reduce_dim == "auto":
            if original_dim > self.reduce_dim_threshold:
                reduced_dim = 20
        elif isinstance(self.reduce_dim, int):
            reduced_dim = self.reduce_dim

        # Run the pre-reduction
        if reduced_dim and reduced_dim < original_dim:
            embeddings_for_cluster = self._reduce_with_umap(embeddings, reduced_dim)
            extra_info["umap_reduced_dim"] = reduced_dim
            extra_info["original_dim"] = original_dim
        else:
            embeddings_for_cluster = embeddings

        logger.info(f"Running HDBSCAN clustering (min_cluster_size={self.min_cluster_size})...")

        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=self.min_cluster_size,
            min_samples=self.min_samples,
            cluster_selection_epsilon=self.cluster_selection_epsilon,
            metric=self.metric,
            cluster_selection_method='eom',
        )
        labels = clusterer.fit_predict(embeddings_for_cluster)

        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = (labels == -1).sum()

        logger.info(f"Found {n_clusters} clusters, {n_noise} noise points ({n_noise/len(labels)*100:.1f}%)")

        # Compute metrics in the reduced space
        metrics = self._compute_metrics(embeddings_for_cluster, labels)
        if metrics.silhouette:
            logger.info(f"Silhouette: {metrics.silhouette:.4f}, CH: {metrics.calinski_harabasz:.2f}, DB: {metrics.davies_bouldin:.4f}")

        return ClusteringResult(
            labels=labels,
            n_clusters=n_clusters,
            algorithm="hdbscan",
            silhouette=metrics.silhouette,
            calinski_harabasz=metrics.calinski_harabasz,
            davies_bouldin=metrics.davies_bouldin,
            n_noise=int(n_noise),
            extra_info=extra_info,
        )


class DBSCANClusterer(BaseClusterer):
    """
    DBSCAN clusterer.

    Density-based clustering; determines the number of clusters automatically.

    Example:
        >>> clusterer = DBSCANClusterer(eps=0.5, min_samples=5)
        >>> result = clusterer.fit(embeddings)
    """

    def __init__(
        self,
        eps: Union[float, Literal["auto"]] = "auto",
        min_samples: int = 5,
        metric: str = "euclidean",
    ):
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric

    def fit(self, embeddings: np.ndarray) -> ClusteringResult:
        eps = self.eps
        if eps == "auto":
            eps = self._auto_select_eps(embeddings)
            logger.info(f"Automatically selected eps={eps:.4f}")

        logger.info(f"Running DBSCAN clustering (eps={eps:.4f}, min_samples={self.min_samples})...")

        dbscan = DBSCAN(eps=eps, min_samples=self.min_samples, metric=self.metric)
        labels = dbscan.fit_predict(embeddings)

        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = (labels == -1).sum()

        logger.info(f"Found {n_clusters} clusters, {n_noise} noise points ({n_noise/len(labels)*100:.1f}%)")

        metrics = self._compute_metrics(embeddings, labels)
        if metrics.silhouette:
            logger.info(f"Silhouette: {metrics.silhouette:.4f}, CH: {metrics.calinski_harabasz:.2f}, DB: {metrics.davies_bouldin:.4f}")

        return ClusteringResult(
            labels=labels,
            n_clusters=n_clusters,
            algorithm="dbscan",
            silhouette=metrics.silhouette,
            calinski_harabasz=metrics.calinski_harabasz,
            davies_bouldin=metrics.davies_bouldin,
            n_noise=int(n_noise),
            extra_info={"eps": round(eps, 4), "min_samples": self.min_samples},
        )

    def _auto_select_eps(self, embeddings: np.ndarray) -> float:
        """Select eps automatically from the k-distance plot."""
        from sklearn.neighbors import NearestNeighbors

        nn = NearestNeighbors(n_neighbors=self.min_samples)
        nn.fit(embeddings)
        distances, _ = nn.kneighbors(embeddings)
        distances = np.sort(distances[:, -1])

        # Find the knee via the second-order difference
        d1 = np.diff(distances)
        d2 = np.diff(d1)
        knee_idx = np.argmax(d2) + 1
        return float(distances[knee_idx])


class AgglomerativeClusterer(BaseClusterer):
    """
    Hierarchical clusterer.

    Bottom-up agglomerative clustering.

    Example:
        >>> clusterer = AgglomerativeClusterer(n_clusters=10)
        >>> result = clusterer.fit(embeddings)
    """

    def __init__(
        self,
        n_clusters: Union[int, Literal["auto"]] = "auto",
        linkage: Literal["ward", "complete", "average", "single"] = "ward",
        distance_threshold: Optional[float] = None,
        k_range: Tuple[int, int] = (2, 30),
    ):
        self.n_clusters = n_clusters
        self.linkage = linkage
        self.distance_threshold = distance_threshold
        self.k_range = k_range

    def fit(self, embeddings: np.ndarray) -> ClusteringResult:
        extra_info = {"linkage": self.linkage}

        if self.distance_threshold is not None:
            logger.info(f"Running agglomerative clustering (distance_threshold={self.distance_threshold})...")
            agg = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=self.distance_threshold,
                linkage=self.linkage,
            )
            extra_info["distance_threshold"] = self.distance_threshold
        else:
            if self.n_clusters == "auto":
                n_clusters = self._auto_select_k(embeddings)
                logger.info(f"Automatically selected k={n_clusters}")
            else:
                n_clusters = self.n_clusters

            logger.info(f"Running agglomerative clustering (n_clusters={n_clusters}, linkage={self.linkage})...")
            agg = AgglomerativeClustering(n_clusters=n_clusters, linkage=self.linkage)

        labels = agg.fit_predict(embeddings)
        n_clusters = len(set(labels))

        metrics = self._compute_metrics(embeddings, labels)
        if metrics.silhouette:
            logger.info(f"Silhouette: {metrics.silhouette:.4f}, CH: {metrics.calinski_harabasz:.2f}, DB: {metrics.davies_bouldin:.4f}")

        return ClusteringResult(
            labels=labels,
            n_clusters=n_clusters,
            algorithm="agglomerative",
            silhouette=metrics.silhouette,
            calinski_harabasz=metrics.calinski_harabasz,
            davies_bouldin=metrics.davies_bouldin,
            extra_info=extra_info,
        )

    def _auto_select_k(self, embeddings: np.ndarray) -> int:
        """Select k automatically using the silhouette coefficient."""
        n_samples = len(embeddings)
        k_min, k_max = self.k_range
        k_max = min(k_max, n_samples - 1)

        logger.info(f"Searching for the optimal k (range: {k_min}-{k_max})...")
        best_k, best_score = k_min, -1

        for k in range(k_min, k_max + 1):
            agg = AgglomerativeClustering(n_clusters=k, linkage=self.linkage)
            labels = agg.fit_predict(embeddings)
            score = silhouette_score(embeddings, labels)
            if score > best_score:
                best_k, best_score = k, score

        return best_k


def create_clusterer(
    algorithm: Literal["kmeans", "hdbscan", "dbscan", "agglomerative"] = "kmeans",
    **kwargs
) -> BaseClusterer:
    """
    Factory function for creating clusterers.

    Args:
        algorithm: clustering algorithm
        **kwargs: parameters forwarded to the clusterer

    Returns:
        BaseClusterer: clusterer instance

    Example:
        >>> clusterer = create_clusterer("hdbscan", min_cluster_size=20)
        >>> result = clusterer.fit(embeddings)
    """
    clusterers = {
        "kmeans": KMeansClusterer,
        "hdbscan": HDBSCANClusterer,
        "dbscan": DBSCANClusterer,
        "agglomerative": AgglomerativeClusterer,
    }

    if algorithm not in clusterers:
        raise ValueError(f"Unknown algorithm: {algorithm}, supported: {list(clusterers.keys())}")

    return clusterers[algorithm](**kwargs)
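
For orientation, a minimal usage sketch of the file above. It is not part of the package diff: the import path maque.clustering.clusterers is assumed from the wheel's file listing, and synthetic Gaussian blobs stand in for real embeddings.

# Sketch only, not shipped in the wheel; assumes the module path from the listing above.
import numpy as np
from maque.clustering.clusterers import create_clusterer

# Three synthetic 64-dimensional "embedding" blobs stand in for real data.
rng = np.random.default_rng(42)
embeddings = np.vstack([
    rng.normal(loc=c, scale=0.05, size=(100, 64)) for c in (0.0, 0.5, 1.0)
])

# n_clusters="auto" searches k_range by silhouette score, as _auto_select_k does above.
clusterer = create_clusterer("kmeans", n_clusters="auto", k_range=(2, 6))
result = clusterer.fit(embeddings)
print(result.n_clusters, result.silhouette, result.extra_info["k_scores"])
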
maque/clustering/sampler.py (new file, +134 lines)
@@ -0,0 +1,134 @@

# -*- coding: utf-8 -*-
"""
Cluster sample sampler.
"""

from dataclasses import dataclass
from typing import List, Literal, Optional, Union

import numpy as np
from loguru import logger


@dataclass
class ClusterSample:
    """Sample information for a single cluster."""
    label: Union[int, str]
    count: int
    sample_indices: List[int]
    sample_ids: Optional[List[str]] = None
    sample_docs: Optional[List[str]] = None


class CentroidSampler:
    """
    Centroid-based sample sampler.

    Selects the samples closest to the centroid from each cluster; these are the most representative of the cluster.

    Example:
        >>> sampler = CentroidSampler(n_samples=5, metric="cosine")
        >>> samples = sampler.sample(embeddings, labels)
        >>> for s in samples:
        ...     print(f"Cluster {s.label}: {s.count} samples")
    """

    def __init__(
        self,
        n_samples: int = 5,
        max_doc_length: int = 500,
        metric: Literal["euclidean", "cosine"] = "cosine",
    ):
        """
        Args:
            n_samples: number of samples drawn per cluster
            max_doc_length: truncation length for document text
            metric: distance metric, "euclidean" or "cosine" (recommended for text embeddings)
        """
        self.n_samples = n_samples
        self.max_doc_length = max_doc_length
        self.metric = metric

    def sample(
        self,
        embeddings: np.ndarray,
        labels: np.ndarray,
        ids: Optional[List[str]] = None,
        documents: Optional[List[str]] = None,
    ) -> List[ClusterSample]:
        """
        Sample the points closest to the centroid from each cluster.

        Args:
            embeddings: embedding matrix
            labels: cluster labels
            ids: list of sample IDs
            documents: list of document contents

        Returns:
            list[ClusterSample]: sample information for each cluster
        """
        unique_labels = sorted(set(labels))
        samples = []

        logger.debug(f"Sampling from {len(unique_labels)} clusters...")

        for label in unique_labels:
            mask = labels == label
            cluster_indices = np.where(mask)[0]
            cluster_embeddings = embeddings[mask]

            if label == -1:
                # Noise points: take the first n
                selected_indices = cluster_indices[:self.n_samples].tolist()
            else:
                # Compute the centroid and pick the nearest samples
                centroid = cluster_embeddings.mean(axis=0)
                distances = self._compute_distances(cluster_embeddings, centroid)
                nearest_indices = np.argsort(distances)[:self.n_samples]
                selected_indices = cluster_indices[nearest_indices].tolist()

            # Build the sample record
            sample = ClusterSample(
                label="noise" if label == -1 else label,
                count=int(mask.sum()),
                sample_indices=selected_indices,
            )

            if ids is not None:
                sample.sample_ids = [ids[i] for i in selected_indices]

            if documents is not None:
                sample.sample_docs = [
                    self._truncate(documents[i]) for i in selected_indices
                ]

            samples.append(sample)

        return samples

    def _compute_distances(self, embeddings: np.ndarray, centroid: np.ndarray) -> np.ndarray:
        """
        Compute the distance from each sample to the centroid.

        Args:
            embeddings: sample embedding matrix (n, dim)
            centroid: centroid vector (dim,)

        Returns:
            distance array of shape (n,)
        """
        if self.metric == "cosine":
            # cosine distance = 1 - cosine similarity
            norm_emb = embeddings / (np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-10)
            norm_cen = centroid / (np.linalg.norm(centroid) + 1e-10)
            return 1 - np.dot(norm_emb, norm_cen)
        else:
            # Euclidean distance
            return np.linalg.norm(embeddings - centroid, axis=1)

    def _truncate(self, text: str) -> str:
        """Truncate text."""
        if len(text) > self.max_doc_length:
            return text[:self.max_doc_length] + "..."
        return text
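
And a matching sketch that chains a clusterer with CentroidSampler, under the same assumed import paths; the documents list is a synthetic placeholder.

# Sketch only, not shipped in the wheel; assumes the module paths from the listing above.
import numpy as np
from maque.clustering.clusterers import create_clusterer
from maque.clustering.sampler import CentroidSampler

rng = np.random.default_rng(42)
embeddings = rng.normal(size=(200, 32))
documents = [f"doc-{i}" for i in range(len(embeddings))]  # placeholder documents

result = create_clusterer("kmeans", n_clusters=5).fit(embeddings)

# Pick the 3 samples nearest each cluster centroid (cosine distance).
sampler = CentroidSampler(n_samples=3, metric="cosine")
for s in sampler.sample(embeddings, result.labels, documents=documents):
    print(s.label, s.count, s.sample_docs)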