maque-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/cli/script.py ADDED
@@ -0,0 +1,52 @@
+ import os
+ from rich import print
+ import subprocess
+ from pathlib import Path
+
+
+ def source_and_run(filename):
+     # Source the file in a bash subshell, dump its environment, and adopt it.
+     command = f'source {filename}; env'
+     pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True, executable="/bin/bash")
+     output = pipe.communicate()[0]
+     env = dict(line.decode().split('=', 1) for line in output.splitlines())
+     os.environ.update(env)
+
+
+ def install_node_with_nvm(version='16'):
+     args = ("curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.5/install.sh | bash && "
+             "source ~/.nvm/nvm.sh && "
+             f"nvm install {version}")
+     os.system(args)
+     print(f"Node.js installed successfully; run `nvm use {version}` to use Node.js v{version}")
+
+
+ def install_nvim(version="0.9.2"):
+     nvim_dir = Path("~/software/nvim/").expanduser()
+     if not nvim_dir.exists():
+         nvim_dir.mkdir(parents=True)
+
+     args = (
+         f"wget https://github.com/neovim/neovim/releases/download/v{version}/nvim.appimage && "
+         "sudo mv nvim.appimage ~/software/nvim/ && "
+         "chmod u+x ~/software/nvim/nvim.appimage && "
+         "sudo ln -s ~/software/nvim/nvim.appimage /usr/local/bin/nvim && "
+         "sudo ln -s ~/software/nvim/nvim.appimage /usr/bin/nvim"  # If your system does not have FUSE you can extract the appimage: https://github.com/AppImage/AppImageKit/wiki/FUSE#install-fuse
+     )
+     os.system(args)
+     print(f"neovim {version} installed successfully.")
+
+
+ def uninstall_nvim():
+     args = (
+         "sudo rm -f /usr/local/bin/nvim && "
+         "sudo rm -f /usr/bin/nvim && "
+         "rm -f ~/software/nvim/nvim.appimage"
+     )
+     os.system(args)
+     print("neovim uninstalled successfully.")
+
+
+ def install_make():
+     os.system("sudo apt install build-essential")
+
+
+ def install_cargo():
+     os.system("curl https://sh.rustup.rs -sSf | sh")
maque/cli/tree.py ADDED
@@ -0,0 +1,49 @@
+ import pathlib
+ from rich import print
+ from rich.filesize import decimal
+ from rich.markup import escape
+ from rich.text import Text
+ from rich.tree import Tree
+
+
+ def walk_directory(directory: pathlib.Path, tree: Tree) -> None:
+     """Recursively build a Tree with directory contents."""
+     # Sort dirs first, then by filename
+     paths = sorted(
+         pathlib.Path(directory).iterdir(),
+         key=lambda path: (path.is_file(), path.name.lower()),
+     )
+     for path in paths:
+         # Skip hidden files
+         if path.name.startswith("."):
+             continue
+         if path.is_dir():
+             style = "dim" if path.name.startswith("__") else ""
+             branch = tree.add(
+                 f"[bold magenta]:open_file_folder: [link file://{path}]{escape(path.name)}",
+                 style=style,
+                 guide_style=style,
+             )
+             walk_directory(path, branch)
+         else:
+             text_filename = Text(path.name, "green")
+             text_filename.highlight_regex(r"\..*$", "bold red")
+             text_filename.stylize(f"link file://{path}")
+             file_size = path.stat().st_size
+             text_filename.append(f" ({decimal(file_size)})", "blue")
+             icon = "🐍 " if path.suffix == ".py" else "📄 "
+             tree.add(Text(icon) + text_filename)
+
+
+ def print_directory_tree(directory="."):
+     directory = pathlib.Path(directory).absolute()
+     tree = Tree(
+         f":open_file_folder: [link file://{directory}]{directory}",
+         guide_style="bold bright_blue",
+     )
+     walk_directory(directory, tree)
+     print(tree)
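A quick usage sketch of the renderer above (any readable directory works):

    from maque.cli.tree import print_directory_tree

    # Renders a rich Tree of the current directory: hidden entries are skipped,
    # directories sort before files, and .py files get a snake icon plus file size.
    print_directory_tree(".")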
maque/clustering/__init__.py ADDED
@@ -0,0 +1,52 @@
+ # -*- coding: utf-8 -*-
+ """
+ Clustering analysis module.
+
+ Provides general-purpose clustering analysis tools, supporting multiple algorithms and visualization methods.
+
+ Clustering algorithms:
+ - K-Means: classic algorithm; the number of clusters must be specified
+ - HDBSCAN: determines the number of clusters automatically; robust to noise
+ - DBSCAN: density-based; determines the number of clusters automatically
+ - Agglomerative: hierarchical clustering
+
+ Visualization methods:
+ - UMAP: fast, preserves global structure (recommended)
+ - t-SNE: preserves local structure
+ - PCA: fastest, linear dimensionality reduction
+ """
+
+ from .clusterers import (
+     BaseClusterer,
+     ClusteringResult,
+     ClusterMetrics,
+     KMeansClusterer,
+     HDBSCANClusterer,
+     DBSCANClusterer,
+     AgglomerativeClusterer,
+     create_clusterer,
+ )
+ from .sampler import CentroidSampler, ClusterSample
+ from .visualizer import ClusterVisualizer, DimReductionMethod
+ from .analyzer import ClusterAnalyzer, ClusterResult
+
+ __all__ = [
+     # Clusterers
+     "BaseClusterer",
+     "ClusteringResult",
+     "ClusterMetrics",
+     "KMeansClusterer",
+     "HDBSCANClusterer",
+     "DBSCANClusterer",
+     "AgglomerativeClusterer",
+     "create_clusterer",
+     # Samplers
+     "CentroidSampler",
+     "ClusterSample",
+     # Visualization
+     "ClusterVisualizer",
+     "DimReductionMethod",
+     # Analyzers
+     "ClusterAnalyzer",
+     "ClusterResult",
+ ]
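As a rough guide to the trade-offs listed in the module docstring, a minimal sketch of choosing an algorithm through these exports (the embeddings are random placeholders; constructor arguments follow the `ClusterAnalyzer` signature shown in `analyzer.py` below):

    import numpy as np
    from maque.clustering import ClusterAnalyzer

    embeddings = np.random.rand(1000, 384).astype("float32")  # placeholder vectors

    # Known cluster count: K-Means plus UMAP visualization
    analyzer = ClusterAnalyzer(algorithm="kmeans", n_clusters=20, viz_method="umap")

    # Unknown cluster count or noisy data: HDBSCAN chooses the count itself
    analyzer = ClusterAnalyzer(algorithm="hdbscan", min_cluster_size=50)

    result = analyzer.analyze(embeddings)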
maque/clustering/analyzer.py ADDED
@@ -0,0 +1,347 @@
+ # -*- coding: utf-8 -*-
+ """
+ Cluster analyzer.
+
+ A high-level analyzer that combines clustering, sampling, and visualization.
+ """
+
+ import json
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import List, Literal, Optional, Tuple, Union
+
+ import numpy as np
+ from loguru import logger
+
+ from .clusterers import (
+     BaseClusterer,
+     ClusteringResult,
+     KMeansClusterer,
+     HDBSCANClusterer,
+     DBSCANClusterer,
+     AgglomerativeClusterer,
+     create_clusterer,
+ )
+ from .sampler import CentroidSampler, ClusterSample
+ from .visualizer import ClusterVisualizer, DimReductionMethod
+
+
+ AlgorithmType = Literal["kmeans", "hdbscan", "dbscan", "agglomerative"]
+
+
+ @dataclass
+ class ClusterResult:
+     """Clustering analysis result."""
+     n_samples: int
+     embedding_dim: int
+     n_clusters: int
+     algorithm: str
+     silhouette: Optional[float]
+     calinski_harabasz: Optional[float]
+     davies_bouldin: Optional[float]
+     n_noise: int
+     clusters: List[ClusterSample]
+     algorithm_info: dict = field(default_factory=dict)
+
+     def to_dict(self) -> dict:
+         """Convert to a dictionary."""
+         return {
+             "n_samples": self.n_samples,
+             "embedding_dim": self.embedding_dim,
+             "n_clusters": self.n_clusters,
+             "algorithm": self.algorithm,
+             "metrics": {
+                 "silhouette": self.silhouette,
+                 "calinski_harabasz": self.calinski_harabasz,
+                 "davies_bouldin": self.davies_bouldin,
+             },
+             "n_noise": self.n_noise,
+             "algorithm_info": self.algorithm_info,
+             "clusters": {
+                 str(c.label): {
+                     "count": c.count,
+                     "sample_ids": c.sample_ids,
+                     "sample_docs": c.sample_docs,
+                 }
+                 for c in self.clusters
+             },
+         }
+
+     def save(self, path: Union[str, Path]) -> None:
+         """Save the result to a JSON file."""
+         path = Path(path)
+         path.parent.mkdir(parents=True, exist_ok=True)
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump(self.to_dict(), f, ensure_ascii=False, indent=2)
+         logger.info(f"Result saved to: {path}")
+
+
+ class ClusterAnalyzer:
+     """
+     Cluster analyzer.
+
+     Supports multiple clustering algorithms and visualization methods.
+
+     Example:
+         >>> # K-Means clustering + UMAP visualization
+         >>> analyzer = ClusterAnalyzer(algorithm="kmeans", n_clusters=20, viz_method="umap")
+         >>> result = analyzer.analyze(embeddings, ids=ids, documents=docs)
+
+         >>> # HDBSCAN automatic clustering
+         >>> analyzer = ClusterAnalyzer(algorithm="hdbscan", min_cluster_size=50)
+         >>> result = analyzer.analyze(embeddings)
+
+         >>> # Analyze data stored in ChromaDB
+         >>> result = analyzer.analyze_chroma(
+         ...     persist_dir="./chroma_db",
+         ...     collection_name="my_collection",
+         ... )
+     """
+
+     def __init__(
+         self,
+         algorithm: AlgorithmType = "kmeans",
+         viz_method: DimReductionMethod = "umap",
+         n_samples: int = 5,
+         max_doc_length: int = 500,
+         random_state: int = 42,
+         # K-Means / Agglomerative parameters
+         n_clusters: Union[int, Literal["auto"]] = "auto",
+         k_range: Tuple[int, int] = (2, 30),
+         # HDBSCAN parameters
+         min_cluster_size: int = 15,
+         reduce_dim: Union[int, Literal["auto"], None] = "auto",
+         # DBSCAN parameters
+         eps: Union[float, Literal["auto"]] = "auto",
+         min_samples: int = 5,
+         # Agglomerative parameters
+         linkage: Literal["ward", "complete", "average", "single"] = "ward",
+     ):
+         """
+         Args:
+             algorithm: clustering algorithm ("kmeans", "hdbscan", "dbscan", "agglomerative")
+             viz_method: dimensionality-reduction method for visualization ("umap", "tsne", "pca")
+             n_samples: number of samples drawn from each cluster
+             max_doc_length: document truncation length
+             random_state: random seed
+             n_clusters: number of clusters (kmeans/agglomerative)
+             k_range: range for automatic k selection (kmeans/agglomerative)
+             min_cluster_size: minimum cluster size (hdbscan)
+             reduce_dim: pre-clustering reduction dimension for HDBSCAN ("auto"/int/None)
+             eps: neighborhood radius (dbscan)
+             min_samples: minimum number of samples (dbscan/hdbscan)
+             linkage: linkage criterion (agglomerative)
+         """
+         self.algorithm = algorithm
+         self.random_state = random_state
+
+         # Build the clusterer
+         if algorithm == "kmeans":
+             self.clusterer = KMeansClusterer(
+                 n_clusters=n_clusters,
+                 k_range=k_range,
+                 random_state=random_state,
+             )
+         elif algorithm == "hdbscan":
+             self.clusterer = HDBSCANClusterer(
+                 min_cluster_size=min_cluster_size,
+                 min_samples=min_samples,
+                 reduce_dim=reduce_dim,
+             )
+         elif algorithm == "dbscan":
+             self.clusterer = DBSCANClusterer(
+                 eps=eps,
+                 min_samples=min_samples,
+             )
+         elif algorithm == "agglomerative":
+             self.clusterer = AgglomerativeClusterer(
+                 n_clusters=n_clusters,
+                 linkage=linkage,
+                 k_range=k_range,
+             )
+         else:
+             raise ValueError(f"Unknown algorithm: {algorithm}")
+
+         self.sampler = CentroidSampler(
+             n_samples=n_samples,
+             max_doc_length=max_doc_length,
+             metric="cosine",  # cosine distance is recommended for text embeddings
+         )
+         self.visualizer = ClusterVisualizer(
+             method=viz_method,
+             random_state=random_state,
+         )
+
+     def analyze(
+         self,
+         embeddings: np.ndarray,
+         ids: Optional[List[str]] = None,
+         documents: Optional[List[str]] = None,
+         output_dir: Union[str, Path, None] = None,
+         name: str = "cluster",
+     ) -> ClusterResult:
+         """
+         Run the clustering analysis.
+
+         Args:
+             embeddings: embedding matrix (n_samples, n_features)
+             ids: list of sample IDs
+             documents: list of document contents
+             output_dir: output directory (optional)
+             name: prefix for result file names
+
+         Returns:
+             ClusterResult: the clustering analysis result
+         """
+         n_samples, embedding_dim = embeddings.shape
+         logger.info(f"Starting clustering analysis: {n_samples} samples, {embedding_dim}-dim vectors, algorithm={self.algorithm}")
+
+         # 1. Cluster
+         cluster_result = self.clusterer.fit(embeddings)
+
+         # 2. Sample representative points from each cluster
+         logger.info("Sampling representative points from each cluster...")
+         samples = self.sampler.sample(
+             embeddings, cluster_result.labels,
+             ids=ids, documents=documents,
+         )
+
+         # 3. Log cluster-size statistics
+         self._log_cluster_stats(samples)
+
+         # 4. Assemble the result
+         result = ClusterResult(
+             n_samples=n_samples,
+             embedding_dim=embedding_dim,
+             n_clusters=cluster_result.n_clusters,
+             algorithm=cluster_result.algorithm,
+             silhouette=cluster_result.silhouette,
+             calinski_harabasz=cluster_result.calinski_harabasz,
+             davies_bouldin=cluster_result.davies_bouldin,
+             n_noise=cluster_result.n_noise,
+             clusters=samples,
+             algorithm_info=cluster_result.extra_info,
+         )
+
+         # 5. Save outputs
+         if output_dir:
+             output_dir = Path(output_dir)
+             output_dir.mkdir(parents=True, exist_ok=True)
+
+             # Save JSON summary
+             result.save(output_dir / f"{name}_{self.algorithm}.json")
+
+             # Save visualization
+             self.visualizer.plot(
+                 embeddings, cluster_result.labels,
+                 output_dir / f"{name}_{self.algorithm}.png",
+                 title=f"{name} ({self.algorithm}, n={cluster_result.n_clusters})",
+             )
+
+         logger.info("Clustering analysis finished")
+         return result
+
+     def analyze_chroma(
+         self,
+         persist_dir: Union[str, Path],
+         collection_name: str,
+         output_dir: Union[str, Path, None] = None,
+         name: Optional[str] = None,
+         sample_size: Optional[int] = None,
+         where: Optional[dict] = None,
+         where_document: Optional[dict] = None,
+     ) -> ClusterResult:
+         """
+         Load data from ChromaDB and run the clustering analysis.
+
+         Args:
+             persist_dir: ChromaDB persistence directory
+             collection_name: collection name
+             output_dir: output directory (optional)
+             name: prefix for result file names (defaults to collection_name)
+             sample_size: cap on the number of samples to load (optional, for large datasets)
+             where: metadata filter (optional), e.g. {"category": "tech"}
+             where_document: document-content filter (optional), e.g. {"$contains": "python"}
+
+         Returns:
+             ClusterResult: the clustering analysis result
+
+         Example:
+             >>> # Analyze the full collection
+             >>> analyzer = ClusterAnalyzer(algorithm="hdbscan")
+             >>> result = analyzer.analyze_chroma(
+             ...     persist_dir="./chroma_db",
+             ...     collection_name="my_data",
+             ... )
+
+             >>> # Sampled analysis (for large datasets)
+             >>> result = analyzer.analyze_chroma(
+             ...     persist_dir="./chroma_db",
+             ...     collection_name="my_data",
+             ...     sample_size=10000,
+             ... )
+
+             >>> # Filter to a specific category
+             >>> result = analyzer.analyze_chroma(
+             ...     persist_dir="./chroma_db",
+             ...     collection_name="my_data",
+             ...     where={"category": "tech"},
+             ... )
+         """
+         try:
+             import chromadb
+         except ImportError:
+             raise ImportError("Please install chromadb: pip install chromadb")
+
+         logger.info(f"Loading data from ChromaDB: {collection_name}")
+
+         client = chromadb.PersistentClient(path=str(persist_dir))
+         collection = client.get_collection(collection_name)
+
+         # Build the query arguments
+         query_kwargs = {
+             "include": ["embeddings", "documents", "metadatas"]
+         }
+
+         if sample_size is not None:
+             query_kwargs["limit"] = sample_size
+             logger.info(f"Limiting load to the first {sample_size} records")
+
+         if where is not None:
+             query_kwargs["where"] = where
+             logger.info(f"Applying metadata filter: {where}")
+
+         if where_document is not None:
+             query_kwargs["where_document"] = where_document
+             logger.info(f"Applying document filter: {where_document}")
+
+         # Load the data
+         results = collection.get(**query_kwargs)
+
+         ids = results["ids"]
+         embeddings = np.array(results["embeddings"])
+         documents = results["documents"]
+
+         logger.info(f"Loaded {len(embeddings)} vectors, dim={embeddings.shape[1]}")
+
+         return self.analyze(
+             embeddings,
+             ids=ids,
+             documents=documents,
+             output_dir=output_dir,
+             name=name or collection_name,
+         )
+
+     def _log_cluster_stats(self, samples: List[ClusterSample]) -> None:
+         """Log cluster-size statistics."""
+         total = sum(s.count for s in samples)
+         sorted_samples = sorted(samples, key=lambda s: -s.count)
+
+         logger.info("Cluster size distribution:")
+         for s in sorted_samples[:10]:
+             pct = s.count / total * 100
+             label = "Noise" if s.label == "noise" else f"Cluster {s.label}"
+             logger.info(f"  {label}: {s.count} ({pct:.1f}%)")
+
+         if len(sorted_samples) > 10:
+             logger.info(f"  ... plus {len(sorted_samples) - 10} more clusters")