maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/performance/_profiler.py
@@ -0,0 +1,367 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Modern performance profilers

Two profilers are provided:
1. Profile - lightweight CPU profiling, based on pyinstrument
2. ScaleneProfile - full CPU + memory + GPU profiling, based on Scalene

Example:
    # pyinstrument (lightweight)
    with Profile("data processing") as p:
        process_data()

    # Scalene (comprehensive)
    with ScaleneProfile("memory profiling", memory=True):
        process_large_data()
"""

from typing import Optional, Literal
from pathlib import Path
import functools
import subprocess
import sys
import tempfile
import os

try:
    from pyinstrument import Profiler
    PYINSTRUMENT_AVAILABLE = True
except ImportError:
    PYINSTRUMENT_AVAILABLE = False
    Profiler = None

try:
    from scalene import scalene_profiler
    SCALENE_AVAILABLE = True
except ImportError:
    SCALENE_AVAILABLE = False
    scalene_profiler = None


OutputFormat = Literal["text", "html", "json", "speedscope"]


class Profile:
    """
    Modern performance profiler

    Sampling profiler based on pyinstrument: low overhead, supports async code.

    Example:
        >>> with Profile("task name") as p:
        ...     time.sleep(0.1)
        ...     do_something()

        # View the HTML report
        >>> p.open_in_browser()

        # Save the report
        >>> p.save("report.html")
    """

    def __init__(
        self,
        name: str = "",
        *,
        interval: float = 0.001,  # sampling interval (seconds)
        async_mode: str = "enabled",  # enabled, disabled, strict
        show: bool = True,  # print the report automatically on exit
        show_all: bool = False,  # show all frames (including library code)
        timeline: bool = False,  # timeline mode
        output: OutputFormat = "text",  # output format
    ):
        if not PYINSTRUMENT_AVAILABLE:
            raise ImportError(
                "pyinstrument is not installed; run: pip install pyinstrument"
            )

        self.name = name
        self.show = show
        self.show_all = show_all
        self.output = output
        self._profiler = Profiler(interval=interval, async_mode=async_mode)
        self._timeline = timeline

    def __enter__(self):
        self._profiler.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._profiler.stop()
        if self.show:
            self.print()
        return False

    def start(self):
        """Start profiling manually"""
        self._profiler.start()
        return self

    def stop(self):
        """Stop profiling manually"""
        self._profiler.stop()
        return self

    def print(self, **kwargs):
        """Print the profiling report to the terminal"""
        if self.name:
            print(f"\n{'='*20} {self.name} {'='*20}")
        print(self._profiler.output_text(
            unicode=True,
            color=True,
            show_all=self.show_all,
            timeline=self._timeline,
            **kwargs
        ))

    def to_html(self) -> str:
        """Generate an HTML report"""
        return self._profiler.output_html()

    def to_text(self, **kwargs) -> str:
        """Generate a text report"""
        return self._profiler.output_text(
            unicode=True,
            show_all=self.show_all,
            timeline=self._timeline,
            **kwargs
        )

    def to_json(self) -> str:
        """Generate a JSON report (for programmatic analysis)"""
        import json
        return json.dumps(self._profiler.last_session.frame_records, indent=2)

    def save(self, path: str):
        """
        Save the report to a file

        The format is picked automatically from the file extension:
        - .html -> interactive HTML report
        - .txt -> text report
        - .json -> JSON data
        """
        path = Path(path)
        suffix = path.suffix.lower()

        if suffix == ".html":
            content = self.to_html()
        elif suffix == ".json":
            content = self.to_json()
        else:
            content = self.to_text()

        path.write_text(content, encoding="utf-8")
        print(f"Report saved: {path}")

    def open_in_browser(self):
        """Open the interactive HTML report in a browser"""
        self._profiler.open_in_browser(timeline=self._timeline)

    @property
    def session(self):
        """Raw session object, for advanced use"""
        return self._profiler.last_session


def profile(
    func=None,
    *,
    show: bool = True,
    show_all: bool = False,
    save_to: Optional[str] = None,
):
    """
    Function decorator - profile a function

    Example:
        @profile
        def slow_function():
            ...

        @profile(show=False, save_to="report.html")
        def another_function():
            ...
    """
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            with Profile(fn.__name__, show=show, show_all=show_all) as p:
                result = fn(*args, **kwargs)
            if save_to:
                p.save(save_to)
            return result

        @functools.wraps(fn)
        async def async_wrapper(*args, **kwargs):
            with Profile(fn.__name__, show=show, show_all=show_all) as p:
                result = await fn(*args, **kwargs)
            if save_to:
                p.save(save_to)
            return result

        import asyncio
        if asyncio.iscoroutinefunction(fn):
            return async_wrapper
        return wrapper

    if func is not None:
        return decorator(func)
    return decorator


class ScaleneProfile:
    """
    Comprehensive profiler, based on Scalene

    Supports CPU + memory + GPU profiling and automatic memory-leak detection.

    Example:
        # Basic usage
        with ScaleneProfile("data processing"):
            process_data()

        # With memory profiling
        with ScaleneProfile("memory-heavy task", memory=True, gpu=True):
            train_model()

        # Generate an HTML report
        with ScaleneProfile("analysis", output="report.html"):
            heavy_work()

    Note:
        Scalene is a sampling profiler, so code that runs for < 1 second may be under-sampled.
        It is best applied to longer-running code blocks.
    """

    def __init__(
        self,
        name: str = "",
        *,
        cpu: bool = True,
        memory: bool = False,
        gpu: bool = False,
        output: Optional[str] = None,  # HTML report path
        reduced_profile: bool = False,  # only show lines with performance issues
    ):
        if not SCALENE_AVAILABLE:
            raise ImportError(
                "scalene is not installed; run: pip install scalene"
            )

        self.name = name
        self.cpu = cpu
        self.memory = memory
        self.gpu = gpu
        self.output = output
        self.reduced_profile = reduced_profile

    def __enter__(self):
        if self.name:
            print(f"\n🔬 Scalene profiling started: {self.name}")
        scalene_profiler.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        scalene_profiler.stop()
        if self.name:
            print(f"✅ Scalene profiling finished: {self.name}")
        return False

    def start(self):
        """Start profiling manually"""
        scalene_profiler.start()
        return self

    def stop(self):
        """Stop profiling manually"""
        scalene_profiler.stop()
        return self


def scalene_run(
    script: str,
    *args,
    cpu: bool = True,
    memory: bool = True,
    gpu: bool = False,
    output: Optional[str] = None,
    reduced: bool = False,
    **kwargs,
) -> subprocess.CompletedProcess:
    """
    Run a Python script under Scalene (the recommended way)

    Scalene's full feature set requires a command-line launch; this function wraps that invocation.

    Example:
        # Profile a script
        scalene_run("train.py", "--epochs", "10", output="report.html")

        # Profile a module
        scalene_run("-m", "pytest", "tests/", memory=True)

    Args:
        script: Python script path, or -m plus a module name
        *args: arguments passed through to the script
        cpu: profile CPU (default True)
        memory: profile memory (default True)
        gpu: profile GPU
        output: HTML report output path
        reduced: only show problematic lines

    Returns:
        subprocess.CompletedProcess object
    """
    cmd = [sys.executable, "-m", "scalene"]

    if not cpu:
        cmd.append("--cpu-only")
    if memory:
        cmd.append("--memory")
    if gpu:
        cmd.append("--gpu")
    if reduced:
        cmd.append("--reduced-profile")
    if output:
        cmd.extend(["--html", "--outfile", output])

    cmd.append("---")  # separates Scalene's arguments from the script's
    cmd.append(script)
    cmd.extend(args)

    print(f"🚀 Running: {' '.join(cmd)}")
    return subprocess.run(cmd, **kwargs)


def scalene_profile(
    func=None,
    *,
    memory: bool = False,
    gpu: bool = False,
):
    """
    Scalene function decorator

    Note: this only takes effect when the script is launched with the `scalene` command.

    Example:
        @scalene_profile(memory=True)
        def process_data():
            ...

        # Run with: scalene script.py
    """
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            with ScaleneProfile(fn.__name__, memory=memory, gpu=gpu):
                return fn(*args, **kwargs)
        return wrapper

    if func is not None:
        return decorator(func)
    return decorator
maque/performance/_stat_memory.py
@@ -0,0 +1,51 @@
import time
import psutil

_scale = {'KB': 1024, 'MB': 1024 * 1024, 'GB': 1024 * 1024 * 1024, }


def get_virtual_memory(unit='GB'):
    memory_tuple = psutil.virtual_memory()
    _key = unit.upper()
    memory_dict = {
        "total": (memory_tuple[0] / _scale[_key], _key),
        "available": (memory_tuple[1] / _scale[_key], _key),
        "percent": (memory_tuple[2], '%'),
        "used": (memory_tuple[3] / _scale[_key], _key),
        "free": (memory_tuple[4] / _scale[_key], _key),
    }
    return memory_dict


def elapsed_since(start):
    return time.strftime("%H:%M:%S", time.gmtime(time.time() - start))


def get_process_memory(unit='MB'):
    return psutil.Process().memory_info().rss / _scale[unit.upper()]


def _profile(func):
    """
    Note: this decorator only captures the memory delta measured after the function
    exits; it cannot see memory growth inside the function, so don't use it for that.
    For convenient line-level debugging of a function's memory use, use the
    memory_profiler package instead:
        from memory_profiler import profile
        @profile
        def func():
            ...
    """

    def wrapper(*args, **kwargs):
        _key = "MB"
        mem_before = get_process_memory(_key)
        start = time.time()
        result = func(*args, **kwargs)
        elapsed_time = elapsed_since(start)
        mem_after = get_process_memory(_key)
        print(
            f"{func.__name__}: memory before: {mem_before:.2f} {_key}, after: {mem_after:.2f} {_key}, consumed: {mem_after - mem_before:.2f} {_key}; exec time: {elapsed_time}"
        )
        return result

    return wrapper
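
A quick sketch of how these helpers compose (psutil is the only dependency; the underscore-prefixed import path mirrors the file name and is an assumption):

    from maque.performance._stat_memory import (
        get_virtual_memory, get_process_memory, _profile)

    print(get_virtual_memory("GB"))           # e.g. {'total': (15.6, 'GB'), 'percent': (42.1, '%'), ...}
    print(f"RSS: {get_process_memory('MB'):.1f} MB")

    @_profile
    def build_list():
        # allocation that survives the call, so it shows up as an RSS delta
        return [0] * 10_000_000

    build_list()  # prints before/after RSS and wall time, per the docstring's caveat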
maque/pipelines/__init__.py
@@ -0,0 +1,15 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Pipelines module - end-to-end convenience pipelines

These pipelines are an optional convenience layer that wraps common workflow patterns.
Users can still use the lower-level APIs (embedding, retriever, clustering) for full customization.
"""

from .clustering import ClusteringPipeline

__all__ = [
    "ClusteringPipeline",
]
maque/pipelines/clustering.py
@@ -0,0 +1,252 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Clustering analysis pipeline

End-to-end convenience interface for vectorization + clustering.
This is an optional high-level wrapper; the lower-level APIs remain available for full customization.
"""

from pathlib import Path
from typing import List, Optional, Union

import numpy as np
from loguru import logger

try:
    import chromadb
except ImportError:
    chromadb = None

from ..embedding.base import BaseEmbedding
from ..retriever import ChromaRetriever, Document
from ..clustering import ClusterAnalyzer, ClusterResult


class ClusteringPipeline:
    """
    End-to-end vectorization + clustering pipeline

    Wraps the common workflow: load data → vectorize → store → cluster analysis

    This is an optional convenience layer; the underlying components remain available:
    - TextEmbedding: vectorization
    - ChromaRetriever: vector storage and retrieval
    - ClusterAnalyzer: clustering analysis

    Example:
        >>> # Basic usage
        >>> from maque.embedding import TextEmbedding
        >>> from maque.pipelines import ClusteringPipeline
        >>> from maque.clustering import ClusterAnalyzer
        >>> from maque.retriever import Document
        >>>
        >>> # Create the pipeline
        >>> pipeline = ClusteringPipeline(
        ...     embedding=TextEmbedding(base_url="http://localhost:8000", model="jina-v3"),
        ...     persist_dir="./chroma_db",
        ...     collection_name="my_data",
        ... )
        >>>
        >>> # Vectorize documents
        >>> docs = [Document.text(content=text, id=f"doc_{i}") for i, text in enumerate(texts)]
        >>> pipeline.build_vectors(docs, batch_size=32, skip_existing=True)
        >>>
        >>> # Cluster analysis
        >>> analyzer = ClusterAnalyzer(algorithm="hdbscan", min_cluster_size=15)
        >>> result = pipeline.analyze(analyzer, output_dir="./results", name="my_cluster")

        >>> # Advanced usage: access the underlying components directly
        >>> pipeline.retriever.search("query text", top_k=10)  # use retrieval
        >>> pipeline.embedding.embed(["text 1", "text 2"])  # embed directly
    """

    def __init__(
        self,
        embedding: BaseEmbedding,
        persist_dir: Union[str, Path] = None,
        collection_name: str = "default",
        distance_metric: str = "cosine",
        retriever=None,
    ):
        """
        Initialize the clustering pipeline

        Args:
            embedding: Embedding instance (TextEmbedding or MultiModalEmbedding)
            persist_dir: ChromaDB persistence directory (required when retriever=None)
            collection_name: collection name (used when retriever=None)
            distance_metric: distance metric (cosine/l2/ip) (used when retriever=None)
            retriever: optional, pass a Retriever instance directly (ChromaRetriever or MilvusRetriever);
                when given, persist_dir, collection_name and distance_metric are ignored
        """
        self.embedding = embedding

        # Accept an external retriever, or create a ChromaRetriever automatically
        if retriever is not None:
            self.retriever = retriever
            self.persist_dir = getattr(retriever, 'persist_dir', None)
            self.collection_name = getattr(retriever, 'collection_name', 'unknown')
            logger.info(f"Initialized ClusteringPipeline: {self.collection_name} (external retriever)")
        else:
            if persist_dir is None:
                raise ValueError("persist_dir is required when retriever=None")
            self.persist_dir = Path(persist_dir)
            self.collection_name = collection_name
            # Create the ChromaRetriever (the underlying component stays accessible)
            self.retriever = ChromaRetriever(
                embedding=embedding,
                persist_dir=str(persist_dir),
                collection_name=collection_name,
                distance_metric=distance_metric,
            )
            logger.info(f"Initialized ClusteringPipeline: {collection_name} @ {persist_dir}")

    def build_vectors(
        self,
        documents: List[Document],
        batch_size: int = 32,
        skip_existing: bool = True,
        show_progress: bool = True,
    ) -> int:
        """
        Vectorize and store documents

        Args:
            documents: list of documents
            batch_size: batch size
            skip_existing: skip documents that already exist (incremental update)
            show_progress: show a progress bar

        Returns:
            The number of documents actually inserted

        Example:
            >>> from maque.retriever import Document
            >>> docs = [Document.text(content=f"text {i}", id=f"doc_{i}") for i in range(100)]
            >>> count = pipeline.build_vectors(docs, skip_existing=True)
            >>> print(f"Inserted {count} documents")
        """
        logger.info(f"Vectorizing {len(documents)} documents...")

        count = self.retriever.upsert_batch(
            documents=documents,
            batch_size=batch_size,
            skip_existing=skip_existing,
            show_progress=show_progress,
        )

        logger.info(f"Vectorization done; the collection now holds {self.retriever.count()} documents")
        return count

    def analyze(
        self,
        analyzer: ClusterAnalyzer,
        output_dir: Optional[Union[str, Path]] = None,
        name: Optional[str] = None,
        sample_size: Optional[int] = None,
        where: Optional[dict] = None,
        where_document: Optional[dict] = None,
    ) -> ClusterResult:
        """
        Load from ChromaDB and run the clustering analysis

        Args:
            analyzer: ClusterAnalyzer instance (with pre-configured algorithm parameters)
            output_dir: output directory (JSON results and visualization images)
            name: result file name prefix (defaults to collection_name)
            sample_size: cap on the number of samples to load (for large-scale data)
            where: metadata filter
            where_document: document-content filter

        Returns:
            ClusterResult: the clustering result

        Example:
            >>> # HDBSCAN automatic clustering
            >>> analyzer = ClusterAnalyzer(algorithm="hdbscan", min_cluster_size=15)
            >>> result = pipeline.analyze(analyzer, output_dir="./results")

            >>> # K-Means with a fixed cluster count
            >>> analyzer = ClusterAnalyzer(algorithm="kmeans", n_clusters=20)
            >>> result = pipeline.analyze(analyzer, output_dir="./results", sample_size=10000)

            >>> # Filter to a specific category
            >>> result = pipeline.analyze(
            ...     analyzer,
            ...     output_dir="./results",
            ...     where={"category": "tech"},
            ... )
        """
        logger.info("Starting clustering analysis...")

        # Delegate to ClusterAnalyzer.analyze_chroma
        result = analyzer.analyze_chroma(
            persist_dir=self.persist_dir,
            collection_name=self.collection_name,
            output_dir=output_dir,
            name=name or self.collection_name,
            sample_size=sample_size,
            where=where,
            where_document=where_document,
        )

        logger.info("Clustering analysis finished")
        return result

    def load_vectors(
        self,
        sample_size: Optional[int] = None,
        where: Optional[dict] = None,
    ) -> tuple[List[str], np.ndarray, List[str]]:
        """
        Load vector data from ChromaDB (for advanced users doing custom analysis)

        Args:
            sample_size: cap on the number of samples to load
            where: metadata filter

        Returns:
            (ids, embeddings, documents) tuple

        Example:
            >>> ids, embeddings, docs = pipeline.load_vectors(sample_size=1000)
            >>> print(f"Loaded {len(embeddings)} vectors, dim={embeddings.shape[1]}")
            >>> # Custom analysis logic
            >>> from sklearn.cluster import KMeans
            >>> kmeans = KMeans(n_clusters=10)
            >>> labels = kmeans.fit_predict(embeddings)
        """
        if chromadb is None:
            raise ImportError("chromadb is required: pip install chromadb")

        client = chromadb.PersistentClient(path=str(self.persist_dir))
        collection = client.get_collection(self.collection_name)

        query_kwargs = {"include": ["embeddings", "documents", "metadatas"]}
        if sample_size is not None:
            query_kwargs["limit"] = sample_size
        if where is not None:
            query_kwargs["where"] = where

        results = collection.get(**query_kwargs)

        ids = results["ids"]
        embeddings = np.array(results["embeddings"])
        documents = results["documents"]

        logger.info(f"Loaded {len(embeddings)} vectors, dim={embeddings.shape[1]}")
        return ids, embeddings, documents

    def count(self) -> int:
        """Number of documents in the collection"""
        return self.retriever.count()

    def __repr__(self) -> str:
        return (
            f"ClusteringPipeline("
            f"collection={self.collection_name!r}, "
            f"count={self.count()}, "
            f"persist_dir={str(self.persist_dir)!r})"
        )
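
Pulling the pieces together, here is a hedged end-to-end sketch of the pipeline as defined above. The server URL, model name, and texts are placeholders; it assumes a running embedding service and chromadb installed:

    from maque.embedding import TextEmbedding
    from maque.pipelines import ClusteringPipeline
    from maque.clustering import ClusterAnalyzer
    from maque.retriever import Document

    texts = ["first example", "second example", "third example"]

    pipeline = ClusteringPipeline(
        embedding=TextEmbedding(base_url="http://localhost:8000", model="jina-v3"),
        persist_dir="./chroma_db",
        collection_name="demo",
    )

    # Incremental vectorization: already-stored ids are skipped
    docs = [Document.text(content=t, id=f"doc_{i}") for i, t in enumerate(texts)]
    pipeline.build_vectors(docs, batch_size=32, skip_existing=True)

    # Cluster whatever is in the collection and write results to ./results
    analyzer = ClusterAnalyzer(algorithm="hdbscan", min_cluster_size=15)
    result = pipeline.analyze(analyzer, output_dir="./results")

    # Or bypass the analyzer and work on the raw vectors directly
    ids, embeddings, documents = pipeline.load_vectors(sample_size=1000)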