maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""
|
|
2
|
+
第3步:第一轮校验与粗筛(可选)
|
|
3
|
+
对数据进行基本校验和粗筛,过滤明显不符合要求的数据
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import re
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Dict, Any, Tuple
|
|
10
|
+
from ..core import PipelineStep, StepResult, PipelineConfig
|
|
11
|
+
from maque.performance import MeasureTime
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DataValidationStep(PipelineStep):
    """Pipeline step that runs basic validation and coarse filtering on rows.

    Every row is checked for text content, referenced image files and
    duplicated content. The outcome is stored in two bookkeeping columns,
    ``__validation_passed`` (bool) and ``__validation_errors`` (a ``"; "``
    separated message string). Depending on ``remove_invalid`` the failing
    rows are either dropped or merely flagged.
    """

    CONFIG_SCHEMA = {
        "type": "object",
        "properties": {
            "skip_validation": {
                "type": "boolean",
                "default": False,
                "description": "是否跳过验证步骤"
            },
            "min_text_length": {
                "type": "integer",
                "default": 10,
                "description": "最小文本长度"
            },
            "max_text_length": {
                "type": "integer",
                "default": 10000,
                "description": "最大文本长度"
            },
            "require_images": {
                "type": "boolean",
                "default": False,
                "description": "是否要求必须有图像"
            },
            "require_text": {
                "type": "boolean",
                "default": True,
                "description": "是否要求必须有文本"
            },
            "image_extensions": {
                "type": "array",
                "items": {"type": "string"},
                "default": [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"],
                "description": "允许的图像文件扩展名"
            },
            "text_filters": {
                "type": "object",
                "properties": {
                    "forbidden_words": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "禁用词列表"
                    },
                    "required_words": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "必须包含的词列表"
                    },
                    "regex_patterns": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "正则表达式模式列表"
                    }
                },
                "default": {}
            },
            "quality_thresholds": {
                "type": "object",
                "properties": {
                    "min_image_size": {
                        "type": "integer",
                        "default": 1024,
                        "description": "最小图像文件大小(字节)"
                    },
                    "max_image_size": {
                        "type": "integer",
                        "default": 10485760,
                        "description": "最大图像文件大小(字节)"
                    }
                },
                "default": {}
            },
            "remove_invalid": {
                "type": "boolean",
                "default": True,
                "description": "是否移除无效数据,否则只标记"
            }
        }
    }

    # Names of the bookkeeping columns written by the validators.
    _PASSED_COLUMN = '__validation_passed'
    _ERRORS_COLUMN = '__validation_errors'

    def __init__(self, name: str = "data_validation", config: Dict[str, Any] = None):
        """Create the step.

        Args:
            name: Step name forwarded to the base class.
            config: Step configuration forwarded to the base class.
        """
        super().__init__(name, config)

    async def execute(self, data: pd.DataFrame, config: PipelineConfig, step_config: Dict[str, Any]) -> StepResult:
        """Validate ``data`` and return the (optionally filtered) result.

        Args:
            data: Input rows. The frame itself is never modified; all work is
                done on a copy.
            config: Pipeline configuration, passed to ``get_step_config``.
            step_config: Replaced by ``self.get_step_config(config)`` before
                use, exactly as in the original implementation.

        Returns:
            A ``StepResult``. On success ``data`` holds the validated copy and
            ``metadata`` holds row counts and the validation rate. When
            ``skip_validation`` is set the input is returned untouched with
            ``metadata={"skipped": True}``. On any exception ``success`` is
            ``False``, ``data`` is the original input and ``error`` carries
            the exception text.
        """
        with MeasureTime(f"步骤 {self.name}"):
            try:
                # NOTE(review): this discards the ``step_config`` argument in
                # favour of the value resolved from ``config`` - confirm that
                # callers rely on this.
                step_config = self.get_step_config(config)

                if step_config.get("skip_validation", False):
                    self.logger.info("跳过数据验证步骤")
                    return StepResult(
                        step_name=self.name,
                        success=True,
                        data=data,
                        metadata={"skipped": True}
                    )

                data_copy = data.copy()

                # Start from "everything is valid"; validators only ever
                # downgrade rows and append messages.
                data_copy[self._PASSED_COLUMN] = True
                data_copy[self._ERRORS_COLUMN] = ''

                await self._validate_text_content(data_copy, step_config)
                await self._validate_image_content(data_copy, step_config)
                await self._validate_data_quality(data_copy, step_config)

                # A plain boolean array keeps counting and filtering purely
                # positional, so duplicate index labels cannot interfere.
                passed_mask = data_copy[self._PASSED_COLUMN].to_numpy(dtype=bool)
                total_rows = len(data_copy)
                valid_rows = int(passed_mask.sum())
                invalid_rows = total_rows - valid_rows

                if step_config.get("remove_invalid", True) and invalid_rows > 0:
                    data_copy = data_copy.loc[passed_mask].copy()
                    self.logger.info(f"移除了 {invalid_rows} 行无效数据")

                data_copy['__processing_status'] = 'validated'
                data_copy['__validated_at'] = pd.Timestamp.now()

                metadata = {
                    "total_rows_before": total_rows,
                    "valid_rows": valid_rows,
                    "invalid_rows": invalid_rows,
                    "validation_rate": valid_rows / total_rows if total_rows > 0 else 0,
                    "removed_invalid": step_config.get("remove_invalid", True),
                    "final_rows": len(data_copy)
                }

                self.logger.info(f"数据验证完成,有效率: {metadata['validation_rate']:.2%}")

                return StepResult(
                    step_name=self.name,
                    success=True,
                    data=data_copy,
                    metadata=metadata
                )

            except Exception as e:
                # Step boundary: report the failure through the result object
                # instead of letting it propagate.
                self.logger.error(f"数据验证失败: {e}")
                return StepResult(
                    step_name=self.name,
                    success=False,
                    data=data,
                    metadata={},
                    error=str(e)
                )

    def _record_errors(self, data: pd.DataFrame, row_errors: List[List[str]]) -> None:
        """Merge per-row error lists into the bookkeeping columns of ``data``.

        ``row_errors`` must contain one list per row, in row order. Rows with
        a non-empty list are marked as failed and their messages are appended
        to any message already present. Whole columns are reassigned, so the
        update is positional and does not depend on index labels being unique.
        Missing bookkeeping columns are created when there is something to
        record; when there are no errors at all ``data`` is left untouched.
        """
        if not any(row_errors):
            return

        row_count = len(data)
        if self._PASSED_COLUMN in data.columns:
            passed = data[self._PASSED_COLUMN].tolist()
        else:
            passed = [True] * row_count
        if self._ERRORS_COLUMN in data.columns:
            messages = data[self._ERRORS_COLUMN].tolist()
        else:
            messages = [''] * row_count

        for position, errors in enumerate(row_errors):
            if not errors:
                continue
            joined = "; ".join(errors)
            existing = messages[position]
            messages[position] = f"{existing}; {joined}" if existing else joined
            passed[position] = False

        data[self._PASSED_COLUMN] = passed
        data[self._ERRORS_COLUMN] = messages

    async def _validate_text_content(self, data: pd.DataFrame, step_config: Dict[str, Any]):
        """Validate the text column of every row, updating ``data`` in place.

        Checks, in order: presence (``require_text``), stripped length against
        ``min_text_length``/``max_text_length`` (only for non-empty text),
        case-insensitive forbidden and required words, and case-insensitive
        regex patterns that must each match somewhere in the text.

        Does nothing when the configured text column (``output_text_column``,
        default ``"text"``) is absent from ``data``.
        """
        min_length = step_config.get("min_text_length", 10)
        max_length = step_config.get("max_text_length", 10000)
        require_text = step_config.get("require_text", True)
        # An explicit null in the configuration is treated as "no filters".
        text_filters = step_config.get("text_filters", {}) or {}

        text_col = step_config.get("output_text_column", "text")
        if text_col not in data.columns:
            return

        # Row-invariant preparation: lowercase the word lists once and compile
        # each regex once, so an invalid pattern is reported a single time
        # rather than once per row.
        forbidden_words = [
            (word, word.lower()) for word in (text_filters.get("forbidden_words", []) or [])
        ]
        required_words = [
            (word, word.lower()) for word in (text_filters.get("required_words", []) or [])
        ]
        compiled_patterns = []
        for pattern in (text_filters.get("regex_patterns", []) or []):
            try:
                compiled_patterns.append((pattern, re.compile(pattern, re.IGNORECASE)))
            except re.error:
                self.logger.warning(f"无效的正则表达式: {pattern}")

        row_errors: List[List[str]] = []
        for value in data[text_col]:
            errors: List[str] = []
            text_content = str(value) if pd.notna(value) else ""
            stripped = text_content.strip()

            if require_text and not stripped:
                errors.append("缺少必需的文本内容")

            # Length limits apply only to non-empty text; emptiness is the
            # concern of ``require_text`` above.
            text_length = len(stripped)
            if stripped and (text_length < min_length or text_length > max_length):
                errors.append(f"文本长度不符合要求 ({text_length}), 要求 {min_length}-{max_length}")

            lowered = text_content.lower()
            for word, needle in forbidden_words:
                if needle in lowered:
                    errors.append(f"包含禁用词: {word}")

            for word, needle in required_words:
                if needle not in lowered:
                    errors.append(f"缺少必需词: {word}")

            for pattern, regex in compiled_patterns:
                if not regex.search(text_content):
                    errors.append(f"不符合模式: {pattern}")

            row_errors.append(errors)

        self._record_errors(data, row_errors)

    async def _validate_image_content(self, data: pd.DataFrame, step_config: Dict[str, Any]):
        """Validate the image paths of every row, updating ``data`` in place.

        The image column (``output_image_column``, default ``"images"``) is
        read as a string of paths joined by ``image_separator`` (default
        ``"|"``). Each non-blank path is checked with
        ``_validate_single_image``; rows without any path fail only when
        ``require_images`` is set.

        Does nothing when the image column is absent from ``data``.
        """
        require_images = step_config.get("require_images", False)
        image_extensions = step_config.get("image_extensions", [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"])
        # An explicit null in the configuration is treated as "no thresholds".
        quality_thresholds = step_config.get("quality_thresholds", {}) or {}

        image_col = step_config.get("output_image_column", "images")
        if image_col not in data.columns:
            return

        separator = step_config.get("image_separator", "|")

        row_errors: List[List[str]] = []
        for value in data[image_col]:
            errors: List[str] = []
            image_paths_str = str(value) if pd.notna(value) else ""
            image_paths = [p.strip() for p in image_paths_str.split(separator) if p.strip()]

            if require_images and not image_paths:
                errors.append("缺少必需的图像")

            for image_path in image_paths:
                errors.extend(
                    await self._validate_single_image(image_path, image_extensions, quality_thresholds)
                )

            row_errors.append(errors)

        self._record_errors(data, row_errors)

    async def _validate_single_image(self, image_path: str, allowed_extensions: List[str], quality_thresholds: Dict[str, Any]) -> List[str]:
        """Check one image path and return the list of problems found.

        Args:
            image_path: Filesystem path of the image.
            allowed_extensions: Accepted suffixes, compared case-insensitively.
                Entries may be written with or without the leading dot. An
                empty or ``None`` value disables the extension check.
            quality_thresholds: Optional ``min_image_size`` / ``max_image_size``
                limits in bytes; missing keys mean "no limit".

        Returns:
            Error messages; an empty list when the file is acceptable. A path
            that is not an existing regular file yields a single "does not
            exist" message and no further checks are made.
        """
        errors: List[str] = []

        try:
            quality_thresholds = quality_thresholds or {}
            path_obj = Path(image_path)

            # ``is_file`` rather than ``exists``: a directory must not be
            # accepted as an image and have its size inspected.
            if not path_obj.is_file():
                errors.append(f"图像文件不存在: {image_path}")
                return errors

            if allowed_extensions:
                # Accept "jpg" as well as ".jpg"; an empty entry is left
                # untouched so it still matches suffix-less files.
                normalized = {
                    ext.lower() if (not ext or ext.startswith(".")) else "." + ext.lower()
                    for ext in allowed_extensions
                }
                if path_obj.suffix.lower() not in normalized:
                    errors.append(f"不支持的图像格式: {path_obj.suffix}")

            file_size = path_obj.stat().st_size
            min_size = quality_thresholds.get("min_image_size", 0)
            max_size = quality_thresholds.get("max_image_size", float('inf'))

            if file_size < min_size:
                errors.append(f"图像文件太小: {file_size} < {min_size}")
            if file_size > max_size:
                errors.append(f"图像文件太大: {file_size} > {max_size}")

        except Exception as e:
            # Deliberately broad: any failure while inspecting one file is
            # reported as a validation error for that file instead of
            # aborting the whole step.
            errors.append(f"验证图像时出错: {str(e)}")

        return errors

    async def _validate_data_quality(self, data: pd.DataFrame, step_config: Dict[str, Any]):
        """Flag rows whose text and image values duplicate an earlier row.

        The first occurrence of each (text, image) combination is kept valid;
        later occurrences are marked as failed. The check runs only when both
        the text and the image column are present in ``data``.
        """
        text_col = step_config.get("output_text_column", "text")
        image_col = step_config.get("output_image_column", "images")

        if text_col not in data.columns or image_col not in data.columns:
            return

        duplicate_mask = data.duplicated(subset=[text_col, image_col], keep='first')
        row_errors = [["发现重复内容"] if is_duplicate else [] for is_duplicate in duplicate_mask]
        self._record_errors(data, row_errors)
|