maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,267 @@
1
+ """
2
+ 第2步:数据对齐步骤
3
+ 对齐为所需格式,主要是图像列 将多个图像处理为多个字符串地址的拼接
4
+ """
5
+
6
+ import pandas as pd
7
+ from pathlib import Path
8
+ from typing import List, Dict, Any, Union
9
+ from ..core import PipelineStep, StepResult, PipelineConfig
10
+ from maque.performance import MeasureTime
11
+
12
+
13
class DataAlignmentStep(PipelineStep):
    """Pipeline step 2: merge per-row source columns into aligned columns.

    For every row the step builds four strings and stores them in the
    configured output columns:

    * image paths gathered from ``config.image_columns``, joined with
      ``image_separator``;
    * text gathered from ``config.text_columns``, joined with
      ``text_separator``;
    * harmful-image flags from ``config.harmful_image_columns``, joined with
      ``image_separator``;
    * harmful-text flags from ``config.harmful_text_columns``, joined with
      ``text_separator``.

    Source columns that are not present in the DataFrame are skipped.
    """

    CONFIG_SCHEMA = {
        "type": "object",
        "properties": {
            "image_separator": {
                "type": "string",
                "default": "|",
                "description": "多个图像路径的分隔符"
            },
            "text_separator": {
                "type": "string",
                "default": "\\n",
                "description": "多个文本的分隔符"
            },
            "output_image_column": {
                "type": "string",
                "default": "images",
                "description": "合并后的图像列名"
            },
            "output_text_column": {
                "type": "string",
                "default": "text",
                "description": "合并后的文本列名"
            },
            "output_harmful_image_column": {
                "type": "string",
                "default": "harmful_images",
                "description": "合并后的有害图像列名"
            },
            "output_harmful_text_column": {
                "type": "string",
                "default": "harmful_text",
                "description": "合并后的有害文本列名"
            },
            "normalize_paths": {
                "type": "boolean",
                "default": True,
                "description": "是否标准化路径格式"
            },
            "check_image_exists": {
                "type": "boolean",
                "default": True,
                "description": "是否检查图像文件是否存在"
            }
        }
    }

    # References starting with one of these (case-insensitive) are URLs, not
    # filesystem paths; pathlib would collapse the "//" and corrupt them.
    _URL_PREFIXES = ("http://", "https://")

    def __init__(self, name: str = "data_alignment", config: Dict[str, Any] = None):
        super().__init__(name, config)

    async def execute(self, data: pd.DataFrame, config: PipelineConfig) -> StepResult:
        """Run the alignment on a copy of *data*.

        Args:
            data: Input table; it is copied, so the caller's frame is not
                modified by this step.
            config: Pipeline configuration providing the source column lists
                (``image_columns``, ``text_columns``,
                ``harmful_image_columns``, ``harmful_text_columns``).

        Returns:
            A ``StepResult`` with ``success=True``, the augmented copy and
            summary metadata; or, if anything raises, ``success=False`` with
            the original *data*, empty metadata and the error message.
        """
        with MeasureTime(f"步骤 {self.name}"):
            try:
                step_config = self.get_step_config(config)
                data_copy = data.copy()

                # Build each aligned column (one string per row).
                aligned_images = await self._align_image_columns(
                    data_copy, config, step_config
                )
                aligned_text = await self._align_text_columns(
                    data_copy, config, step_config
                )
                aligned_harmful_images = await self._align_harmful_image_columns(
                    data_copy, config, step_config
                )
                aligned_harmful_text = await self._align_harmful_text_columns(
                    data_copy, config, step_config
                )

                # Resolve output column names and attach the aligned columns.
                output_image_col = step_config.get("output_image_column", "images")
                output_text_col = step_config.get("output_text_column", "text")
                output_harmful_image_col = step_config.get("output_harmful_image_column", "harmful_images")
                output_harmful_text_col = step_config.get("output_harmful_text_column", "harmful_text")

                data_copy[output_image_col] = aligned_images
                data_copy[output_text_col] = aligned_text
                data_copy[output_harmful_image_col] = aligned_harmful_images
                data_copy[output_harmful_text_col] = aligned_harmful_text

                # Record processing state on every row.
                data_copy['__processing_status'] = 'aligned'
                data_copy['__aligned_at'] = pd.Timestamp.now()

                # Count rows whose aligned value is non-blank.
                valid_images = sum(1 for img in aligned_images if img and img.strip())
                valid_texts = sum(1 for txt in aligned_text if txt and txt.strip())

                metadata = {
                    "total_rows": len(data_copy),
                    "valid_images": valid_images,
                    "valid_texts": valid_texts,
                    "image_columns_merged": config.image_columns,
                    "text_columns_merged": config.text_columns,
                    "harmful_image_columns_merged": config.harmful_image_columns,
                    "harmful_text_columns_merged": config.harmful_text_columns,
                    "output_columns": {
                        "images": output_image_col,
                        "text": output_text_col,
                        "harmful_images": output_harmful_image_col,
                        "harmful_text": output_harmful_text_col
                    }
                }

                self.logger.info(f"数据对齐完成,有效图像: {valid_images}, 有效文本: {valid_texts}")

                return StepResult(
                    step_name=self.name,
                    success=True,
                    data=data_copy,
                    metadata=metadata
                )

            except Exception as e:
                # Step boundary: report the failure instead of propagating it.
                self.logger.error(f"数据对齐失败: {e}")
                return StepResult(
                    step_name=self.name,
                    success=False,
                    data=data,
                    metadata={},
                    error=str(e)
                )

    @staticmethod
    def _present_columns(data: pd.DataFrame, columns) -> List[str]:
        """Return the entries of *columns* that exist in *data*, in order.

        A ``None`` column list is treated as empty. Computing this once per
        call avoids repeating the membership test for every row.
        """
        return [col for col in (columns or ()) if col in data.columns]

    async def _align_image_columns(self, data: pd.DataFrame, config: PipelineConfig, step_config: Dict[str, Any]) -> List[str]:
        """Build one separator-joined string of image references per row.

        Cell values are split on ``image_separator``; each reference is
        optionally normalized (``normalize_paths``) and, when
        ``check_image_exists`` is true, dropped with a warning if it does not
        exist on the local filesystem.

        NOTE: URL references never pass the local existence check, so they are
        only kept when ``check_image_exists`` is disabled.
        """
        image_separator = step_config.get("image_separator", "|")
        normalize_paths = step_config.get("normalize_paths", True)
        check_exists = step_config.get("check_image_exists", True)
        columns = self._present_columns(data, config.image_columns)

        aligned_images = []

        for _, row in data.iterrows():
            # Collect the references from every image column of this row.
            image_paths = []
            for col in columns:
                value = row[col]
                if pd.notna(value) and str(value).strip():
                    image_paths.extend(
                        self._parse_image_paths(str(value), image_separator)
                    )

            # Normalize / filter the collected references.
            processed_paths = []
            for path in image_paths:
                if normalize_paths:
                    path = self._normalize_path(path)

                if check_exists and not self._check_file_exists(path):
                    self.logger.warning(f"图像文件不存在: {path}")
                    continue

                processed_paths.append(path)

            aligned_images.append(image_separator.join(processed_paths))

        return aligned_images

    async def _align_text_columns(self, data: pd.DataFrame, config: PipelineConfig, step_config: Dict[str, Any]) -> List[str]:
        """Build one joined text string per row from the text columns.

        Missing and blank cells are skipped; kept values are stripped. A
        literal backslash-n in the configured separator is converted to a
        real newline.
        """
        text_separator = step_config.get("text_separator", "\\n").replace("\\n", "\n")
        columns = self._present_columns(data, config.text_columns)

        aligned_texts = []

        for _, row in data.iterrows():
            text_parts = []
            for col in columns:
                value = row[col]
                if pd.notna(value) and str(value).strip():
                    text_parts.append(str(value).strip())

            aligned_texts.append(text_separator.join(text_parts))

        return aligned_texts

    async def _align_harmful_image_columns(self, data: pd.DataFrame, config: PipelineConfig, step_config: Dict[str, Any]) -> List[str]:
        """Join the non-missing harmful-image flags of each row.

        Values are stringified as-is (no stripping) and joined with
        ``image_separator``.
        """
        image_separator = step_config.get("image_separator", "|")
        columns = self._present_columns(data, config.harmful_image_columns)

        aligned_harmful_images = []

        for _, row in data.iterrows():
            harmful_flags = [
                str(row[col]) for col in columns if pd.notna(row[col])
            ]
            aligned_harmful_images.append(image_separator.join(harmful_flags))

        return aligned_harmful_images

    async def _align_harmful_text_columns(self, data: pd.DataFrame, config: PipelineConfig, step_config: Dict[str, Any]) -> List[str]:
        """Join the non-missing harmful-text flags of each row.

        Values are stringified as-is (no stripping) and joined with
        ``text_separator`` (a literal backslash-n becomes a real newline).
        """
        text_separator = step_config.get("text_separator", "\\n").replace("\\n", "\n")
        columns = self._present_columns(data, config.harmful_text_columns)

        aligned_harmful_texts = []

        for _, row in data.iterrows():
            harmful_flags = [
                str(row[col]) for col in columns if pd.notna(row[col])
            ]
            aligned_harmful_texts.append(text_separator.join(harmful_flags))

        return aligned_harmful_texts

    def _parse_image_paths(self, value: str, separator: str) -> List[str]:
        """Split *value* on *separator* into stripped, non-empty entries.

        Returns an empty list for an empty or whitespace-only *value*.
        """
        if not value or not value.strip():
            return []

        # str.split already yields [value] when the separator is absent.
        return [part.strip() for part in value.split(separator) if part.strip()]

    def _normalize_path(self, path: str) -> str:
        """Normalize a filesystem path; return URL references unchanged.

        ``str(Path(...))`` collapses repeated slashes, which would turn
        ``http://host/x`` into ``http:/host/x``, so http(s) URLs bypass it.
        """
        if path.lower().startswith(self._URL_PREFIXES):
            return path
        return str(Path(path))

    def _check_file_exists(self, path: str) -> bool:
        """Return whether *path* exists on the local filesystem.

        Paths the OS rejects outright (e.g. too long, invalid characters)
        are reported as not existing.
        """
        try:
            return Path(path).exists()
        except (OSError, ValueError):
            return False
@@ -0,0 +1,172 @@
1
+ """
2
+ 第1步:数据加载步骤
3
+ 读取表格数据,支持csv、excel,处理图像链接、有害图像列、有害文本列
4
+ """
5
+
6
+ import pandas as pd
7
+ from pathlib import Path
8
+ from typing import List, Dict, Any
9
+ from ..core import PipelineStep, StepResult, PipelineConfig
10
+
11
+
12
class DataLoaderStep(PipelineStep):
    """Pipeline step 1: load the input table and tag it with bookkeeping columns.

    Reads CSV or Excel input (optionally extracting embedded Excel images via
    ``maque.utils.excel_helper``), validates that the configured columns
    exist, and adds the ``__row_id``, ``__processing_status`` and
    ``__loaded_at`` columns.
    """

    CONFIG_SCHEMA = {
        "type": "object",
        "properties": {
            "image_columns": {
                "type": "array",
                "items": {"type": "string"},
                "description": "图像列名称列表",
            },
            "harmful_image_columns": {
                "type": "array",
                "items": {"type": "string"},
                "description": "有害图像列名称列表",
            },
            "harmful_text_columns": {
                "type": "array",
                "items": {"type": "string"},
                "description": "有害文本列名称列表",
            },
            "text_columns": {
                "type": "array",
                "items": {"type": "string"},
                "description": "文本列名称列表",
            },
            "extract_images_from_excel": {
                "type": "boolean",
                "default": True,
                "description": "是否从Excel中提取图像",
            },
            "image_output_dir": {
                "type": "string",
                "default": "extracted_images",
                "description": "提取图像的输出目录",
            },
        },
    }

    def __init__(self, name: str = "data_loader", config: Dict[str, Any] = None):
        super().__init__(name, config)

    async def execute(self, data: pd.DataFrame, config: PipelineConfig) -> StepResult:
        """Load (if needed), validate and annotate the input table.

        Args:
            data: Existing table, or ``None`` / an empty frame to trigger
                loading from ``config.input_file``.
            config: Pipeline configuration (input file, type, output dir...).

        Returns:
            A ``StepResult`` with ``success=True``, the annotated table and
            summary metadata; or, if anything raises, ``success=False`` with
            the data as it stood at the time of failure and the error message.
        """
        try:
            step_config = self.get_step_config(config)

            # As the first step, data may be absent and must come from disk.
            if data is None or data.empty:
                data = await self._load_from_file(config, step_config)

            # Fail early when required columns are missing.
            self._validate_columns(data, step_config)

            # Image extraction hook for Excel input.
            if (
                step_config.get("extract_images_from_excel", True)
                and config.input_type == "excel"
            ):
                data = await self._extract_images_from_excel(data, config, step_config)

            # Attach bookkeeping columns.
            data = self._add_metadata_columns(data, step_config)

            metadata = {
                "total_rows": len(data),
                "columns": list(data.columns),
                "image_columns": step_config.get("image_columns", []),
                "harmful_image_columns": step_config.get("harmful_image_columns", []),
                "harmful_text_columns": step_config.get("harmful_text_columns", []),
                "text_columns": step_config.get("text_columns", []),
            }

            self.logger.info(f"数据加载完成,共{len(data)}行,{len(data.columns)}列")

            return StepResult(
                step_name=self.name, success=True, data=data, metadata=metadata
            )

        except Exception as e:
            # Step boundary: report the failure instead of propagating it.
            self.logger.error(f"数据加载失败: {e}")
            return StepResult(
                step_name=self.name, success=False, data=data, metadata={}, error=str(e)
            )

    async def _load_from_file(
        self, config: PipelineConfig, step_config: Dict[str, Any]
    ) -> pd.DataFrame:
        """Read ``config.input_file`` according to ``config.input_type``.

        For Excel input with image extraction enabled and image columns
        configured, embedded images are extracted into
        ``<config.output_dir>/<image_output_dir>`` through
        ``extract_excel_with_images``; otherwise the sheet is read directly.

        Raises:
            ValueError: If ``config.input_type`` is neither ``"csv"`` nor
                ``"excel"``.
        """
        if config.input_type == "csv":
            return pd.read_csv(config.input_file, encoding="utf-8")

        if config.input_type != "excel":
            raise ValueError(f"不支持的输入类型: {config.input_type}")

        if (
            step_config.get("extract_images_from_excel", True)
            and config.image_columns
        ):
            # Imported lazily so the helper is only needed for this path.
            from maque.utils.excel_helper import extract_excel_with_images

            image_output_dir = Path(config.output_dir) / step_config.get(
                "image_output_dir", "extracted_images"
            )

            return extract_excel_with_images(
                excel_path=config.input_file,
                image_column_names=config.image_columns,
                sheet_name=config.sheet_name,
                image_output_dir=str(image_output_dir),
                use_hash_filename=True,
                use_absolute_path=False,
            )

        return pd.read_excel(config.input_file, sheet_name=config.sheet_name)

    def _validate_columns(self, data: pd.DataFrame, step_config: Dict[str, Any]):
        """Check that the configured columns exist in *data*.

        Image and text columns are required; harmful-content columns are
        optional and only produce a warning when absent.

        NOTE(review): the column lists are read from *step_config*, whereas
        ``_load_from_file`` reads ``config.image_columns`` -- confirm both
        sources are kept in sync by the caller.

        Raises:
            ValueError: If any required column is missing.
        """
        all_columns = set(data.columns)

        # Required columns: images and text.
        required_columns = [
            *step_config.get("image_columns", []),
            *step_config.get("text_columns", []),
        ]
        missing_columns = [col for col in required_columns if col not in all_columns]
        if missing_columns:
            raise ValueError(f"缺少必需的列: {missing_columns}")

        # Optional columns: harmful-content markers may legitimately be absent.
        optional_columns = [
            *step_config.get("harmful_image_columns", []),
            *step_config.get("harmful_text_columns", []),
        ]
        missing_optional = [col for col in optional_columns if col not in all_columns]
        if missing_optional:
            self.logger.warning(f"可选列不存在: {missing_optional}")

    async def _extract_images_from_excel(
        self, data: pd.DataFrame, config: PipelineConfig, step_config: Dict[str, Any]
    ) -> pd.DataFrame:
        """Placeholder hook for Excel image extraction; returns *data* as-is.

        Extraction currently happens in ``_load_from_file``, so nothing is
        done here.
        """
        return data

    def _add_metadata_columns(
        self, data: pd.DataFrame, step_config: Dict[str, Any]
    ) -> pd.DataFrame:
        """Return a copy of *data* with the bookkeeping columns added.

        Adds ``__row_id`` (0..n-1), ``__processing_status`` (``"loaded"``)
        and ``__loaded_at`` (current timestamp). Working on a copy keeps a
        caller-supplied DataFrame unmodified.
        """
        data = data.copy()

        # Sequential row identifier.
        data["__row_id"] = range(len(data))

        # Processing state marker.
        data["__processing_status"] = "loaded"

        # Load timestamp.
        data["__loaded_at"] = pd.Timestamp.now()

        return data