maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# !/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
模型评估指标工具
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from maque.ai_platform.metrics import MetricsCalculator, save_pred_metrics
|
|
8
|
+
from maque.utils.helper_parser import parse_generic_tags
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def calc_binary_metrics(
    df: pd.DataFrame,
    response_col="response",
    label_col="labels",
    parse_response_to_pred=False,
    pred_parsed_tag="answer",
    record_root_dir="record",
):
    """Compute binary (black/white) classification metrics per label group.

    Predictions and labels are collapsed to a binary decision: values in
    {"正常", "不违规", "[不违规]"} count as "白" (clean), anything else as
    "黑" (violation).

    Args:
        df (pd.DataFrame): frame holding model responses and labels.
        response_col (str, optional): column with raw model responses.
        label_col (str, optional): column with ground-truth labels. When
            None, predictions are only exported to a timestamped Excel file
            and nothing is returned.
        parse_response_to_pred (bool, optional): when True, each response is
            parsed with parse_generic_tags and the tag named by
            pred_parsed_tag becomes the prediction.
        pred_parsed_tag (str, optional): tag whose parsed value is used as
            the prediction (default "answer").
        record_root_dir (str, optional): directory where metric reports and
            prediction dumps are written.

    Returns:
        pd.DataFrame: per-label binary metrics (recall, predicted-positive
        count, support). Only returned when label_col is not None.
    """

    # Optionally parse structured tags out of the raw responses.
    if parse_response_to_pred:
        parsed_results = [parse_generic_tags(response) for response in df[response_col]]
        parsed_dict = {}
        all_keys = set()
        for d in parsed_results:
            all_keys |= set(d.keys())
        # Align every row on the union of tags; missing tags become "不规范".
        for d in parsed_results:
            for key in all_keys:
                parsed_dict.setdefault(key, []).append(d.get(key, "不规范"))
        df["parsed_results"] = parsed_results
        for param, values in parsed_dict.items():
            df[f"parsed_{param}"] = values
        df["preds"] = df[f"parsed_{pred_parsed_tag}"]
    else:
        # No parsing requested: the raw response is the prediction, with
        # NaN treated as malformed output ("不规范").
        df["preds"] = df[response_col].apply(
            lambda x: x if not pd.isna(x) else "不规范"
        )

    df["binary_pred"] = df["preds"].apply(
        lambda x: "白" if x in ["正常", "不违规", "[不违规]"] else "黑"
    )

    if label_col is None:
        # No ground truth available: just dump predictions to Excel.
        timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        p = Path(f"{record_root_dir}/binary_pred-{timestamp}.xlsx")
        p.parent.mkdir(parents=True, exist_ok=True)
        df.to_excel(p, index=False, engine="openpyxl")
    else:
        # Raw (non-binary) prediction metrics — kept disabled.
        # save_pred_metrics(
        #     df=df,
        #     pred_col="preds",
        #     label_col=label_col,
        #     record_folder=f"{record_root_dir}",
        # )

        df["binary_label"] = df[label_col].apply(
            lambda x: "白" if x in ["正常", "不违规", "[不违规]"] else "黑"
        )

        # Persist overall binary metrics under <record_root_dir>/binary.
        save_pred_metrics(
            df,
            pred_col="binary_pred",
            label_col="binary_label",
            record_folder=f"{record_root_dir}/binary",
        )
        # Per-group metrics: group rows by the original (non-binary) label.
        metrics_dict = {}
        for name, _df in df.groupby(label_col):
            metrics_calculator = MetricsCalculator(
                _df, pred_col="binary_pred", label_col="binary_label"
            )
            metrics = metrics_calculator.get_metrics()
            classification_report = metrics["classification_report"]
            # Prefer the "黑" (violation) class when present in the report.
            if "黑" in classification_report:
                binary_recall, support = (
                    classification_report["黑"]["recall"],
                    classification_report["黑"]["support"],
                )
                metrics_dict[name] = {
                    "binary_recall": binary_recall,
                    "binary_true": len(_df[_df["binary_pred"] == "黑"]),
                    "support": support,
                }
            else:
                # Fall back to the "白" (clean) class otherwise.
                binary_recall, support = (
                    classification_report["白"]["recall"],
                    classification_report["白"]["support"],
                )
                metrics_dict[name] = {
                    "binary_recall": binary_recall,
                    "binary_true": len(_df[_df["binary_pred"] == "白"]),
                    "support": support,
                }
        binary_metrics_df = pd.DataFrame(metrics_dict).T
        markdown_str = f"{binary_metrics_df.to_markdown()}"

        with open(f"{record_root_dir}/binary_metrics.md", "w") as f:
            f.write(markdown_str)
        print(markdown_str)
        return binary_metrics_df
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# !/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""解析器"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import List
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def parse_generic_tags(text: str, strict: bool = False) -> dict:
    """Parse a mix of closed ``<label>content</label>`` and open-ended
    ``<label>content`` spans out of *text*.

    Args:
        text: string to parse.
        strict: when True, only well-formed closed tags are accepted and
            open-ended tags are ignored. When False (default), closed tags
            are parsed first and take precedence, then open-ended tags are
            collected from the remaining text.

    Returns:
        dict: {label: content} with labels and contents whitespace-stripped.
    """
    if not text:
        return {}

    closed_re = r"<([^>]+)>\s*(.*?)\s*</\1>"
    parsed = {}

    if strict:
        # Strict mode: nothing but properly closed tags.
        for tag, body in re.findall(closed_re, text, re.DOTALL):
            parsed[tag.strip()] = body.strip()
        return parsed

    # Lenient mode, step 1: consume closed tags, removing them from the text
    # so they cannot be re-matched as open-ended tags below.
    def _collect(match_obj):
        parsed[match_obj.group(1).strip()] = match_obj.group(2).strip()
        return ""

    leftover = re.sub(closed_re, _collect, str(text), flags=re.DOTALL)

    # Step 2: pick up open-ended tags in whatever text remains; a closed
    # tag already recorded for the same label always wins.
    open_re = r"<([^>]+)>\s*(.*?)(?=<[^>]+>|$)"
    for tag, body in re.findall(open_re, leftover, re.DOTALL):
        parsed.setdefault(tag.strip(), body.strip())

    return parsed
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def split_urls(text: str) -> List[str]:
    """Extract all URLs from *text*.

    URLs are split at each ``http(s)://`` boundary, so concatenated URLs
    with no whitespace between them are separated correctly; trailing
    separator punctuation (spaces, commas, pipes, semicolons) is stripped.

    Args:
        text: string that may contain zero or more URLs.

    Returns:
        List[str]: the URLs found, in order of appearance.
    """
    if not text:
        return []

    # BUG FIX: the original ran two findall() calls and silently discarded
    # the first result. Only the boundary-splitting pattern is kept, since
    # its output is what the function actually returned.
    urls = re.findall(r"https?://.*?(?=https?://|$)", text)
    # Trim whitespace and common trailing separators from each URL.
    return [url.strip(" ,|;;") for url in urls]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def split_image_paths(text: str, separators: List[str] = None) -> List[str]:
    """Extract image paths (HTTP URLs and local file paths) from *text*.

    Designed to cope with URLs whose query strings contain commas or other
    would-be separators, so a single URL is not split apart.

    Args:
        text: string holding one or more image paths; NaN/empty yields [].
        separators: explicit separator list, e.g. [",", ";"] or ["\\n"].
            When None, URL auto-detection runs first and the fallback
            separators are [";", "\\n", "\\r"] — comma is deliberately never
            a default separator because URL parameters often contain it.

    Returns:
        List[str]: extracted paths, each stripped of surrounding whitespace.
    """
    if not text or pd.isna(text):
        return []

    text = str(text).strip()
    if not text:
        return []

    # If the caller supplied separators, honor them verbatim and skip the
    # URL auto-detection below.
    if separators is not None:
        pass
    else:
        # No separators given: try to pull out complete HTTP(S) URLs first.
        url_pattern = r'https?://[^\s<>"\';]*'
        urls = re.findall(url_pattern, text)

        if urls:
            # A single URL covering (almost) the whole text: return it as-is.
            if len(urls) == 1 and len(urls[0]) > len(text) * 0.8:
                return [urls[0]]

            # Multiple URLs found: return them all.
            if len(urls) > 1:
                return urls

            # Exactly one URL that does not cover the text: there may be
            # non-URL paths too, so fall through to separator splitting.

    # Separator-based splitting for local paths / mixed content.

    # Default separator list (comma intentionally excluded — see above).
    if separators is None:
        separators = [";", "\n", "\r"]  # 永远不使用逗号作为分隔符,因为URL参数中常包含逗号

    # Split on one or many separators.
    if len(separators) == 1:
        # Single separator: plain str.split is enough.
        paths = text.split(separators[0])
    else:
        # Multiple separators: build an alternation regex.
        escaped_separators = [re.escape(sep) for sep in separators]
        pattern = "|".join(escaped_separators)
        paths = re.split(pattern, text)

    # Keep only fragments that look like URLs or file paths.
    cleaned_paths = []
    for path in paths:
        path = path.strip()
        if path:
            # HTTP(S) URL?
            if re.match(r"https?://", path):
                cleaned_paths.append(path)
            # Plausible file path? (relative/absolute prefix, Windows drive,
            # or a slashed path with an extension-like dot)
            elif (
                path.startswith("./")
                or path.startswith("../")
                or path.startswith("/")
                or path.startswith("\\")
                or re.match(r"^[A-Za-z]:[/\\]", path)  # Windows absolute path like C:\ or D:/
                or (
                    ("/" in path or "\\" in path)
                    and (
                        "." in path
                        or path.endswith(
                            (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp")
                        )
                    )
                )
            ):
                cleaned_paths.append(path)

    return cleaned_paths
|
maque/utils/net.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import socket
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_inner_ip():
    """Return this host's LAN IP by opening a UDP socket toward 8.8.8.8.

    No packet is actually sent; connect() on a datagram socket only selects
    the local interface, whose address is then read back.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    with probe:
        probe.connect(('8.8.8.8', 80))
        local_addr = probe.getsockname()[0]
    return local_addr
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_outer_ip():
    """Return this host's public IP as reported by ifconfig.me (1s timeout)."""
    import requests

    response = requests.get('http://ifconfig.me/ip', timeout=1)
    return response.text.strip()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_ip(env="inner"):
    """Get in-net / out-net ip address.

    Args:
        env: "inner" for the LAN address, "outer" for the public address.

    Raises:
        ValueError: when *env* is neither "inner" nor "outer".
    """
    resolvers = {"inner": get_inner_ip, "outer": get_outer_ip}
    try:
        resolver = resolvers[env]
    except KeyError:
        raise ValueError("`env` invalid!") from None
    return resolver()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def domain2ip(*domains):
    """Resolve each domain name (whitespace-trimmed) via DNS.

    Returns:
        list[tuple]: (domain, ipv4_address) pairs in the order given.
    """
    pairs = []
    for name in domains:
        pairs.append((name, socket.gethostbyname(name.strip())))
    return pairs
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get_github_ip():
    """Scrape ipaddress.com for the IPs of GitHub-related hostnames.

    Returns:
        list[list]: ``[hostname, ips]`` per queried host, where *ips* is a
        set of dotted-quad strings, or None when the lookup page could not
        be fetched (non-200 response).
    """
    import requests

    # Renamed from `get_ip` to avoid shadowing the module-level get_ip().
    def _lookup(website):
        response = requests.get('https://ipaddress.com/website/' + website)
        if response.status_code != 200:
            return None
        ips = re.findall(r"<strong>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?)</strong>", response.text)
        return set(ips)

    hostnames = [
        'github.com',
        'github.global.ssl.fastly.net',
        'assets-cdn.github.com',
    ]
    return [[host, _lookup(host)] for host in hostnames]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# getip('assets-cdn.github.com')
|
|
58
|
+
# getip('github.global.ssl.fastly.net')
|
|
59
|
+
|
|
60
|
+
if __name__ == "__main__":
    # Manual smoke test: print local + public IPs and resolve a few
    # well-known domains. Requires network access.
    print(get_inner_ip())
    print(get_outer_ip())
    print(domain2ip("www.baidu.com", "www.google.com", "github.com"))
    # print(get_github_ip())
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import io
|
|
3
|
+
import csv
|
|
4
|
+
import collections
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def commandexists(shellcommand):
    """Return True if *shellcommand* runs successfully in a shell (exit 0).

    Prints a notice when the command could not be executed.
    """
    status, _output = subprocess.getstatusoutput(shellcommand)
    if status != 0:
        print("Could not execute: {0}".format(shellcommand))
        return False
    return True
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def command(args):
    """Run *args* (an argv list) and return its stdout decoded as text."""
    raw_output = subprocess.check_output(args)
    return raw_output.decode()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def csvtodictdict(csvdata, colnames, keycols, fmtcols=None):
    """
    Build a dict of dicts from CSV data.

    Args:
        csvdata: iterable of CSV lines (file object, StringIO, list, ...).
        colnames: column names, positionally matched against each CSV row.
        keycols: a single column name used as the primary key, or a list
            of column names used together as a tuple key.
        fmtcols: optional {column: formatter} mapping applied per value;
            columns without a formatter pass through unchanged.

    Returns:
        dict: {key: {colname: value, ...}} with the key column(s) popped
        out of each row dict.
    """
    # BUG FIX: fmtcols was a mutable default argument ({}); use None and
    # normalize here. The defaultdict supplies an identity formatter for
    # any column without an explicit one.
    fmtcols = collections.defaultdict(lambda: lambda x: x, **(fmtcols or {}))
    d = {}
    for row in csv.reader(csvdata):
        drow = {colname: fmtcols[colname](val) for colname, val in zip(colnames, row)}
        if isinstance(keycols, str):
            key = drow.pop(keycols)
        else:
            key = tuple(drow.pop(keycol) for keycol in keycols)
        d[key] = drow
    return d
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def csvheaderargs(fmtcol, cols):
    """Format every column name with *fmtcol* and join them with commas."""
    return ",".join(fmtcol.format(col) for col in cols)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def commandtodictdict(baseargs, cols, keycols=None, queryargfmt="{0}", colargfmt="{0}", outputfmt=None, skipheader=False):
    """Run a CSV-emitting command and parse its output into a dict of dicts.

    Args:
        baseargs: argv prefix of the command to run.
        cols: column names requested from the command.
        keycols: key column name(s) for csvtodictdict; defaults to cols[0].
        queryargfmt: format string wrapping the joined column argument.
        colargfmt: format string applied to each individual column name.
        outputfmt: optional {column: formatter} mapping for csvtodictdict.
            BUG FIX: was a mutable default argument ({}); now None.
        skipheader: when True, drop the first output line (a header row).

    Returns:
        dict: parsed command output keyed by *keycols*.
    """
    queryarg = queryargfmt.format(csvheaderargs(colargfmt, cols))
    csvoutput = io.StringIO(command(baseargs + [queryarg]))
    if skipheader:
        csvoutput.readline()  # discard the header row the command printed
    if keycols is None:
        keycols = cols[0]  # default key: the first requested column
    return csvtodictdict(csvoutput, cols, keycols, fmtcols=outputfmt or {})
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def renamekeys(d, names):
    """Rename keys of *d* in place per an {old: new} mapping.

    Returns the same (mutated) dict for convenient chaining.
    """
    for old_key, new_key in names.items():
        d[new_key] = d.pop(old_key)
    return d
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def docker_gpu_stat():
    """Print a table of per-container GPU usage by joining `docker ps`,
    `docker stats`, and `nvidia-smi` CSV output on process IDs.

    NOTE(review): relies on `docker` and `nvidia-smi` being on PATH and on
    the exact shape of their CSV/template output — confirm before reuse.
    """
    # get results of all commands without container arguments
    dockerps = commandtodictdict(['docker', 'ps', '--format'],
                                 ['ID', 'Image', 'Ports'],
                                 keycols='ID',
                                 queryargfmt="'{0}'",
                                 colargfmt="{{{{.{0}}}}}",
                                 outputfmt={'ID': lambda s: s[1:]})
    dockerstats = commandtodictdict(['docker', 'stats', '--no-stream', '--format'],
                                    ['Container', 'MemUsage', 'CPUPerc'],
                                    keycols='Container',
                                    queryargfmt="'{0}'",
                                    colargfmt="{{{{.{0}}}}}",
                                    outputfmt={'Container': lambda s: s[1:]})
    unitstats = commandtodictdict(['nvidia-smi', '--format=csv'],
                                  ['gpu_uuid', 'utilization.gpu', 'utilization.memory'],
                                  keycols='gpu_uuid',
                                  queryargfmt="--query-gpu={0}",
                                  outputfmt={'gpu_uuid': lambda s: s.lstrip()},
                                  skipheader=True)
    unitprocstats = commandtodictdict(['nvidia-smi', '--format=csv'],
                                      ['pid', 'process_name', 'gpu_uuid', 'used_memory'],
                                      keycols=['pid', 'gpu_uuid'],
                                      queryargfmt="--query-compute-apps={0}",
                                      outputfmt={'gpu_uuid': lambda s: s.lstrip()},
                                      skipheader=True)

    # map gpu_uuids to short ids in unit info rename columns
    shortunitids = {gpu_uuid: "{0}".format(shortid) for gpu_uuid, shortid in
                    zip(unitstats.keys(), range(len(unitstats)))}
    colnames = {'utilization.gpu': 'used_gpu'}
    unitstats = {shortunitids[gpu_uuid]: renamekeys(stats, colnames) for gpu_uuid, stats in unitstats.items()}
    unitprocstats = {(pid, shortunitids[gpu_uuid]): stats for (pid, gpu_uuid), stats in unitprocstats.items()}

    # display fmt data: column name -> printed field width
    basedisplaycols = collections.OrderedDict([('Container', 12),
                                               ('Image', 18)])
    optdisplaycols = collections.OrderedDict([('pid', 7),
                                              ('gpu_uuid', 8),
                                              ('used_memory', 12),
                                              ('used_gpu', 9)])
    displaycols = collections.OrderedDict(list(basedisplaycols.items()) +
                                          list(optdisplaycols.items()))

    # display fmt strings: "{col:W.W}" truncates/pads each field to width W
    basedisplayfmt = '\t'.join(['{{{0}:{1}.{1}}}'.format(col, width) for col, width in basedisplaycols.items()])
    optdisplayfmt = '\t'.join(['{{{0}:{1}.{1}}}'.format(col, width) for col, width in optdisplaycols.items()])
    displayfmt = '\t'.join([basedisplayfmt, optdisplayfmt])

    # print rows of relevant container processes
    # (everything below a bit janky in terms of argument expectations and generalization)
    dockerall = {container: {**dockerps[container], **dockerstats[container]} for container in dockerstats.keys()}
    someunitsactive = False
    print(displayfmt.format(**{col: col for col in displaycols.keys()}))
    for container, dockerinfo in dockerall.items():
        # very particular incantation needed here for top options to function correctly:
        # https://www.projectatomic.io/blog/2016/01/understanding-docker-top-and-ps/
        pids = command(['docker', 'top', container, '-eo', 'pid']).split('\n')[1:-1]  # obviously could be a bit brittle
        # keep only GPU processes whose pid belongs to this container
        containerunitstatslist = [((proc, unit), stats) for (proc, unit), stats in sorted(unitprocstats.items()) if
                                  proc in pids]
        containerunitstats = collections.OrderedDict(containerunitstatslist)

        if containerunitstats:
            someunitsactive = True
            basedisplaystr = basedisplayfmt.format(Container=container, **dockerinfo)
            print(basedisplaystr)
            for (pid, gpu_uuid), stats in containerunitstats.items():
                print(optdisplayfmt.rjust(99).format(pid=pid, gpu_uuid=gpu_uuid, **stats, **unitstats[gpu_uuid]))
    if not someunitsactive:
        print("\n\t\t no gpu units being used by docker containers ")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == '__main__':
    # Entry point: report per-container GPU usage, but only when both
    # required external tools are available.
    # check for existence of docker and nvidia-smi commands
    if commandexists('docker') and commandexists('nvidia-smi'):
        docker_gpu_stat()
    else:
        print('Command(s) not found')
|
maque/utils/ops.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def find_all_index(pattern, string, flags=0):
    """Return the (start, end) span of every match of *pattern* in *string*."""
    spans = []
    for match in re.finditer(pattern, string, flags=flags):
        spans.append(match.span())
    return spans
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def string_add(string: str, dx: int = 1) -> str:
    """Add *dx* to a digit string while preserving '.' positions and
    zero padding, e.g. "1.09" -> "1.10" and "099" -> "100".

    Args:
        string: digits optionally interspersed with '.' separators.
        dx: amount to add (may be negative).

    Returns:
        str: incremented value with dots re-inserted at their original
        string indices and digits zero-padded to the original count.
    """
    # Remember where each dot sat in the original string.
    dot_spans = [m.span() for m in re.finditer(r"\.", string)]
    digits = "".join(string.split("."))
    width = len(digits)
    # Add, then zero-pad back to the original digit count.
    bumped = f"{int(digits) + dx:0>{width}d}"
    chars = list(bumped)
    # Re-insert each dot at the index it occupied originally.
    # (Was a list comprehension used purely for its insert() side effect.)
    for span in dot_spans:
        chars.insert(span[0], ".")
    return "".join(chars)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def index_char(size=1000):
    """Build forward/backward maps between code points and characters.

    Returns:
        tuple: (code -> chr(code) dict, chr(code) -> code dict) covering
        code points 0..size-1.
    """
    forward = {}
    backward = {}
    for code in range(size):
        ch = chr(code)
        forward[code] = ch
        backward[ch] = code
    return forward, backward
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def find_match(start, end, S, flag=0):
    """Return the substring of `S` between the first match of `start` and
    the first match of `end`.

    flag=0 by default (no special mode). Useful flags:
        re.I IGNORECASE — case-insensitive matching
        re.M MULTILINE  — changes the behavior of ^ and $
        re.S DOTALL     — '.' also matches newlines
        re.X VERBOSE    — whitespace and # comments in the pattern ignored

    Returns:
        str | None: the text between the two matches, or None (after
        printing a notice) when either pattern is absent or invalid.
    """
    # BUG FIX: was a bare `except:` which also swallowed KeyboardInterrupt
    # and SystemExit; only the expected failures are caught now.
    try:
        begin_idx = re.search(start, S, flags=flag).span()[1]
        stop_idx = re.search(end, S, flags=flag).span()[0]
    except (AttributeError, re.error):
        # AttributeError: re.search returned None (pattern not found);
        # re.error: malformed pattern.
        print("Do not match anything.")
        return None
    return S[begin_idx:stop_idx]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def find_match2(pattern, S, flag=0):
    """Return the first match of `pattern` in `S`, or None when absent.

    BUG FIX: previously raised AttributeError on a failed search
    (`None.group()`); now returns None, consistent with find_match's
    contract of returning None when nothing matches.
    """
    res = re.search(pattern, S, flags=flag)
    return res.group() if res else None
|
maque/utils/packages.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import importlib.metadata
|
|
2
|
+
import importlib.util
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _is_package_available(name: str) -> bool:
|
|
6
|
+
return importlib.util.find_spec(name) is not None
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _get_package_version(name: str) -> str:
|
|
10
|
+
try:
|
|
11
|
+
return importlib.metadata.version(name)
|
|
12
|
+
except Exception:
|
|
13
|
+
return "0.0.0"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def is_flash_attn2_available():
    """Return True when flash_attn is installed at major version 2."""
    if not _is_package_available("flash_attn"):
        return False
    return _get_package_version("flash_attn").startswith("2")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def is_jieba_available():
    """Return True when the `jieba` tokenizer package is importable."""
    return _is_package_available("jieba")
|
|
22
|
+
|
|
23
|
+
def is_levenshtein_available():
    """Return True when the `Levenshtein` distance package is importable."""
    return _is_package_available("Levenshtein")
|
|
25
|
+
|
|
26
|
+
def is_nltk_available():
    """Return True when the `nltk` package is importable."""
    return _is_package_available("nltk")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def is_rouge_available():
    """Return True when the `rouge_chinese` scoring package is importable."""
    return _is_package_available("rouge_chinese")
|
maque/utils/path.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Union, List
|
|
3
|
+
import inspect
|
|
4
|
+
from glob import glob as _glob
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def rel_to_abs(rel_path: Union[str, Path], parents=0, return_str=True, strict=False):
    """Resolve *rel_path* against the directory of the calling source file.

    Args:
        rel_path: path relative to the caller's file.
        parents: how many extra stack frames to walk up (0 = direct caller).
        return_str: return a str when True, otherwise a pathlib.Path.
        strict: forwarded to Path.resolve(); raise if the target is missing.
    """
    # Walk up the call stack to the frame whose source file anchors the path.
    frame = inspect.currentframe().f_back
    for _ in range(parents):
        frame = frame.f_back
    base_dir = Path(frame.f_code.co_filename).parent
    resolved = (base_dir / rel_path).resolve(strict=strict)
    return str(resolved) if return_str else resolved
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def rel_path_join(*paths: Union[str, Path], return_str=True):
    """Join *paths* and resolve the result relative to the calling file."""
    joined = os.path.join(*paths)
    return rel_to_abs(joined, parents=1, return_str=return_str)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def ls(_dir, *patterns, relp=True, concat='extend', recursive=False) -> List[str]:
    """List files under *_dir* matching any of the glob *patterns*.

    Args:
        _dir: directory to search; resolved against the caller's file
            directory when relp is True.
        patterns: glob patterns such as "*.jpg".
        relp: resolve _dir relative to the calling source file.
        concat: 'extend' flattens all matches into one list; any other
            value appends one sub-list per pattern.
        recursive: forwarded to glob (enables "**" patterns).

    Example:
    --------
    >>> ls("./data/", "*.jpg", "*.png")
    """
    if relp:
        _dir = rel_to_abs(_dir, parents=1, return_str=True, strict=False)
    matches = []
    for glob_pattern in patterns:
        found = _glob(os.path.join(_dir, glob_pattern), recursive=recursive)
        if concat == 'extend':
            matches.extend(found)
        else:
            matches.append(found)
    return matches
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def add_env_path(*rel_paths: str):
    """Append each path, resolved relative to the calling file, to sys.path.

    Example:
    --------
    >>> add_env_path('..')
    >>> add_env_path('..', '../..')
    """
    for rel in rel_paths:
        sys.path.append(rel_to_abs(rel, parents=1))
|