maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,121 @@
1
+ # !/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 模型评估指标工具
5
+ """
6
+
7
+ from maque.ai_platform.metrics import MetricsCalculator, save_pred_metrics
8
+ from maque.utils.helper_parser import parse_generic_tags
9
+ from pathlib import Path
10
+ import pandas as pd
11
+
12
+
13
def calc_binary_metrics(
    df: pd.DataFrame,
    response_col="response",
    label_col="labels",
    parse_response_to_pred=False,
    pred_parsed_tag="answer",
    record_root_dir="record",
):
    """Compute cross binary-classification metrics.

    Responses are binarized into 白 (white / clean) vs 黑 (black / violating)
    and compared against the equally binarized labels, grouped per original
    label value.

    Args:
        df (pd.DataFrame): frame holding model responses and labels.
        response_col (str, optional): column with the raw model response.
            Defaults to "response".
        label_col (str, optional): column with the ground-truth label.
            When None, only the binarized predictions are exported to Excel
            and no metrics are computed (the function then returns None).
            Defaults to "labels".
        parse_response_to_pred (bool, optional): when True, each response is
            run through parse_generic_tags and the tag named by
            ``pred_parsed_tag`` becomes the prediction. Defaults to False.
        pred_parsed_tag (str, optional): tag extracted as the prediction when
            parsing is enabled. Defaults to "answer".
        record_root_dir (str, optional): root folder for prediction artifacts
            (Excel export, per-class metrics, markdown summary).
            Defaults to "record".

    Returns:
        pd.DataFrame: per-label binary metrics (recall, predicted positives,
        support); None when ``label_col`` is None.
    """

    # Optionally parse structured tags out of the raw responses.
    if parse_response_to_pred:
        parsed_results = [parse_generic_tags(response) for response in df[response_col]]
        parsed_dict = {}
        all_keys = set()
        for d in parsed_results:
            all_keys |= set(d.keys())
        # Align every row on the union of tag keys; rows missing a tag get
        # the sentinel "不规范" (malformed).
        for d in parsed_results:
            for key in all_keys:
                parsed_dict.setdefault(key, []).append(d.get(key, "不规范"))
        df["parsed_results"] = parsed_results
        for param, values in parsed_dict.items():
            df[f"parsed_{param}"] = values
        df["preds"] = df[f"parsed_{pred_parsed_tag}"]
    else:
        # No parsing requested: use the raw response, mapping NaN to "不规范".
        df["preds"] = df[response_col].apply(
            lambda x: x if not pd.isna(x) else "不规范"
        )

    # Binarize predictions: the listed "clean" values map to 白, anything
    # else (including "不规范") maps to 黑.
    df["binary_pred"] = df["preds"].apply(
        lambda x: "白" if x in ["正常", "不违规", "[不违规]"] else "黑"
    )

    if label_col is None:
        # No ground truth available: just persist the binarized predictions.
        timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        p = Path(f"{record_root_dir}/binary_pred-{timestamp}.xlsx")
        p.parent.mkdir(parents=True, exist_ok=True)
        df.to_excel(p, index=False, engine="openpyxl")
    else:
        # Per-class (non-binary) metrics export, currently disabled.
        # save_pred_metrics(
        #     df=df,
        #     pred_col="preds",
        #     label_col=label_col,
        #     record_folder=f"{record_root_dir}",
        # )

        # Binarize the labels with the same 白/黑 rule as the predictions.
        df["binary_label"] = df[label_col].apply(
            lambda x: "白" if x in ["正常", "不违规", "[不违规]"] else "黑"
        )

        # Persist the binary-level metrics under <record_root_dir>/binary.
        save_pred_metrics(
            df,
            pred_col="binary_pred",
            label_col="binary_label",
            record_folder=f"{record_root_dir}/binary",
        )
        # Group by the original label value and compute binary recall
        # within each group.
        metrics_dict = {}
        for name, _df in df.groupby(label_col):
            metrics_calculator = MetricsCalculator(
                _df, pred_col="binary_pred", label_col="binary_label"
            )
            metrics = metrics_calculator.get_metrics()
            classification_report = metrics["classification_report"]
            # Prefer the 黑 (positive) class when the report contains it.
            if "黑" in classification_report:
                binary_recall, support = (
                    classification_report["黑"]["recall"],
                    classification_report["黑"]["support"],
                )
                metrics_dict[name] = {
                    "binary_recall": binary_recall,
                    "binary_true": len(_df[_df["binary_pred"] == "黑"]),
                    "support": support,
                }
            else:
                # No 黑 rows in this group: fall back to the 白 class.
                binary_recall, support = (
                    classification_report["白"]["recall"],
                    classification_report["白"]["support"],
                )
                metrics_dict[name] = {
                    "binary_recall": binary_recall,
                    "binary_true": len(_df[_df["binary_pred"] == "白"]),
                    "support": support,
                }
        binary_metrics_df = pd.DataFrame(metrics_dict).T
        markdown_str = f"{binary_metrics_df.to_markdown()}"

        with open(f"{record_root_dir}/binary_metrics.md", "w") as f:
            f.write(markdown_str)
        print(markdown_str)
        return binary_metrics_df
@@ -0,0 +1,168 @@
1
+ # !/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """解析器"""
4
+
5
+ import re
6
+ from typing import List
7
+ import pandas as pd
8
+
9
+
10
def parse_generic_tags(text: str, strict: bool = False) -> dict:
    """Parse mixed ``<label>content</label>`` and open-ended ``<label>content``
    spans out of *text*.

    Args:
        text: string to parse.
        strict: when True, only well-formed closed tags are accepted and
            open-ended tags are ignored; when False (default), closed tags
            are parsed first and open-ended tags fill in any labels not
            already seen.

    Returns:
        dict: {label: content} with labels and contents whitespace-stripped.
    """
    if not text:
        return {}

    closed_pattern = r"<([^>]+)>\s*(.*?)\s*</\1>"
    parsed = {}

    if strict:
        # Strict mode: only properly closed <label>content</label> pairs.
        for tag, body in re.findall(closed_pattern, text, re.DOTALL):
            parsed[tag.strip()] = body.strip()
        return parsed

    # Lenient mode: consume closed tags first, removing them from the text
    # so the open-tag pass cannot re-match their interiors.
    def _consume(match):
        parsed[match.group(1).strip()] = match.group(2).strip()
        return ""

    leftover = re.sub(closed_pattern, _consume, str(text), flags=re.DOTALL)

    # Open-ended tags: content runs until the next tag or end of string.
    # Closed-tag results take precedence over open-tag duplicates.
    open_pattern = r"<([^>]+)>\s*(.*?)(?=<[^>]+>|$)"
    for tag, body in re.findall(open_pattern, leftover, re.DOTALL):
        parsed.setdefault(tag.strip(), body.strip())

    return parsed
61
+
62
+
63
def split_urls(text: str) -> List[str]:
    """Extract every http(s) URL from *text*.

    Back-to-back URLs with no separator between them are split by looking
    ahead for the next ``http(s)://`` prefix; spaces and the separator
    characters `` ,|;;`` are trimmed from each hit.

    Args:
        text: string possibly containing URLs.

    Returns:
        List[str]: URLs in order of appearance ([] for falsy input).
    """
    if not text:
        return []

    # Fix: the original ran a first `re.findall(r"https?://[^\s]+", ...)`
    # whose result was immediately overwritten — dead code removed.
    # Non-greedy match up to the next URL prefix (or end of string) so
    # concatenated URLs are still separated.
    urls = re.findall(r"https?://.*?(?=https?://|$)", text)
    # Trim whitespace and common trailing separators (ASCII and full-width).
    return [url.strip(" ,|;;") for url in urls]
81
+
82
+
83
def split_image_paths(text: str, separators: List[str] = None) -> List[str]:
    """Extract image paths (HTTP URLs and local file paths) from *text*.

    URL-aware: tries hard not to split URLs that themselves contain commas
    or other would-be separator characters.

    Args:
        text: string containing one or more image paths.
        separators: explicit separator list, e.g. [",", ";"] or ["\\n"].
            When None, the default is [";", "\\n", "\\r"] and a URL-aware
            fast path is tried first.

    Returns:
        List[str]: extracted URL/path strings ([] for empty/NaN input).
    """
    if not text or pd.isna(text):
        return []

    text = str(text).strip()
    if not text:
        return []

    # With caller-supplied separators, skip the URL heuristics entirely and
    # split exactly as requested.
    if separators is not None:
        pass
    else:
        # No explicit separators: first try to pull out complete HTTP(S) URLs.
        url_pattern = r'https?://[^\s<>"\';]*'
        urls = re.findall(url_pattern, text)

        # If URLs were found, check whether they account for the whole text.
        if urls:
            # A single URL covering (almost) the whole text: return it as-is.
            if len(urls) == 1 and len(urls[0]) > len(text) * 0.8:
                return [urls[0]]

            # Several URLs: return them all.
            if len(urls) > 1:
                return urls

            # Exactly one URL that does not cover the text: other paths may
            # exist, so fall through to separator-based splitting.

    # Separator-based splitting (mixed content, or no URL found).
    # Commas are deliberately avoided: URL query strings often contain them.

    # Default separator list.
    if separators is None:
        separators = [";", "\n", "\r"]  # never comma — URL parameters often contain commas

    # Split on one or many separators.
    if len(separators) == 1:
        # Single separator: plain split.
        paths = text.split(separators[0])
    else:
        # Multiple separators: build an alternation regex.
        escaped_separators = [re.escape(sep) for sep in separators]
        pattern = "|".join(escaped_separators)
        paths = re.split(pattern, text)

    # Clean up and keep only plausible URL / file-path entries.
    cleaned_paths = []
    for path in paths:
        path = path.strip()
        if path:
            # HTTP(S) URL?
            if re.match(r"https?://", path):
                cleaned_paths.append(path)
            # Otherwise: does it look like a file path?
            elif (
                path.startswith("./")
                or path.startswith("../")
                or path.startswith("/")
                or path.startswith("\\")
                or re.match(r"^[A-Za-z]:[/\\]", path)  # Windows absolute path such as C:\ or D:/
                or (
                    ("/" in path or "\\" in path)
                    and (
                        "." in path
                        or path.endswith(
                            (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp")
                        )
                    )
                )
            ):
                cleaned_paths.append(path)

    return cleaned_paths
maque/utils/net.py ADDED
@@ -0,0 +1,64 @@
1
+ import socket
2
+ import re
3
+
4
+
5
def get_inner_ip():
    """Return the local (intranet) IP address of this host.

    Opens a UDP socket toward a public resolver; connect() on UDP sends no
    packet — it only selects the outbound interface, whose address is then
    read back via getsockname().
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        probe.connect(('8.8.8.8', 80))
        return probe.getsockname()[0]
    finally:
        probe.close()
9
+
10
+
11
def get_outer_ip():
    """Return the public (internet-facing) IP address via ifconfig.me."""
    import requests

    response = requests.get('http://ifconfig.me/ip', timeout=1)
    return response.text.strip()
19
+
20
+
21
def get_ip(env="inner"):
    """Return this host's IP address.

    Args:
        env: "inner" for the LAN address (get_inner_ip) or "outer" for the
            public address (get_outer_ip).

    Raises:
        ValueError: for any other *env* value.
    """
    if env == "outer":
        return get_outer_ip()
    if env == "inner":
        return get_inner_ip()
    raise ValueError("`env` invalid!")
31
+
32
+
33
def domain2ip(*domains):
    """Resolve each domain name to an IPv4 address.

    Args:
        *domains: domain names; surrounding whitespace is ignored.

    Returns:
        list[tuple[str, str]]: (domain, ip) pairs in input order.
    """
    pairs = []
    for domain in domains:
        pairs.append((domain, socket.gethostbyname(domain.strip())))
    return pairs
36
+
37
+
38
def get_github_ip():
    """Scrape ipaddress.com for the IP addresses of GitHub-related hosts.

    Returns:
        list: [hostname, ips] pairs where *ips* is a set of dotted-quad
        strings, or None when the lookup page did not return HTTP 200.
    """
    import requests

    def get_ip(website):
        # Pull <strong>x.x.x.x</strong> entries out of the lookup page HTML.
        request = requests.get('https://ipaddress.com/website/' + website)
        domain_ip = None
        if request.status_code == 200:
            ips = re.findall(r"<strong>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}?)</strong>", request.text)
            domain_ip = {ip for ip in ips}
        return domain_ip

    ip_list = ['github.com',
               'github.global.ssl.fastly.net',
               'assets-cdn.github.com',
               # 'codeload.github.com',
               # 'google.com'
               ]
    return [[i, get_ip(i)] for i in ip_list]
55
+
56
+
57
+ # getip('assets-cdn.github.com')
58
+ # getip('github.global.ssl.fastly.net')
59
+
60
if __name__ == "__main__":
    # Ad-hoc smoke test: print local / public IPs and a few resolved domains.
    print(get_inner_ip())
    print(get_outer_ip())
    print(domain2ip("www.baidu.com", "www.google.com", "github.com"))
    # print(get_github_ip())
@@ -0,0 +1,140 @@
1
+ import subprocess
2
+ import io
3
+ import csv
4
+ import collections
5
+
6
+
7
def commandexists(shellcommand):
    """Return True when *shellcommand* exits successfully in a shell.

    A non-zero exit status is reported on stdout and treated as
    "command not available".
    """
    status, _output = subprocess.getstatusoutput(shellcommand)
    if status != 0:
        print("Could not execute: {0}".format(shellcommand))
        return False
    return True
13
+
14
+
15
def command(args):
    """Run *args* (an argv list, no shell) and return its stdout as text."""
    raw = subprocess.check_output(args)
    return raw.decode()
17
+
18
+
19
def csvtodictdict(csvdata, colnames, keycols, fmtcols=None):
    """Build a dict of dicts from CSV data.

    Args:
        csvdata: iterable of CSV text lines (anything csv.reader accepts).
        colnames: column names, positionally matched to each parsed row.
        keycols: a single column name, or a list of names, forming the
            primary key; key columns are popped out of each row dict.
        fmtcols: optional {colname: callable} applied to every cell of that
            column; unlisted columns pass through unchanged.

    Returns:
        dict: {key: {colname: value, ...}}; the key is a tuple when
        *keycols* names several columns.
    """
    # Fix: default was a mutable `{}` (shared-state pitfall); use None.
    formatters = collections.defaultdict(lambda: lambda x: x, **(fmtcols or {}))
    table = {}
    for row in csv.reader(csvdata):
        record = {name: formatters[name](cell) for name, cell in zip(colnames, row)}
        if isinstance(keycols, str):
            key = record.pop(keycols)
        else:
            key = tuple(record.pop(k) for k in keycols)
        table[key] = record
    return table
35
+
36
+
37
def csvheaderargs(fmtcol, cols):
    """Format every column name with *fmtcol* and join them with commas."""
    formatted = (fmtcol.format(col) for col in cols)
    return ",".join(formatted)
39
+
40
+
41
def commandtodictdict(baseargs, cols, keycols=None, queryargfmt="{0}", colargfmt="{0}", outputfmt=None, skipheader=False):
    """Run a CSV-emitting command and parse its output into a dict of dicts.

    Args:
        baseargs: base argv list for the command.
        cols: column names requested from, and used to parse, the output.
        keycols: key column(s); defaults to the first entry of *cols*.
        queryargfmt: format wrapping the joined column list into the final
            CLI argument (e.g. "--query-gpu={0}").
        colargfmt: per-column format applied before joining.
        outputfmt: optional {colname: callable} cell formatters.
        skipheader: when True, drop the first output line (the CSV header).

    Returns:
        dict: parsed rows keyed by *keycols* (see csvtodictdict).
    """
    queryarg = queryargfmt.format(csvheaderargs(colargfmt, cols))
    args = baseargs + [queryarg]
    csvoutput = io.StringIO(command(args))
    if skipheader:
        csvoutput.readline()
    if keycols is None:
        keycols = cols[0]
    # Fix: default was a mutable `{}` (shared-state pitfall); use None and
    # substitute an empty dict at the call site.
    return csvtodictdict(csvoutput, cols, keycols, fmtcols=outputfmt or {})
50
+
51
+
52
def renamekeys(d, names):
    """Rename keys of *d* in place according to the {old: new} map *names*.

    Returns:
        dict: the same (mutated) dict, returned for chaining.
    """
    for old_key, new_key in names.items():
        d[new_key] = d.pop(old_key)
    return d
60
+
61
+
62
def docker_gpu_stat():
    """Print a table of which docker containers are using which GPUs.

    Joins `docker ps` / `docker stats` output with `nvidia-smi` per-GPU and
    per-compute-process queries, matching container PIDs (via `docker top`)
    against GPU process PIDs, then prints an aligned, tab-separated table.
    """
    # get results of all commands without container arguments
    dockerps = commandtodictdict(['docker', 'ps', '--format'],
                                 ['ID', 'Image', 'Ports'],
                                 keycols='ID',
                                 queryargfmt="'{0}'",
                                 colargfmt="{{{{.{0}}}}}",
                                 outputfmt={'ID': lambda s: s[1:]})  # strip the leading quote from queryargfmt
    dockerstats = commandtodictdict(['docker', 'stats', '--no-stream', '--format'],
                                    ['Container', 'MemUsage', 'CPUPerc'],
                                    keycols='Container',
                                    queryargfmt="'{0}'",
                                    colargfmt="{{{{.{0}}}}}",
                                    outputfmt={'Container': lambda s: s[1:]})
    unitstats = commandtodictdict(['nvidia-smi', '--format=csv'],
                                  ['gpu_uuid', 'utilization.gpu', 'utilization.memory'],
                                  keycols='gpu_uuid',
                                  queryargfmt="--query-gpu={0}",
                                  outputfmt={'gpu_uuid': lambda s: s.lstrip()},
                                  skipheader=True)
    unitprocstats = commandtodictdict(['nvidia-smi', '--format=csv'],
                                      ['pid', 'process_name', 'gpu_uuid', 'used_memory'],
                                      keycols=['pid', 'gpu_uuid'],
                                      queryargfmt="--query-compute-apps={0}",
                                      outputfmt={'gpu_uuid': lambda s: s.lstrip()},
                                      skipheader=True)

    # map gpu_uuids to short ids in unit info, then rename columns
    shortunitids = {gpu_uuid: "{0}".format(shortid) for gpu_uuid, shortid in
                    zip(unitstats.keys(), range(len(unitstats)))}
    colnames = {'utilization.gpu': 'used_gpu'}
    unitstats = {shortunitids[gpu_uuid]: renamekeys(stats, colnames) for gpu_uuid, stats in unitstats.items()}
    unitprocstats = {(pid, shortunitids[gpu_uuid]): stats for (pid, gpu_uuid), stats in unitprocstats.items()}

    # display fmt data: column name -> display width
    basedisplaycols = collections.OrderedDict([('Container', 12),
                                               ('Image', 18)])
    optdisplaycols = collections.OrderedDict([('pid', 7),
                                              ('gpu_uuid', 8),
                                              ('used_memory', 12),
                                              ('used_gpu', 9)])
    displaycols = collections.OrderedDict(list(basedisplaycols.items()) +
                                          list(optdisplaycols.items()))

    # display fmt strings: '{Name:W.W}' pads AND truncates each column to W chars
    basedisplayfmt = '\t'.join(['{{{0}:{1}.{1}}}'.format(col, width) for col, width in basedisplaycols.items()])
    optdisplayfmt = '\t'.join(['{{{0}:{1}.{1}}}'.format(col, width) for col, width in optdisplaycols.items()])
    displayfmt = '\t'.join([basedisplayfmt, optdisplayfmt])

    # print rows of relevant container processes
    # (everything below a bit janky in terms of argument expectations and generalization)
    dockerall = {container: {**dockerps[container], **dockerstats[container]} for container in dockerstats.keys()}
    someunitsactive = False
    print(displayfmt.format(**{col: col for col in displaycols.keys()}))
    for container, dockerinfo in dockerall.items():
        # very particular incantation needed here for top options to function correctly:
        # https://www.projectatomic.io/blog/2016/01/understanding-docker-top-and-ps/
        pids = command(['docker', 'top', container, '-eo', 'pid']).split('\n')[1:-1]  # obviously could be a bit brittle

        # keep only GPU processes whose PID belongs to this container
        containerunitstatslist = [((proc, unit), stats) for (proc, unit), stats in sorted(unitprocstats.items()) if
                                  proc in pids]
        containerunitstats = collections.OrderedDict(containerunitstatslist)

        if containerunitstats:
            someunitsactive = True
            basedisplaystr = basedisplayfmt.format(Container=container, **dockerinfo)
            print(basedisplaystr)
            for (pid, gpu_uuid), stats in containerunitstats.items():
                print(optdisplayfmt.rjust(99).format(pid=pid, gpu_uuid=gpu_uuid, **stats, **unitstats[gpu_uuid]))
    if not someunitsactive:
        print("\n\t\t no gpu units being used by docker containers ")
133
+
134
+
135
if __name__ == '__main__':
    # check for existence of docker and nvidia-smi commands before querying
    if commandexists('docker') and commandexists('nvidia-smi'):
        docker_gpu_stat()
    else:
        print('Command(s) not found')
maque/utils/ops.py ADDED
@@ -0,0 +1,53 @@
1
+ import re
2
+
3
+
4
def find_all_index(pattern, string, flags=0):
    """Return the (start, end) span of every match of *pattern* in *string*."""
    spans = []
    for match in re.finditer(pattern, string, flags=flags):
        spans.append(match.span())
    return spans
7
+
8
+
9
def string_add(string: str, dx=1):
    """Add *dx* to a dotted, zero-padded numeric string.

    The dots are removed, the remaining digits are incremented as a single
    integer (keeping the zero-padded width), and the dots are re-inserted
    at their original positions, e.g. "1.2.9" -> "1.3.0".
    """
    dot_positions = [m.span()[0] for m in re.finditer(r"\.", string)]
    digits = "".join(string.split("."))
    width = len(digits)
    bumped = f"{int(digits) + dx:0>{width}d}"
    chars = list(bumped)
    for pos in dot_positions:
        chars.insert(pos, ".")
    return "".join(chars)
21
+
22
+
23
def index_char(size=1000):
    """Build index<->character lookup tables for code points [0, size).

    Returns:
        tuple[dict, dict]: (index -> chr(index), chr(index) -> index).
    """
    pairs = [(i, chr(i)) for i in range(size)]
    index_to_char = dict(pairs)
    char_to_index = {char: idx for idx, char in pairs}
    return index_to_char, char_to_index
31
+
32
+
33
def find_match(start, end, S, flag=0):
    """Return the substring of *S* strictly between the first match of
    *start* and the first match of *end* (both boundary matches excluded).

    Args:
        start: regex marking the left boundary.
        end: regex marking the right boundary.
        S: string to search.
        flag: re flags, e.g. re.I (ignore case), re.M (multiline),
            re.S (dot matches newline too), re.X (verbose pattern).

    Returns:
        str | None: the enclosed substring, or None when either boundary is
        absent (a message is printed, preserving the best-effort behavior).
    """
    # Fix: narrowed from a bare `except:` (which also swallowed
    # KeyboardInterrupt/SystemExit) to the actual no-match failure mode —
    # re.search returning None, hence AttributeError on .span().
    try:
        lo = re.search(start, S, flags=flag).span()[1]
        hi = re.search(end, S, flags=flag).span()[0]
    except AttributeError:
        print("Do not match anything.")
        return None
    return S[lo:hi]
49
+
50
+
51
def find_match2(pattern, S, flag=0):
    """Return the first match of *pattern* in *S*, or None when absent.

    Fix: the original called `.group()` on the raw re.search result and so
    raised AttributeError on a failed search; returning None instead makes
    it consistent with find_match's no-match behavior.
    """
    res = re.search(pattern, S, flags=flag)
    return res.group() if res is not None else None
@@ -0,0 +1,31 @@
1
+ import importlib.metadata
2
+ import importlib.util
3
+
4
+
5
+ def _is_package_available(name: str) -> bool:
6
+ return importlib.util.find_spec(name) is not None
7
+
8
+
9
+ def _get_package_version(name: str) -> str:
10
+ try:
11
+ return importlib.metadata.version(name)
12
+ except Exception:
13
+ return "0.0.0"
14
+
15
+
16
+ def is_flash_attn2_available():
17
+ return _is_package_available("flash_attn") and _get_package_version("flash_attn").startswith("2")
18
+
19
+
20
+ def is_jieba_available():
21
+ return _is_package_available("jieba")
22
+
23
+ def is_levenshtein_available():
24
+ return _is_package_available("Levenshtein")
25
+
26
+ def is_nltk_available():
27
+ return _is_package_available("nltk")
28
+
29
+
30
+ def is_rouge_available():
31
+ return _is_package_available("rouge_chinese")
maque/utils/path.py ADDED
@@ -0,0 +1,57 @@
1
+ from pathlib import Path
2
+ from typing import Union, List
3
+ import inspect
4
+ from glob import glob as _glob
5
+ import os
6
+ import sys
7
+
8
+
9
+ def rel_to_abs(rel_path: Union[str, Path], parents=0, return_str=True, strict=False):
10
+ """Return absolute path relative to the called file
11
+ args:
12
+ parent: <int> The number of times `f_back` will be calledd.
13
+ """
14
+ currentframe = inspect.currentframe()
15
+ f = currentframe.f_back
16
+ for _ in range(parents):
17
+ f = f.f_back
18
+ current_path = Path(f.f_code.co_filename).parent
19
+ pathlib_path = current_path / rel_path
20
+ pathlib_path = pathlib_path.resolve(strict=strict)
21
+ if return_str:
22
+ return str(pathlib_path)
23
+ else:
24
+ return pathlib_path
25
+
26
+
27
def rel_path_join(*paths: Union[str, Path], return_str=True):
    """Join *paths* and resolve the result relative to the CALLER's file.

    parents=1 skips this wrapper's own frame so resolution is anchored at
    the caller of rel_path_join rather than at this module.
    """
    return rel_to_abs(os.path.join(*paths), parents=1, return_str=return_str)
29
+
30
+
31
def ls(_dir, *patterns, relp=True, concat='extend', recursive=False) -> List[str]:
    """Glob one or more *patterns* inside *_dir*.

    Args:
        _dir: directory to search; when *relp* is True it is interpreted
            relative to the CALLER's file (via rel_to_abs, parents=1).
        *patterns: glob patterns, e.g. "*.jpg", "*.png".
        relp: resolve *_dir* relative to the calling file.
        concat: 'extend' flattens all matches into one list; any other
            value appends one sub-list per pattern.
        recursive: forwarded to glob (enables "**" patterns).

    Returns:
        List[str]: matched paths (a list of lists when concat != 'extend').

    Example:
    --------
    >>> ls("./data/", "*.jpg", "*.png")
    """
    if relp:
        _dir = rel_to_abs(_dir, parents=1, return_str=True, strict=False)
    path_list = []
    for pattern in patterns:
        if concat == 'extend':
            path_list.extend(_glob(os.path.join(_dir, pattern), recursive=recursive))
        else:
            path_list.append(_glob(os.path.join(_dir, pattern), recursive=recursive))
    return path_list
46
+
47
+
48
def add_env_path(*rel_paths: str):
    """Append paths (interpreted relative to the CALLING file) to sys.path.

    Example:
    --------
    >>> add_env_path('..')
    >>> add_env_path('..', '../..')
    """

    # parents=1 anchors resolution at the caller of add_env_path.
    for i in rel_paths:
        sys.path.append(rel_to_abs(i, parents=1))