addftool 0.2.7-py3-none-any.whl → 0.2.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- addftool/addf_portal.py +4 -0
- addftool/sleep.py +218 -0
- {addftool-0.2.7.dist-info → addftool-0.2.8.dist-info}/METADATA +3 -2
- {addftool-0.2.7.dist-info → addftool-0.2.8.dist-info}/RECORD +7 -6
- {addftool-0.2.7.dist-info → addftool-0.2.8.dist-info}/WHEEL +1 -1
- {addftool-0.2.7.dist-info → addftool-0.2.8.dist-info}/entry_points.txt +0 -0
- {addftool-0.2.7.dist-info → addftool-0.2.8.dist-info}/top_level.txt +0 -0
addftool/addf_portal.py
CHANGED
@@ -3,6 +3,7 @@ from addftool.process import add_killer_args, killer_main
 from addftool.sync import add_sync_args, sync_main
 from addftool.deploy import add_deploy_args, deploy_main
 from addftool.broadcast_folder import add_broadcast_folder_args, broadcast_folder_main
+from addftool.sleep import add_sleep_args, sleep_main

 from addftool.blob import add_blob_args, blob_main

@@ -16,6 +17,7 @@ def get_args():
     add_deploy_args(subparsers)
     add_broadcast_folder_args(subparsers)
     add_blob_args(subparsers)
+    add_sleep_args(subparsers)

     return parser.parse_args()

@@ -32,6 +34,8 @@ def main():
         broadcast_folder_main(args)
     elif args.command == "blob":
         blob_main(args)
+    elif args.command == "sleep":
+        sleep_main(args)
     else:
         print("Unknown command: ", args.command)

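The hook-up above only registers the new subcommand and forwards the parsed arguments to addftool.sleep, the module added in full below. As a minimal, hypothetical sketch (not part of the wheel), the same entry point can be driven from Python by building a namespace with the attributes that add_args in addftool/sleep.py defines; the names and defaults mirror that parser, and the sketch assumes addftool 0.2.8 is installed on a machine with CUDA GPUs and nvidia-smi available.

from argparse import Namespace
from addftool.sleep import sleep_main

args = Namespace(
    matrix_size=8192,      # square matrix size multiplied on each GPU
    time_duration=4.0,     # seconds of matmul work per burst
    sleep_duration=1.0,    # pause between bursts
    util_threshold=20,     # utilization (%) above which a GPU counts as occupied
    memory_threshold=-1,   # compared against nvidia-smi's used-memory figure (MB); -1 disables the check
)
sleep_main(args)  # blocks: alternates GPU occupancy checks with matmul bursts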
addftool/sleep.py
ADDED
@@ -0,0 +1,218 @@
import time
import subprocess
import sys
import multiprocessing as mp

try:
    import torch
except ImportError:
    print("PyTorch is not installed. Please install it to run this script.")
    sys.exit(1)


def get_gpu_stats(device_id):
    """Get the utilization and memory usage of the given GPU."""
    try:
        cmd = f"nvidia-smi --id={device_id} --query-gpu=utilization.gpu,memory.used --format=csv,noheader,nounits"
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"Error running nvidia-smi for GPU {device_id}")
            return None, None

        # Parse the output
        output = result.stdout.strip()
        if output:
            parts = output.split(',')
            if len(parts) == 2:
                gpu_util = int(parts[0])  # GPU utilization in percent
                memory_used = int(parts[1])  # GPU memory used (MB)
                return gpu_util, memory_used

        return None, None
    except Exception as e:
        print(f"Error getting GPU stats for device {device_id}: {e}")
        return None, None

def check_gpu_occupied(device_id, util_threshold=20, memory_threshold=2048):
    """Check whether the GPU is occupied by other processes.

    Args:
        device_id: GPU device ID
        util_threshold: GPU utilization threshold (default 20%)
        memory_threshold: memory usage threshold (default 2048 MB = 2 GB)

    Returns:
        bool: True if the GPU is occupied, False if it is idle
    """
    gpu_util, memory_used = get_gpu_stats(device_id)

    if gpu_util is None or memory_used is None:
        # Be conservative when the stats cannot be read
        return True

    # Decide whether the GPU is occupied
    is_occupied = gpu_util > util_threshold or (memory_threshold > 0 and memory_used > memory_threshold)

    if is_occupied:
        print(f"GPU {device_id}: Util={gpu_util}%, Memory={memory_used}MB - Occupied")

    return is_occupied

def check_all_gpus(num_gpus, util_threshold=20, memory_threshold=-1):
    """Check whether any GPU is occupied."""
    for device_id in range(num_gpus):
        if check_gpu_occupied(device_id, util_threshold, memory_threshold):
            return True, device_id
    return False, -1

def get_all_gpu_status(num_gpus):
    """Print status information for all GPUs."""
    print("\nGPU Status:")
    print("-" * 50)
    for device_id in range(num_gpus):
        gpu_util, memory_used = get_gpu_stats(device_id)
        if gpu_util is not None and memory_used is not None:
            status = "Available" if (gpu_util <= 20 and memory_used <= 2048) else "Occupied"
            print(f"GPU {device_id}: Util={gpu_util:3d}%, Memory={memory_used:5d}MB - {status}")
        else:
            print(f"GPU {device_id}: Unable to get stats")
    print("-" * 50)

def matrix_multiply_worker(matrix_size=8192, time_duration=4.0, sleep_duration=1.0, util_threshold=20, memory_threshold=-1):
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        print("No GPUs available!")
        return

    matrices = {}
    # print(f"Creating {matrix_size}x{matrix_size} matrices on all GPUs...")
    for device_id in range(num_gpus):
        device = torch.device(f'cuda:{device_id}')
        matrices[device_id] = {
            'a': torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32),
            'b': torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32)
        }

    # Main loop
    while True:
        try:
            # Check whether any GPU is occupied
            has_occupied_gpu, occupied_gpu = check_all_gpus(num_gpus, util_threshold, memory_threshold)
            if has_occupied_gpu:
                break

            start_time = time.time()
            perform_count = 0
            while True:
                # Run matrix multiplications on all GPUs at the same time
                results = {}
                for device_id in range(num_gpus):
                    results[device_id] = torch.matmul(matrices[device_id]['a'], matrices[device_id]['b'])

                perform_count += 1

                if perform_count % 10 == 0:
                    for device_id in range(num_gpus):
                        torch.cuda.synchronize(device_id)

                torch.cuda.synchronize()  # make sure all GPU operations have finished
                elapsed_time = time.time() - start_time
                if elapsed_time > time_duration:
                    break

            # Clean up memory

            time.sleep(sleep_duration)

        except KeyboardInterrupt:
            print("\nKeyboardInterrupt received, stopping...")
            stop_flag = True
            exit(0)
        except Exception as e:
            print(f"\nError occurred: {e}")
            # Try to free GPU memory
            try:
                for device_id in range(num_gpus):
                    torch.cuda.set_device(device_id)
                    torch.cuda.empty_cache()
            except:
                pass
            time.sleep(5)

def sleep_main(args):
    # Set the multiprocessing start method
    mp.set_start_method('spawn', force=True)

    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        print("No GPUs available!")
        exit(1)

    # Show the initial GPU status
    get_all_gpu_status(num_gpus)

    current_process = None

    # Main loop
    while True:
        try:
            # Check whether any GPU is occupied
            has_occupied_gpu, occupied_gpu = check_all_gpus(num_gpus, util_threshold=args.util_threshold, memory_threshold=args.memory_threshold)

            if has_occupied_gpu:
                # Hold for 60 seconds
                print("Holding for 60 seconds...")
                time.sleep(60)

            else:
                # GPUs are idle, start the matrix multiplication process
                current_process = mp.Process(
                    target=matrix_multiply_worker,
                    args=(args.matrix_size, args.time_duration, args.sleep_duration, args.util_threshold, args.memory_threshold),
                )
                current_process.start()
                current_process.join()

        except KeyboardInterrupt:
            print("\nKeyboardInterrupt received, stopping...")
            stop_flag = True
            break
        except Exception as e:
            print(f"\nError occurred: {e}")
            time.sleep(5)

    print("\nProgram stopped")


def add_sleep_args(subparsers):
    sleep_parser = subparsers.add_parser('sleep', help='Sleep for a while and check GPU status')
    add_args(sleep_parser)


def add_args(parser):
    parser.add_argument('--matrix_size', type=int, default=8192, help='Size of the matrices to multiply')
    parser.add_argument('--time_duration', type=float, default=4.0, help='Duration to perform matrix multiplication')
    parser.add_argument('--sleep_duration', type=float, default=1.0, help='Duration to sleep between checks')

    parser.add_argument('--util_threshold', type=int, default=20, help='GPU utilization threshold to consider it occupied')
    parser.add_argument('--memory_threshold', type=int, default=-1, help='Memory usage threshold (in GB) to consider it occupied, set to -1 to disable')


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Sleep and check GPU status')
    add_args(parser)
    args = parser.parse_args()
    while True:
        try:
            sleep_main(args)
        except KeyboardInterrupt:
            print("\nKeyboardInterrupt received, exiting...")
            sys.exit(0)
        except Exception as e:
            print(f"\nUnexpected error: {e}")
            print("Restarting the program in 5 seconds...")
            time.sleep(5)
            continue
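The helpers above shell out to nvidia-smi and treat a GPU as occupied once utilization or used memory crosses a threshold; note that check_gpu_occupied compares the threshold against the MB figure nvidia-smi reports, even though the --memory_threshold help text says GB. Below is a short, hypothetical sketch (not shipped with the package, assuming a host with NVIDIA drivers and PyTorch installed) of calling the occupancy helpers on their own, outside the sleep loop.

import torch
from addftool.sleep import get_all_gpu_status, check_all_gpus

num_gpus = torch.cuda.device_count()
get_all_gpu_status(num_gpus)  # prints a per-GPU utilization/memory table

occupied, device_id = check_all_gpus(num_gpus, util_threshold=20, memory_threshold=-1)
if occupied:
    print(f"GPU {device_id} looks busy; sleep_main would hold for 60 seconds")
else:
    print("All GPUs idle; sleep_main would launch the matmul worker")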
{addftool-0.2.7.dist-info → addftool-0.2.8.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: addftool
-Version: 0.2.7
+Version: 0.2.8
 Requires-Dist: cryptography
 Requires-Dist: requests
 Requires-Dist: PyYAML
@@ -8,3 +8,4 @@ Requires-Dist: psutil
 Requires-Dist: fabric
 Requires-Dist: gevent
 Requires-Dist: parallel-ssh
+
{addftool-0.2.7.dist-info → addftool-0.2.8.dist-info}/RECORD
CHANGED

@@ -1,7 +1,8 @@
 addftool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-addftool/addf_portal.py,sha256=
+addftool/addf_portal.py,sha256=vc8opPzValNFPwJne5C5LbZvgcJ0eMBJSWDSiM23OPM,1274
 addftool/blob.py,sha256=y1HZaDBUNeXicVytvwpRXwufvvrgxR33ruBlYpxnSa4,9453
 addftool/broadcast_folder.py,sha256=GQBuSL8Ch537V_fSBHesWyqT3KRYry68pbYOKy2bDj4,19619
+addftool/sleep.py,sha256=FA1fTUI47eQq-9nBtXElkS7SZMunP_5tLiIBuFNSM6w,7823
 addftool/sync.py,sha256=ZpYxbM8uiPFrV7ODmOaM7asVPCWaxBixA-arVc-1kfs,14045
 addftool/tool.py,sha256=FmxRY3-pP0_Z0zCUAngjmEMmPUruMftg_iUlB1t2TnQ,2001
 addftool/util.py,sha256=zlNLu8Be8cGIpNRqBw8_0q7nFxWlsJ9cToN62ohjdXE,2335
@@ -12,8 +13,8 @@ addftool/deploy/vscode_server.py,sha256=tLtSvlcK2fEOaw6udWt8dNELVhwv9F59hF5DJJ-1
 addftool/process/__init__.py,sha256=Dze8OrcyjQlAbPrjE_h8bMi8W4b3OJyZOjTucPrkJvM,3721
 addftool/process/utils.py,sha256=JldxnwanLJOgxaPgmCJh7SeBRaaj5rFxWWxh1hpsvbA,2609
 addftool/ssh/__init__.py,sha256=h5_rCO0A6q2Yw9vFguQZZp_ApAJsT1dcnKnbKKZ0cDM,4409
-addftool-0.2.
-addftool-0.2.
-addftool-0.2.
-addftool-0.2.
-addftool-0.2.
+addftool-0.2.8.dist-info/METADATA,sha256=rxu5Oy4lH7lQF99Z8gzz5QuoGxnZ739h0OBNhr_0NA0,221
+addftool-0.2.8.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+addftool-0.2.8.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
+addftool-0.2.8.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
+addftool-0.2.8.dist-info/RECORD,,
{addftool-0.2.7.dist-info → addftool-0.2.8.dist-info}/entry_points.txt
File without changes

{addftool-0.2.7.dist-info → addftool-0.2.8.dist-info}/top_level.txt
File without changes