addftool-0.2.7-py3-none-any.whl → addftool-0.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
addftool/addf_portal.py CHANGED
@@ -3,6 +3,7 @@ from addftool.process import add_killer_args, killer_main
 from addftool.sync import add_sync_args, sync_main
 from addftool.deploy import add_deploy_args, deploy_main
 from addftool.broadcast_folder import add_broadcast_folder_args, broadcast_folder_main
+from addftool.sleep import add_sleep_args, sleep_main
 
 from addftool.blob import add_blob_args, blob_main
 
@@ -16,6 +17,7 @@ def get_args():
     add_deploy_args(subparsers)
     add_broadcast_folder_args(subparsers)
     add_blob_args(subparsers)
+    add_sleep_args(subparsers)
 
     return parser.parse_args()
 
@@ -32,6 +34,8 @@ def main():
         broadcast_folder_main(args)
     elif args.command == "blob":
         blob_main(args)
+    elif args.command == "sleep":
+        sleep_main(args)
     else:
         print("Unknown command: ", args.command)
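The portal wires each subcommand in three places: an import, a parser registration in get_args(), and a dispatch branch in main(). A minimal self-contained sketch of the same argparse pattern, using a hypothetical demo subcommand in place of the package's real ones:

    import argparse

    def add_demo_args(subparsers):
        # Register a subcommand; its name lands in args.command because
        # this sketch creates the subparsers object with dest="command".
        demo = subparsers.add_parser("demo", help="illustrative subcommand")
        demo.add_argument("--n", type=int, default=1)

    def demo_main(args):
        print(f"demo ran with n={args.n}")

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command")
    add_demo_args(subparsers)
    args = parser.parse_args(["demo", "--n", "3"])
    if args.command == "demo":
        demo_main(args)

The new sleep entry follows exactly this shape: add_sleep_args(subparsers) registers the parser, and the elif on args.command == "sleep" routes to sleep_main.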
addftool/blob.py CHANGED
@@ -76,6 +76,14 @@ def install_main(args):
     # if has root permission, run install script
     # else, print install script
 
+    # make sure wget is installed
+    if not check_package_installed("wget"):
+        print("wget is not installed, installing wget")
+        command = "apt-get install wget -y"
+        if args.sudo:
+            command = "sudo " + command
+        execute_command(command, script_writer)
+
     print("Get ubuntu version: ", ubuntu_version)
     command = f"wget https://packages.microsoft.com/config/ubuntu/{ubuntu_version}/packages-microsoft-prod.deb -O /tmp/packages-microsoft-prod.deb"
     print("Install packages-microsoft-prod.deb")
addftool/sleep.py ADDED
@@ -0,0 +1,337 @@
+import time
+import subprocess
+import sys
+import multiprocessing as mp
+import re
+
+try:
+    import torch
+except ImportError:
+    print("PyTorch is not installed. Please install it to run this script.")
+    sys.exit(1)
+
+try:
+    import triton
+    import triton.runtime.driver
+except ImportError:
+    print("Triton is not installed. Will try to detect GPU type using command line tools.")
+    triton = None
+
+def is_cuda():
+    """Use triton to detect whether this is a CUDA environment."""
+    try:
+        if triton is None:
+            return None
+        return triton.runtime.driver.active.get_current_target().backend == "cuda"
+    except Exception:
+        return None
+
+def get_gpu_type():
+    """Detect the GPU type (NVIDIA/CUDA or AMD/ROCm)."""
+    # First try detection via triton
+    cuda_detected = is_cuda()
+    if cuda_detected is True:
+        return "nvidia"
+    elif cuda_detected is False:
+        return "amd"
+
+    # If triton detection fails, fall back to command-line detection
+    try:
+        # Try to detect an NVIDIA GPU
+        result = subprocess.run("nvidia-smi", shell=True, capture_output=True, text=True)
+        if result.returncode == 0:
+            return "nvidia"
+
+        # Try to detect an AMD GPU
+        result = subprocess.run("rocm-smi", shell=True, capture_output=True, text=True)
+        if result.returncode == 0:
+            return "amd"
+
+        return None
+    except Exception:
+        return None
+
+def get_gpu_stats(device_id):
+    """Get utilization and memory usage for the given GPU (NVIDIA and AMD supported)."""
+    gpu_type = get_gpu_type()
+
+    if gpu_type == "nvidia":
+        return get_nvidia_gpu_stats(device_id)
+    elif gpu_type == "amd":
+        return get_amd_gpu_stats(device_id)
+    else:
+        print("No supported GPU found (neither NVIDIA nor AMD)")
+        return None, None
+
+def get_nvidia_gpu_stats(device_id):
+    """Get stats for an NVIDIA GPU."""
+    try:
+        cmd = f"nvidia-smi --id={device_id} --query-gpu=utilization.gpu,memory.used --format=csv,noheader,nounits"
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+
+        if result.returncode != 0:
+            print(f"Error running nvidia-smi for GPU {device_id}")
+            return None, None
+
+        # Parse the output
+        output = result.stdout.strip()
+        if output:
+            parts = output.split(',')
+            if len(parts) == 2:
+                gpu_util = int(parts[0])     # GPU utilization percentage
+                memory_used = int(parts[1])  # memory used (MB)
+                return gpu_util, memory_used
+
+        return None, None
+    except Exception as e:
+        print(f"Error getting NVIDIA GPU stats for device {device_id}: {e}")
+        return None, None
+
+def get_amd_gpu_stats(device_id):
+    """Get stats for an AMD GPU."""
+    try:
+        # Get GPU utilization and memory usage
+        cmd = f"rocm-smi -d {device_id} --showuse --showmemuse"
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+
+        if result.returncode != 0:
+            # Try a fallback command
+            cmd = f"rocm-smi -d {device_id}"
+            result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+            if result.returncode != 0:
+                print(f"Error running rocm-smi for GPU {device_id}")
+                return None, None
+
+        gpu_util = None
+        memory_used = None
+
+        # Parse the output
+        output = result.stdout
+        lines = output.split('\n')
+
+        for line in lines:
+            # Look for GPU utilization
+            if 'GPU use' in line or '%' in line:
+                # Match a percentage
+                match = re.search(r'(\d+)%', line)
+                if match:
+                    gpu_util = int(match.group(1))
+
+            # Look for memory usage (MB)
+            if 'vram' in line.lower() or 'memory' in line.lower() or 'MB' in line:
+                # Match an MB value; the format may be "1024 MB" or "1024MB"
+                match = re.search(r'(\d+)\s*MB', line, re.IGNORECASE)
+                if match:
+                    memory_used = int(match.group(1))
+
+        # If utilization still could not be read, assume the GPU is idle (0)
+        if gpu_util is None:
+            gpu_util = 0
+
+        # If memory usage still could not be read, default to 0
+        if memory_used is None:
+            memory_used = 0
+
+        return gpu_util, memory_used
+
+    except Exception as e:
+        print(f"Error getting AMD GPU stats for device {device_id}: {e}")
+        return None, None
+
+# Detect the GPU type at startup
+try:
+    GPU_TYPE = get_gpu_type()
+    if GPU_TYPE:
+        cuda_status = is_cuda()
+        if cuda_status is not None:
+            print(f"Detected {GPU_TYPE.upper()} GPU environment (triton backend: {'cuda' if cuda_status else 'hip'})")
+        else:
+            print(f"Detected {GPU_TYPE.upper()} GPU environment")
+    else:
+        print("No supported GPU environment detected")
+except Exception:
+    GPU_TYPE = None
+    print("Failed to detect GPU environment")
+
+def check_gpu_occupied(device_id, util_threshold=20, memory_threshold=2048):
+    """Check whether a GPU is occupied by other processes.
+
+    Args:
+        device_id: GPU device ID
+        util_threshold: GPU utilization threshold (default 20%)
+        memory_threshold: memory usage threshold (default 2048 MB = 2 GB)
+
+    Returns:
+        bool: True if the GPU is occupied, False if it is idle
+    """
+    gpu_util, memory_used = get_gpu_stats(device_id)
+
+    if gpu_util is None or memory_used is None:
+        # Be conservative when stats are unavailable
+        return True
+
+    # Decide whether the GPU is occupied
+    is_occupied = gpu_util > util_threshold or (memory_threshold > 0 and memory_used > memory_threshold)
+
+    if is_occupied:
+        print(f"GPU {device_id}: Util={gpu_util}%, Memory={memory_used}MB - Occupied")
+
+    return is_occupied
+
+def check_all_gpus(num_gpus, util_threshold=20, memory_threshold=-1):
+    """Check whether any GPU is occupied."""
+    for device_id in range(num_gpus):
+        if check_gpu_occupied(device_id, util_threshold, memory_threshold):
+            return True, device_id
+    return False, -1
+
+def get_all_gpu_status(num_gpus):
+    """Print status information for all GPUs."""
+    print("\nGPU Status:")
+    print("-" * 50)
+    for device_id in range(num_gpus):
+        gpu_util, memory_used = get_gpu_stats(device_id)
+        if gpu_util is not None and memory_used is not None:
+            status = "Available" if (gpu_util <= 20 and memory_used <= 2048) else "Occupied"
+            print(f"GPU {device_id}: Util={gpu_util:3d}%, Memory={memory_used:5d}MB - {status}")
+        else:
+            print(f"GPU {device_id}: Unable to get stats")
+    print("-" * 50)
+
+def matrix_multiply_worker(matrix_size=8192, time_duration=4.0, sleep_duration=1.0, util_threshold=20, memory_threshold=-1):
+    # Get the number of GPUs
+    num_gpus = torch.cuda.device_count()
+    if num_gpus == 0:
+        print("No GPUs available!")
+        return
+
+    matrices = {}
+    # print(f"Creating {matrix_size}x{matrix_size} matrices on all GPUs...")
+    for device_id in range(num_gpus):
+        device = torch.device(f'cuda:{device_id}')
+        matrices[device_id] = {
+            'a': torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32),
+            'b': torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32)
+        }
+
+    # Main loop
+    while True:
+        try:
+            # Check whether any GPU is occupied
+            has_occupied_gpu, occupied_gpu = check_all_gpus(num_gpus, util_threshold, memory_threshold)
+            if has_occupied_gpu:
+                break
+
+            start_time = time.time()
+            perform_count = 0
+            while True:
+                # Run matrix multiplications on all GPUs at once
+                results = {}
+                for device_id in range(num_gpus):
+                    results[device_id] = torch.matmul(matrices[device_id]['a'], matrices[device_id]['b'])
+
+                perform_count += 1
+
+                if perform_count % 10 == 0:
+                    for device_id in range(num_gpus):
+                        torch.cuda.synchronize(device_id)
+
+                torch.cuda.synchronize()  # make sure all GPU work has finished
+                elapsed_time = time.time() - start_time
+                if elapsed_time > time_duration:
+                    break
+
+            # Clean up references before sleeping
+            del results
+
+            time.sleep(sleep_duration)
+
+        except KeyboardInterrupt:
+            print("\nKeyboardInterrupt received, stopping...")
+            sys.exit(0)
+        except Exception as e:
+            print(f"\nError occurred: {e}")
+            # Try to free memory
+            try:
+                for device_id in range(num_gpus):
+                    torch.cuda.set_device(device_id)
+                    torch.cuda.empty_cache()
+            except Exception:
+                pass
+            time.sleep(5)
+
+def sleep_main(args):
+    # Set the multiprocessing start method
+    mp.set_start_method('spawn', force=True)
+
+    num_gpus = torch.cuda.device_count()
+    if num_gpus == 0:
+        print("No GPUs available!")
+        sys.exit(1)
+
+    # Show the initial GPU status
+    get_all_gpu_status(num_gpus)
+
+    current_process = None
+
+    # Main loop
+    while True:
+        try:
+            # Check whether any GPU is occupied
+            has_occupied_gpu, occupied_gpu = check_all_gpus(num_gpus, util_threshold=args.util_threshold, memory_threshold=args.memory_threshold)
+
+            if has_occupied_gpu:
+                # Back off for 60 seconds
+                print("Holding for 60 seconds...")
+                time.sleep(60)
+
+            else:
+                # GPUs are idle; start the matrix-multiplication worker
+                current_process = mp.Process(
+                    target=matrix_multiply_worker,
+                    args=(args.matrix_size, args.time_duration, args.sleep_duration, args.util_threshold, args.memory_threshold),
+                )
+                current_process.start()
+                current_process.join()
+
+        except KeyboardInterrupt:
+            print("\nKeyboardInterrupt received, stopping...")
+            break
+        except Exception as e:
+            print(f"\nError occurred: {e}")
+            time.sleep(5)
+
+    print("\nProgram stopped")
+
+
+def add_sleep_args(subparsers):
+    sleep_parser = subparsers.add_parser('sleep', help='Sleep for a while and check GPU status')
+    add_args(sleep_parser)
+
+
+def add_args(parser):
+    parser.add_argument('--matrix_size', type=int, default=8192, help='Size of the matrices to multiply')
+    parser.add_argument('--time_duration', type=float, default=4.0, help='Duration to perform matrix multiplication')
+    parser.add_argument('--sleep_duration', type=float, default=1.0, help='Duration to sleep between checks')
+
+    parser.add_argument('--util_threshold', type=int, default=20, help='GPU utilization threshold (percent) above which a GPU counts as occupied')
+    parser.add_argument('--memory_threshold', type=int, default=-1, help='Memory usage threshold (in MB) above which a GPU counts as occupied; set to -1 to disable')
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description='Sleep and check GPU status')
+    add_args(parser)
+    args = parser.parse_args()
+    while True:
+        try:
+            sleep_main(args)
+        except KeyboardInterrupt:
+            print("\nKeyboardInterrupt received, exiting...")
+            sys.exit(0)
+        except Exception as e:
+            print(f"\nUnexpected error: {e}")
+            print("Restarting the program in 5 seconds...")
+            time.sleep(5)
+            continue
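Taken together, the new sleep command keeps otherwise-idle GPUs busy with float32 matrix multiplications and backs off as soon as any GPU crosses the utilization or memory thresholds. A sketch of invoking it programmatically, assuming the wheel's addftool entry point exposes the subcommand; the argument names come from add_args above, and the values are illustrative:

    # Equivalent to: addftool sleep --matrix_size 4096 --memory_threshold -1
    from argparse import Namespace
    from addftool.sleep import sleep_main

    args = Namespace(
        matrix_size=4096,     # smaller than the 8192 default, for a quick test
        time_duration=4.0,    # seconds of matmul per burst
        sleep_duration=1.0,   # pause between bursts
        util_threshold=20,    # a GPU above 20% utilization counts as occupied
        memory_threshold=-1,  # -1 disables the memory check
    )
    sleep_main(args)          # loops until interrupted (Ctrl+C)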
{addftool-0.2.7.dist-info → addftool-0.2.9.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.1
 Name: addftool
-Version: 0.2.7
+Version: 0.2.9
 Requires-Dist: cryptography
 Requires-Dist: requests
 Requires-Dist: PyYAML
@@ -8,3 +8,4 @@ Requires-Dist: psutil
 Requires-Dist: fabric
 Requires-Dist: gevent
 Requires-Dist: parallel-ssh
+
{addftool-0.2.7.dist-info → addftool-0.2.9.dist-info}/RECORD RENAMED
@@ -1,7 +1,8 @@
 addftool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-addftool/addf_portal.py,sha256=6XjwGs5m2mRVDWVvCPOiqn1NlxDcGBTQ9Kr_0g5RsJc,1130
-addftool/blob.py,sha256=y1HZaDBUNeXicVytvwpRXwufvvrgxR33ruBlYpxnSa4,9453
+addftool/addf_portal.py,sha256=vc8opPzValNFPwJne5C5LbZvgcJ0eMBJSWDSiM23OPM,1274
+addftool/blob.py,sha256=vyjJHlQZuGrpEiaRF-Bdmow_TMnhXtXEGL31qA5Hb-g,9742
 addftool/broadcast_folder.py,sha256=GQBuSL8Ch537V_fSBHesWyqT3KRYry68pbYOKy2bDj4,19619
+addftool/sleep.py,sha256=Y6gAJb2Ho4qvbd52-UBmwt7Rgv2HpJAcBnWO23asaY8,11787
 addftool/sync.py,sha256=ZpYxbM8uiPFrV7ODmOaM7asVPCWaxBixA-arVc-1kfs,14045
 addftool/tool.py,sha256=FmxRY3-pP0_Z0zCUAngjmEMmPUruMftg_iUlB1t2TnQ,2001
 addftool/util.py,sha256=zlNLu8Be8cGIpNRqBw8_0q7nFxWlsJ9cToN62ohjdXE,2335
@@ -12,8 +13,8 @@ addftool/deploy/vscode_server.py,sha256=tLtSvlcK2fEOaw6udWt8dNELVhwv9F59hF5DJJ-1
 addftool/process/__init__.py,sha256=Dze8OrcyjQlAbPrjE_h8bMi8W4b3OJyZOjTucPrkJvM,3721
 addftool/process/utils.py,sha256=JldxnwanLJOgxaPgmCJh7SeBRaaj5rFxWWxh1hpsvbA,2609
 addftool/ssh/__init__.py,sha256=h5_rCO0A6q2Yw9vFguQZZp_ApAJsT1dcnKnbKKZ0cDM,4409
-addftool-0.2.7.dist-info/METADATA,sha256=7wElFYgZp3OX387bAsiQTwzfrc4pHH8dlK6vPnDULWU,220
-addftool-0.2.7.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
-addftool-0.2.7.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
-addftool-0.2.7.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
-addftool-0.2.7.dist-info/RECORD,,
+addftool-0.2.9.dist-info/METADATA,sha256=lWVPrPjbHKv10pKm1w4HPdccUy_122llzDR1WQ6c2mg,221
+addftool-0.2.9.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+addftool-0.2.9.dist-info/entry_points.txt,sha256=9lkmuWMInwUAtev8w8poNkNd7iML9Bjd5CBCFVxg2b8,111
+addftool-0.2.9.dist-info/top_level.txt,sha256=jqj56-plrBbyzY0tIxB6wPzjAA8kte4hUlajyyQygN4,9
+addftool-0.2.9.dist-info/RECORD,,
{addftool-0.2.7.dist-info → addftool-0.2.9.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.0.0)
+Generator: setuptools (75.3.2)
 Root-Is-Purelib: true
 Tag: py3-none-any