eqcctpro-0.6.3-py3-none-any.whl → eqcctpro-0.6.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eqcctpro/__init__.py +14 -2
- eqcctpro/functionality.py +1155 -0
- eqcctpro/parallelization.py +915 -0
- eqcctpro/tf_eqcct_models.py +407 -0
- eqcctpro/tools.py +871 -0
- {eqcctpro-0.6.3.dist-info → eqcctpro-0.6.5.dist-info}/METADATA +37 -36
- eqcctpro-0.6.5.dist-info/RECORD +9 -0
- eqcctpro-0.6.3.dist-info/RECORD +0 -5
- {eqcctpro-0.6.3.dist-info → eqcctpro-0.6.5.dist-info}/WHEEL +0 -0
- {eqcctpro-0.6.3.dist-info → eqcctpro-0.6.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1155 @@
"""
functionality.py controls all the functionality of EQCCTPro, specifically how we access mseed_predictor() and parallel_predict.
It is a level of abstraction so we can make the code more concise and cleaner.
"""
import os
import gc
import ray
import sys
import ast
import math
import time      # needed for time.sleep() in the RAM-cleanup steps below
import queue
import psutil
import random
import numbers
import logging
import threading
import numpy as np   # needed for np.arange() in evaluate_gpu()
import pandas as pd  # needed for pd.read_csv()/pd.to_numeric() below
from .tools import *
from pathlib import Path
from .parallelization import *
from obspy import UTCDateTime
from ray.util.queue import Queue
from datetime import datetime, timedelta
from .tools import _parse_gpus_field
from logging.handlers import QueueHandler, QueueListener

try:
    import resource  # Unix-only; the peak-memory helpers below fall back when it is unavailable
except ImportError:  # Windows has no 'resource' module
    resource = None

class RunEQCCTPro():
    """RunEQCCTPro class for running the RunEQCCTPro functions for multiple instances of the class"""
    def __init__(self,  # self is 'this instance' of the class
                 use_gpu: bool,
                 input_dir: str,
                 output_dir: str,
                 log_filepath: str,
                 p_model_filepath: str,
                 s_model_filepath: str,
                 number_of_concurrent_station_predictions: int,
                 number_of_concurrent_timechunk_predictions: int,
                 intra_threads: int = 1,
                 inter_threads: int = 1,
                 P_threshold: float = 0.001,
                 S_threshold: float = 0.02,
                 specific_stations: str = None,
                 csv_dir: str = None,
                 best_usecase_config: bool = None,
                 vram_mb: float = None,
                 selected_gpus: list = None,
                 cpu_id_list: list = [1],
                 start_time: str = None,
                 end_time: str = None,
                 timechunk_dt: int = None,
                 waveform_overlap: int = None):

        self.use_gpu = use_gpu  # 'this instance' of the class's attribute: use_gpu = use_gpu
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.log_filepath = log_filepath
        self.p_model_filepath = p_model_filepath
        self.s_model_filepath = s_model_filepath
        self.number_of_concurrent_station_predictions = number_of_concurrent_station_predictions
        self.number_of_concurrent_timechunk_predictions = number_of_concurrent_timechunk_predictions
        self.intra_threads = intra_threads
        self.inter_threads = inter_threads
        self.P_threshold = P_threshold
        self.S_threshold = S_threshold
        self.specific_stations = specific_stations
        self.csv_dir = csv_dir
        self.best_usecase_config = best_usecase_config
        self.vram_mb = vram_mb
        self.selected_gpus = selected_gpus  # a list of the GPU IDs
        self.cpu_id_list = cpu_id_list
        self.cpu_count = len(cpu_id_list)
        self.start_time = start_time
        self.end_time = end_time
        self.timechunk_dt = timechunk_dt
        self.waveform_overlap = waveform_overlap

        # Ensure that the output_dir exists. If it doesn't, we create it
        os.makedirs(self.output_dir, exist_ok=True)

        # Set up the main logger and a logger queue to retrieve queued logs from Raylets to be passed to the main logger
        self.logger = logging.getLogger("eqcctpro")  # We named the logger eqcctpro (can be any name)
        self.logger.setLevel(logging.INFO)
        self.logger.propagate = False  # if True, events logged here would also be passed to the handlers of higher-level (ancestor) loggers
        if not self.logger.handlers:  # avoid duplicating handlers
            fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            file_h = logging.FileHandler(self.log_filepath)  # Writes logs to file
            stream_h = logging.StreamHandler()  # Sends logs to console
            file_h.setFormatter(fmt)
            stream_h.setFormatter(fmt)
            self.logger.addHandler(file_h)
            self.logger.addHandler(stream_h)

        self.logger.info("")
        self.logger.info(f"------- Welcome to EQCCTPro -------")
        self.logger.info("")

        # If the user requested a GPU but passed no valid VRAM value, we need to exit
        if self.use_gpu and not (isinstance(self.vram_mb, numbers.Real) and math.isfinite(self.vram_mb) and self.vram_mb > 0):
            self.logger.error(f"No numerical VRAM passed. Please provide vram_mb (MB per Raylet per GPU) as a positive real number. Exiting...")
            sys.exit(1)

        # We need to ensure that the specified VRAM does not exceed the capabilities of the system; if it does, we need to exit safely before it happens
        if self.use_gpu:
            check_vram_per_gpu_style(
                selected_gpus=self.selected_gpus,
                get_gpu_vram_fn=lambda gid: get_gpu_vram(gpu_index=gid),
                intended_workers=self.number_of_concurrent_station_predictions * self.number_of_concurrent_timechunk_predictions,
                vram_mb=self.vram_mb,
                model_vram_mb=1500.0,  # your safety reserve for EQCCT
                safety_cap=0.95,
                eqcct_overhead_gb=0.0,
                logger=self.logger)

    # To-Do: merge dt_task_generator and chunk_time into one function and concatenate the objects so we don't have so much state floating around
    # Generates the dt tasks list
    def dt_task_generator(self):
        # Reformats the times_list values (see chunk_time()) into a format that mseed_predictor can use
        tasks = [[f"({i+1}/{len(self.times_list)})", f"{self.times_list[i][0].strftime(format='%Y%m%dT%H%M%SZ')}_{self.times_list[i][1].strftime(format='%Y%m%dT%H%M%SZ')}"] for i in range(len(self.times_list))]
        self.tasks_picker = tasks

    def chunk_time(self):
        # Creates the timechunks, i.e. from time X to time Y, used to generate the dt tasks (timechunk tasks that are run in parallel first at the top level)
        # e.g. [[UTCDateTime(2024, 12, 15, 11, 58), UTCDateTime(2024, 12, 15, 13, 0)], [UTCDateTime(2024, 12, 15, 12, 58), UTCDateTime(2024, 12, 15, 14, 0)]]
        starttime = UTCDateTime(self.start_time) - (self.waveform_overlap * 60)
        endtime = UTCDateTime(self.end_time)

        times_list = []
        start = starttime
        end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
        while start <= endtime:
            if end >= endtime:
                end = endtime
                times_list.append([start, end])
                break
            times_list.append([start, end])
            start = end - (self.waveform_overlap * 60)
            end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)

        self.times_list = times_list

    def _drain_worker_logs(self):
        while True:
            rec = self.log_queue.get()  # blocks until a record arrives
            if rec is None: break  # sentinel to stop the thread
            try:
                self.logger.handle(rec)  # routes to file+console handlers
            except Exception:
                # never crash on logging
                self.logger.exception("Failed to handle worker log record")

    def configure_cpu(self):
        # We need to configure the tf_environ for the CPU configuration that was passed in
        self.logger.info(f"Running EQCCT over Requested MSeed Files using CPU(s)...")
        if self.best_usecase_config:
            # We use the best usecase configuration that was found using EvaluateSystem
            result = find_optimal_configuration_cpu(best_overall_usecase=True, eval_sys_results_dir=self.csv_dir)
            if result is None:
                self.logger.info("")
                self.logger.info(f"Error: Could not retrieve an optimal CPU configuration. Please check that the CSV file exists and try again. Exiting...")
                exit()  # Exit gracefully
            cpus_to_use, num_concurrent_predictions, intra, inter, station_count = result
            self.logger.info("")
            self.logger.info(f"Using {cpus_to_use} CPUs, {num_concurrent_predictions} Conc. Predictions, {intra} Intra Threads, and {inter} Inter Threads...")
            tf_environ(gpu_id=-1, intra_threads=intra, inter_threads=inter, logger=self.logger)
        else:
            # We pass the requested parameters to the tf_environ
            tf_environ(gpu_id=-1, intra_threads=self.intra_threads, inter_threads=self.inter_threads, logger=self.logger)

    def configure_gpu(self):
        # We need to configure the tf_environ for the GPU configuration that was passed in
        self.logger.info(f"Running EQCCT over Requested MSeed Files using GPU(s)...")
        if self.best_usecase_config:
            result = find_optimal_configuration_gpu(True, self.csv_dir)
            if result is None:
                self.logger.info("")
                self.logger.error(f"Error: Could not retrieve an optimal GPU configuration. Please check that the CSV file exists and try again. Exiting...")
                exit()  # Exit gracefully

            self.logger.info("")
            cpus_to_use, num_concurrent_predictions, intra, inter, gpus, vram_mb, station_count = result  # Unpack values only if result is valid
            self.logger.info(f"Using {cpus_to_use} CPUs, {num_concurrent_predictions} Conc. Predictions, {intra} Intra Threads, {inter} Inter Threads, {gpus} GPU IDs, and {vram_mb} MB VRAM per Task...")
            tf_environ(gpu_id=1, vram_limit_mb=vram_mb, gpus_to_use=gpus, intra_threads=intra, inter_threads=inter, logger=self.logger)

        else:
            self.logger.info("")
            self.logger.info(f"User requested to use GPU(s): {self.selected_gpus} with {self.vram_mb} MB of VRAM per Raylet (intra-op threads = {self.intra_threads}, inter-op threads = {self.inter_threads})")  # Use the selected GPUs
            tf_environ(gpu_id=1, vram_limit_mb=self.vram_mb, gpus_to_use=self.selected_gpus, intra_threads=self.intra_threads, inter_threads=self.inter_threads, logger=self.logger)

    def eqcctpro_parallelization(self):

        if self.specific_stations is None:  # We check whether the station dirs are consistent; if not, exit
            statement, specific_stations_list, do_i_exit = check_station_dirs(input_dir=self.input_dir)
            self.logger.info(f"{statement}")
            if do_i_exit: exit()

        # Otherwise, we want to use a specified set of stations
        else: specific_stations_list = [station.strip() for station in self.specific_stations.split(',')]
        statement = f"Using {len(specific_stations_list)} selected station(s)."
        self.logger.info(f"{statement}")
        self.logger.info("")

        # Submit timechunk tasks to mseed_predictor
        tasks_queue = []
        log_queue = queue.Queue()  # Create a queue for log entries

        # Compute the total analysis timeframe
        total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")

        max_pending_tasks = self.number_of_concurrent_timechunk_predictions
        self.logger.info(f"------- Starting EQCCTPro... -------")
        self.logger.info(f"Detailed subprocess information can be found in the log file.")
        self.logger.info("")
        for i in range(len(self.tasks_picker)):
            mseed_timechunk_dir_name = self.tasks_picker[i][1]
            timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)

            # Concurrent timechunks
            while True:
                if len(tasks_queue) < max_pending_tasks:
                    tasks_queue.append(mseed_predictor.options(num_gpus=0, num_cpus=1).remote(
                        input_dir=timechunk_dir_path, output_dir=self.output_dir, log_queue=self.log_queue,
                        P_threshold=self.P_threshold, S_threshold=self.S_threshold, p_model=self.p_model_filepath, s_model=self.s_model_filepath,
                        number_of_concurrent_station_predictions=self.number_of_concurrent_station_predictions, ray_cpus=self.cpu_id_list, use_gpu=self.use_gpu,
                        gpu_id=self.selected_gpus, gpu_memory_limit_mb=self.vram_mb, specific_stations=specific_stations_list,
                        timechunk_id=mseed_timechunk_dir_name, waveform_overlap=self.waveform_overlap, total_timechunks=len(self.tasks_picker),
                        number_of_concurrent_timechunk_predictions=self.number_of_concurrent_timechunk_predictions, total_analysis_time=total_analysis_time,
                        intra_threads=self.intra_threads, inter_threads=self.inter_threads))
                    break

                else:  # If there are already the maximum number of pending tasks, process some before submitting more
                    tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
                    for finished_task in tasks_finished:
                        log_entry = ray.get(finished_task)
                        log_queue.put(log_entry)  # Add log entry to the queue

        # After adding all the tasks to the queue, process what's left
        while tasks_queue:
            tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
            for finished_task in tasks_finished:
                log_entry = ray.get(finished_task)
                self.logger.info(log_entry)

        # Stop the log forwarder
        self.log_queue.put(None)  # remember, log_queue is a Ray Queue actor and only exists while Ray is still active (cannot be used after the .shutdown())
        self._log_thread.join(timeout=2)

        ray.shutdown()
        self.logger.info(f"Ray Successfully Shutdown.")
        self.logger.info("------- Successfully Picked All Waveform(s) from all Timechunk(s) -------")
        # self.logger.info("------- END OF FILE -------")

    def run_eqcctpro(self):
        # Set CPU affinity
        process = psutil.Process(os.getpid())
        process.cpu_affinity(self.cpu_id_list)  # Limit the process to the given CPU IDs

        self.chunk_time()  # Generates the UTC times for each of the timesets in the given time range
        self.dt_task_generator()  # Generates the task list so we know how many total tasks there are for the given time range

        if self.use_gpu:  # GPU
            self.configure_gpu()
            ray.init(ignore_reinit_error=True, num_gpus=len(self.selected_gpus), num_cpus=len(self.cpu_id_list), logging_level=logging.ERROR, log_to_driver=False)  # Ray initialization using GPUs
            self.log_queue = Queue()  # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
            self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True)  # Background thread whose only job is to get() records from self.log_queue and hand them to the actual logger
            self._log_thread.start()  # Starts the thread
            # Log some important info for the user
            statement = f"Ray Successfully Initialized with {self.selected_gpus} GPU(s) and {len(self.cpu_id_list)} CPU(s)."
            self.logger.info(f"{statement}")
            self.logger.info(f"Analyzing {len(self.times_list)} time chunk(s) from {self.start_time} to {self.end_time} (dt={self.timechunk_dt}min, overlap={self.waveform_overlap}min).")

            # Run the parallelization
            self.eqcctpro_parallelization()

        else:  # CPU
            self.configure_cpu()
            ray.init(ignore_reinit_error=True, num_cpus=len(self.cpu_id_list), logging_level=logging.ERROR, log_to_driver=False)  # Ray initialization using CPUs
            self.log_queue = Queue()  # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
            self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True)  # Background thread whose only job is to get() records from self.log_queue and hand them to the actual logger
            self._log_thread.start()  # Starts the thread
            # Log some important info for the user
            statement = f"Ray Successfully Initialized with {len(self.cpu_id_list)} CPU(s)."
            self.logger.info(f"{statement}")
            self.logger.info(f"Analyzing {len(self.times_list)} time chunk(s) from {self.start_time} to {self.end_time} (dt={self.timechunk_dt}min, overlap={self.waveform_overlap}min).")

            # Run the parallelization
            self.eqcctpro_parallelization()

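# Illustrative sketch (not part of the packaged API beyond the class defined above):
# one way RunEQCCTPro might be driven for a CPU-only run. All paths, the time window,
# and the CPU ID list below are assumed placeholder values.
def _example_cpu_run():
    runner = RunEQCCTPro(
        use_gpu=False,
        input_dir="/data/mseed",                    # assumed location of per-timechunk mseed folders
        output_dir="/data/eqcct_out",
        log_filepath="/data/eqcct_out/eqcctpro.log",
        p_model_filepath="/models/eqcct_p.h5",      # assumed model paths
        s_model_filepath="/models/eqcct_s.h5",
        number_of_concurrent_station_predictions=4,
        number_of_concurrent_timechunk_predictions=2,
        cpu_id_list=[0, 1, 2, 3],
        start_time="2024-12-15 12:00:00",
        end_time="2024-12-15 14:00:00",
        timechunk_dt=60,        # 60-minute chunks
        waveform_overlap=2)     # 2-minute overlap between consecutive chunks
    runner.run_eqcctpro()       # chunks the time range, starts Ray, and picks every timechunk
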
class EvaluateSystem():
    """EvaluateSystem class for running the evaluation system functions for multiple instances of the class"""
    def __init__(self,
                 eval_mode: str,
                 input_dir: str,
                 output_dir: str,
                 log_filepath: str,
                 csv_dir: str,
                 p_model_filepath: str,
                 s_model_filepath: str,
                 P_threshold: float = 0.001,
                 S_threshold: float = 0.02,
                 intra_threads: int = 1,
                 inter_threads: int = 1,
                 stations2use: int = None,
                 cpu_id_list: list = [1],
                 cpu_test_step_size: int = 1,
                 starting_amount_of_stations: int = 1,
                 station_list_step_size: int = 1,
                 min_cpu_amount: int = 1,
                 min_conc_stations: int = 1,
                 conc_station_tasks_step_size: int = 1,
                 vram_mb: float = None,
                 selected_gpus: list = None,
                 start_time: str = None,
                 end_time: str = None,
                 conc_timechunk_tasks_step_size: int = 1,
                 timechunk_dt: int = None,
                 waveform_overlap: int = None,
                 tmp_dir: str = None):

        valid_modes = {"cpu", "gpu"}
        if eval_mode.lower() not in valid_modes:
            raise ValueError(f"Invalid mode '{eval_mode}'. Choose either 'cpu' or 'gpu'.")

        self.eval_mode = eval_mode.lower()
        self.intra_threads = intra_threads
        self.inter_threads = inter_threads
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.log_filepath = log_filepath
        self.csv_dir = csv_dir
        self.P_threshold = P_threshold
        self.S_threshold = S_threshold
        self.p_model_filepath = p_model_filepath
        self.s_model_filepath = s_model_filepath
        self.stations2use = stations2use
        self.cpu_id_list = cpu_id_list
        self.vram_mb = vram_mb
        self.selected_gpus = selected_gpus
        self.cpu_count = len(cpu_id_list)
        self.cpu_test_step_size = cpu_test_step_size
        self.starting_amount_of_stations = starting_amount_of_stations
        self.station_list_step_size = station_list_step_size
        self.min_cpu_amount = min_cpu_amount
        self.min_conc_stations = min_conc_stations  # default is 1
        self.conc_station_tasks_step_size = conc_station_tasks_step_size  # default is 1
        self.stations2use_list = list(range(1, 11)) + list(range(15, 50, 5)) if stations2use is None else generate_station_list(self.starting_amount_of_stations, stations2use, self.station_list_step_size)
        self.start_time = start_time
        self.end_time = end_time
        self.conc_timechunk_tasks_step_size = conc_timechunk_tasks_step_size
        self.timechunk_dt = timechunk_dt
        self.waveform_overlap = waveform_overlap
        self.home_tmp_dir = tmp_dir

        # Ensure that the output_dir exists. If it doesn't, we create it
        os.makedirs(self.output_dir, exist_ok=True)

        # Set up the main logger and a logger queue to retrieve queued logs from Raylets to be passed to the main logger
        self.logger = logging.getLogger("eqcctpro")  # We named the logger eqcctpro (can be any name)
        self.logger.setLevel(logging.INFO)
        self.logger.propagate = False  # if True, events logged here would also be passed to the handlers of higher-level (ancestor) loggers
        if not self.logger.handlers:  # avoid duplicating handlers
            fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            file_h = logging.FileHandler(self.log_filepath)  # Writes logs to file
            stream_h = logging.StreamHandler()  # Sends logs to console
            file_h.setFormatter(fmt)
            stream_h.setFormatter(fmt)
            self.logger.addHandler(file_h)
            self.logger.addHandler(stream_h)

        self.logger.info("")
        self.logger.info(f"------- Welcome to EQCCTPro's EvaluateSystem Functionality -------")
        self.logger.info("")
        # Set up the temp dir
        import tempfile
        tempfile.tempdir = self.home_tmp_dir

        os.environ['TMPDIR'] = self.home_tmp_dir
        os.environ['TEMP'] = self.home_tmp_dir
        os.environ['TMP'] = self.home_tmp_dir
        self.logger.info(f"Successfully set up temp files to be stored at {self.home_tmp_dir}")

        # We need to ensure that the specified VRAM does not exceed the capabilities of the system; if it does, we need to exit safely before it happens
        self.chunk_time()
        intended_workers = int(len(self.stations2use_list)) * int(len(self.times_list) // 2)
        if self.eval_mode == 'gpu':
            if not self.selected_gpus:
                raise ValueError("selected_gpus must be set in GPU mode.")
            self.chunk_time()
            intended_workers = int(len(self.stations2use_list)) * int(len(self.times_list) // 2)

            per_gpu_free_mb = [get_gpu_vram(gpu_index=g)[1] * 1024.0 for g in self.selected_gpus]  # free_gb -> MB
            plan = evaluate_vram_capacity(
                intended_workers=intended_workers,
                vram_per_worker_mb=float(self.vram_mb),
                per_gpu_free_mb=per_gpu_free_mb,
                model_vram_mb=3000.0,
                safety_cap=0.90,
                eqcct_overhead_gb=1.1,
            )
            if not plan.ok_aggregate:
                unit = plan.per_worker_mb + plan.overhead_mb
                raise RuntimeError(
                    f"Insufficient aggregate VRAM. Cap={plan.aggregate_cap_mb:.0f} MB, "
                    f"Need={plan.aggregate_need_mb:.0f} MB (= {plan.model_vram_mb:.0f}×{len(self.selected_gpus)} + "
                    f"{plan.intended_workers}×{unit:.0f})."
                )
            self.logger.info(
                f"VRAM budget OK. Need {plan.aggregate_need_mb:.0f} MB ≤ Cap {plan.aggregate_cap_mb:.0f} MB "
                f"across {len(self.selected_gpus)} GPU(s)."
            )

    def _generate_stations_list(self):
        """Generates the station list"""
        if self.stations2use is None:
            return list(range(1, 11)) + list(range(15, 50, 5))
        return generate_station_list(self.stations2use, self.starting_amount_of_stations, self.station_list_step_size)

    # def _prepare_environment(self):
    #     """Removes 'output_dir' so that there are no conflicts in the save, for a clean output return"""
    #     remove_directory(self.output_dir)

    def chunk_time(self):
        starttime = UTCDateTime(self.start_time) - (self.waveform_overlap * 60)
        endtime = UTCDateTime(self.end_time)

        times_list = []
        start = starttime
        end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
        while start <= endtime:
            if end >= endtime:
                end = endtime
                times_list.append([start, end])
                break
            times_list.append([start, end])
            start = end - (self.waveform_overlap * 60)
            end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)

        self.times_list = times_list

    def _drain_worker_logs(self):
        while True:
            rec = self.log_queue.get()  # blocks until a record arrives
            if rec is None: break  # sentinel to stop the thread
            try:
                self.logger.handle(rec)  # routes to file+console handlers
            except Exception:
                # never crash on logging
                self.logger.exception("Failed to handle worker log record")

    def dt_task_generator(self):
        tasks = [[f"({i+1}/{len(self.times_list)})", f"{self.times_list[i][0].strftime(format='%Y%m%dT%H%M%SZ')}_{self.times_list[i][1].strftime(format='%Y%m%dT%H%M%SZ')}"] for i in range(len(self.times_list))]
        self.tasks_picker = tasks

    def evaluate_cpu(self):
        """Evaluate system parallelization using CPUs"""
        statement = "Evaluating System Parallelization Capability using CPU"
        self.logger.info(f"{statement}")

        os.makedirs(self.csv_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)

        # Create the test-results CSV
        csv_filepath = f"{self.csv_dir}/cpu_test_results.csv"
        prepare_csv(csv_file_path=csv_filepath, logger=self.logger)

        self.chunk_time()
        self.dt_task_generator()

        trial_num = 1
        log_queue = queue.Queue()  # Create a queue for log entries
        total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")

        if self.eval_mode == 'gpu':
            use_gpu = True
        else:
            use_gpu = False

        if self.min_cpu_amount > len(self.cpu_id_list):
            # The code cannot proceed because the minimum CPU amount exceeds len(cpu_id_list),
            # and the rest of the code depends on that length for generating cpu_count
            print(f"CPU ID List provided has less CPUs than the minimum requested ({len(self.cpu_id_list)} vs. {self.min_cpu_amount}). Exiting...")
            quit()

        with open(self.log_filepath, mode="a+", buffering=1) as log:
            for i in range(self.min_cpu_amount, self.cpu_count+1, self.cpu_test_step_size):
                # Set CPU affinity and initialize Ray
                cpus_to_use = self.cpu_id_list[:i]
                process = psutil.Process(os.getpid())
                process.cpu_affinity(cpus_to_use)  # Limit the process to the given CPU IDs

                ray.init(ignore_reinit_error=True, num_cpus=len(cpus_to_use), logging_level=logging.FATAL, log_to_driver=False)
                self.log_queue = Queue()  # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
                self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True)  # Background thread whose only job is to get() records from self.log_queue and hand them to the actual logger
                self._log_thread.start()  # Starts the thread
                self.logger.info(f"Ray Successfully Initialized with {len(cpus_to_use)} CPU(s).")

                timechunks_list = []
                timechunk = 1
                step = self.conc_timechunk_tasks_step_size  # Use the class attribute
                while timechunk <= len(self.tasks_picker):
                    timechunks_list.append(timechunk)
                    if timechunk == 1:
                        timechunk += 1
                    else:
                        timechunk += step

                if len(self.tasks_picker) not in timechunks_list:
                    timechunks_list.append(len(self.tasks_picker))
                # sets store multiple items in a single variable;
                # they are unordered, cannot contain duplicates, and their items cannot be changed once set
                timechunks_list = sorted(list(set(timechunks_list)))
                for timechunks in timechunks_list:
                    tested_concurrency = set()  # Reset for each CPU / timechunk count
                    for num_stations in self.stations2use_list:
                        concurrent_predictions_list = generate_station_list(self.min_conc_stations, num_stations, self.conc_station_tasks_step_size)
                        # We do this so that we don't repeat concurrent prediction tests.
                        # Because the number of concurrent predictions is capped by the number of total stations to be processed,
                        # there is no need to duplicate tests that would do the same amount of concurrent work for a different number of total stations.
                        new_concurrent_values = [x for x in concurrent_predictions_list if x not in tested_concurrency and x <= num_stations]
                        if not new_concurrent_values:
                            continue  # All concurrency values already tested
                        for num_concurrent_predictions in new_concurrent_values:
                            mseed_timechunk_dir_name = self.tasks_picker[timechunks-1][1]
                            timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
                            max_pending_tasks = timechunks

                            self.logger.info("")
                            self.logger.info(f"------- Trial Number: {trial_num} -------")
                            self.logger.info(f"CPU(s): {i}")
                            self.logger.info(f"Conc. Timechunks Being Analyzed: {timechunks} / Total Timechunks to be Analyzed: {len(self.tasks_picker)}")
                            self.logger.info(f"Total Amount of Stations to be Processed in Current Trial: {num_stations} / Number of Stations Being Processed Concurrently: {num_concurrent_predictions} / Total Overall Trial Station Count: {max(self.stations2use_list)}")

                            # Concurrent timechunks
                            tasks_queue = []
                            log_queue = queue.Queue()  # Create a queue for log entries

                            # ===== RAM baseline (before launching workers) =====
                            _rss = process.memory_info().rss
                            for _ch in process.children(recursive=True):
                                try:
                                    _rss += _ch.memory_info().rss
                                except (psutil.NoSuchProcess, psutil.AccessDenied):
                                    pass
                            mem_before_total_mb = _rss / 1e6

                            # peak before (platform-aware)
                            if resource is not None:  # Linux/macOS
                                _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                                if sys.platform.startswith("linux"):
                                    peak_before_mb = _ru / 1024.0  # ru_maxrss is in KB on Linux
                                elif sys.platform == "darwin":
                                    peak_before_mb = _ru / (1024.0 * 1024.0)  # ru_maxrss is in bytes on macOS
                                else:
                                    peak_before_mb = mem_before_total_mb  # safe fallback
                            else:  # Windows: no 'resource'
                                try:
                                    peak_before_mb = process.memory_full_info().peak_wset / 1e6
                                except Exception:
                                    peak_before_mb = mem_before_total_mb

                            try:
                                while True:
                                    if len(tasks_queue) < max_pending_tasks:
                                        tasks_queue.append(mseed_predictor.options(num_gpus=0, num_cpus=1).remote(
                                            input_dir=timechunk_dir_path, output_dir=self.output_dir, log_queue=self.log_queue,
                                            P_threshold=self.P_threshold, S_threshold=self.S_threshold, p_model=self.p_model_filepath, s_model=self.s_model_filepath,
                                            number_of_concurrent_station_predictions=num_concurrent_predictions, ray_cpus=cpus_to_use, use_gpu=use_gpu,
                                            gpu_id=self.selected_gpus, gpu_memory_limit_mb=self.vram_mb, stations2use=num_stations,
                                            timechunk_id=mseed_timechunk_dir_name, waveform_overlap=self.waveform_overlap, total_timechunks=len(self.tasks_picker),
                                            number_of_concurrent_timechunk_predictions=max_pending_tasks, total_analysis_time=total_analysis_time, testing_gpu=False,
                                            test_csv_filepath=csv_filepath, intra_threads=self.intra_threads, inter_threads=self.inter_threads, timechunk_dt=self.timechunk_dt))

                                        break

                                    else:
                                        tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
                                        for finished_task in tasks_finished:
                                            log_entry = ray.get(finished_task)
                                            log_queue.put(log_entry)  # Add log entry to the queue

                                # After adding all the tasks to the queue, process what's left
                                while tasks_queue:
                                    tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
                                    for finished_task in tasks_finished:
                                        log_entry = ray.get(finished_task)
                                        log_queue.put(log_entry)  # Add log entry to the queue

                                update_csv(csv_filepath, success=1, error_message="")
                            except Exception as e:
                                # A failure occurred; record it in the results CSV and the log
                                error_msg = f"{type(e).__name__}: {str(e)}"
                                update_csv(csv_filepath, success=0, error_message=error_msg)
                                self.logger.error(f"Trial {trial_num} FAILED: {error_msg}")

                            # Drain the log entries collected during this trial
                            while not log_queue.empty():
                                log_entry = log_queue.get()

                            remove_output_subdirs(self.output_dir, logger=self.logger)
                            trial_num += 1

                            # RAM cleanup
                            # ===== AFTER RUN (before cleanup) =====
                            _rss = process.memory_info().rss
                            for _ch in process.children(recursive=True):
                                try:
                                    _rss += _ch.memory_info().rss
                                except (psutil.NoSuchProcess, psutil.AccessDenied):
                                    pass
                            mem_after_run_total_mb = _rss / 1e6
                            delta_run_mb = mem_after_run_total_mb - mem_before_total_mb

                            # updated peak (platform-aware)
                            if resource is not None:
                                _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                                if sys.platform.startswith("linux"):
                                    peak_after_mb = _ru / 1024.0
                                elif sys.platform == "darwin":
                                    peak_after_mb = _ru / (1024.0 * 1024.0)
                                else:
                                    peak_after_mb = mem_after_run_total_mb
                            else:
                                try:
                                    peak_after_mb = process.memory_full_info().peak_wset / 1e6
                                except Exception:
                                    peak_after_mb = mem_after_run_total_mb

                            self.logger.info("")
                            self.logger.info(
                                f"[MEM] Baseline: {mem_before_total_mb:.2f} MB | After run: {mem_after_run_total_mb:.2f} MB "
                                f"| Δrun: {delta_run_mb:.2f} MB | Peak≈{max(peak_before_mb, peak_after_mb):.2f} MB"
                            )

                            # ===== CLEANUP =====
                            # drop strong refs so GC matters
                            try: del ref
                            except NameError: pass
                            try: del log_entry
                            except NameError: pass

                            _rss = process.memory_info().rss
                            for _ch in process.children(recursive=True):
                                try:
                                    _rss += _ch.memory_info().rss
                                except (psutil.NoSuchProcess, psutil.AccessDenied):
                                    pass
                            mem_before_clean_mb = _rss / 1e6

                            gc.collect()
                            time.sleep(0.1)

                            _rss = process.memory_info().rss
                            for _ch in process.children(recursive=True):
                                try:
                                    _rss += _ch.memory_info().rss
                                except (psutil.NoSuchProcess, psutil.AccessDenied):
                                    pass
                            mem_after_clean_mb = _rss / 1e6

                            freed_mb = mem_before_clean_mb - mem_after_clean_mb
                            self.logger.info(f"[MEM] Freed ~{max(freed_mb, 0):.2f} MB; Post-clean total: {mem_after_clean_mb:.2f} MB")  # To-Do: improve the Freed estimate (for CPU and GPU)
                            self.logger.info("")

                            # tested_concurrency.update([x for x in concurrent_predictions_list if x <= num_stations])

                # Stop the log forwarder
                self.log_queue.put(None)  # remember, log_queue is a Ray Queue actor and only exists while Ray is still active (cannot be used after the .shutdown())
                self._log_thread.join(timeout=2)

                ray.shutdown()  # Shut down Ray after processing all timechunks for this CPU count
                self.logger.info(f"Ray Successfully Shutdown.")

        self.logger.info(f"Testing complete.")
        self.logger.info(f"")
        self.logger.info(f"Finding Optimal Configurations...")
        # Compute optimal configurations (CPU)
        df = pd.read_csv(csv_filepath)
        optimal_configuration_df, best_overall_usecase_df = find_optimal_configurations_cpu(df)
        optimal_configuration_df.to_csv(f"{self.csv_dir}/optimal_configurations_cpu.csv", index=False)
        best_overall_usecase_df.to_csv(f"{self.csv_dir}/best_overall_usecase_cpu.csv", index=False)
        self.logger.info(f"Optimal Configurations Found. Findings saved to:")
        self.logger.info(f" 1) Optimal CPU/Station/Concurrent Prediction Configurations: {self.csv_dir}/optimal_configurations_cpu.csv")
        self.logger.info(f" 2) Best Overall Usecase Configuration: {self.csv_dir}/best_overall_usecase_cpu.csv")

    def evaluate_gpu(self):
        """Evaluate system parallelization using GPUs"""
        statement = "Evaluating System Parallelization Capability using GPUs"
        self.logger.info(f"{statement}")

        # Set CPU affinity
        process = psutil.Process(os.getpid())
        process.cpu_affinity(self.cpu_id_list)  # Limit the process to the given CPU IDs

        os.makedirs(self.csv_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)

        # Calculate these at the start
        self.chunk_time()
        self.dt_task_generator()
        total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")

        # Create the test-results CSV
        csv_filepath = f"{self.csv_dir}/gpu_test_results.csv"
        prepare_csv(csv_file_path=csv_filepath, logger=self.logger)

        free_vram_mb = self.vram_mb if self.vram_mb else self.calculate_vram()
        self.selected_gpus = self.selected_gpus if self.selected_gpus else list_gpu_ids()
        self.logger.info(f"Using GPU(s): {self.selected_gpus}")

        trial_num = 1
        log_queue = queue.Queue()  # Create a queue for log entries

        # Initialize Ray with GPUs
        ray.init(ignore_reinit_error=True, num_gpus=len(self.selected_gpus), num_cpus=len(self.cpu_id_list),
                 logging_level=logging.FATAL, log_to_driver=False)
        self.log_queue = Queue()  # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
        self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True)  # Background thread whose only job is to get() records from self.log_queue and hand them to the actual logger
        self._log_thread.start()  # Starts the thread
        self.logger.info(f"Ray Successfully Initialized with {len(self.selected_gpus)} GPU(s) and {len(self.cpu_id_list)} CPU(s).")

        for stations in self.stations2use_list:
            concurrent_predictions_list = generate_station_list(self.min_conc_stations, stations, self.conc_station_tasks_step_size)
            for predictions in concurrent_predictions_list:
                vram_per_task_mb = free_vram_mb / predictions
                step_size = vram_per_task_mb * 0.05
                vram_steps = np.arange(step_size, vram_per_task_mb + step_size, step_size)
                self.logger.info(f"Testing the following VRAM limitations (MB): {vram_steps}")

                for gpu_memory_limit_mb in vram_steps:

                    self.logger.info("")
                    self.logger.info(f"------- Trial Number: {trial_num} -------")
                    self.logger.info(f"VRAM Limited to {gpu_memory_limit_mb:.2f} MB per Task")

                    # Get the first timechunk for testing
                    mseed_timechunk_dir_name = self.tasks_picker[0][1]
                    timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)

                    self.logger.info(f"Stations: {stations}")
                    self.logger.info(f"Concurrent Station Predictions: {predictions}")
                    self.logger.info(f"VRAM per Task: {gpu_memory_limit_mb:.2f} MB")
                    self.logger.info("")

                    # ===== Baseline RAM consumption (before launching the worker) =====
                    _rss = process.memory_info().rss
                    for _ch in process.children(recursive=True):
                        try:
                            _rss += _ch.memory_info().rss
                        except (psutil.NoSuchProcess, psutil.AccessDenied):
                            pass
                    mem_before_total_mb = _rss / 1e6

                    # peak before (platform-aware)
                    if resource is not None:  # Linux/macOS
                        _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                        if sys.platform.startswith("linux"):
                            peak_before_mb = _ru / 1024.0  # ru_maxrss is in KB on Linux
                        elif sys.platform == "darwin":
                            peak_before_mb = _ru / (1024.0 * 1024.0)  # ru_maxrss is in bytes on macOS
                        else:
                            peak_before_mb = mem_before_total_mb  # safe fallback
                    else:  # Windows: no 'resource'
                        try:
                            peak_before_mb = process.memory_full_info().peak_wset / 1e6
                        except Exception:
                            peak_before_mb = mem_before_total_mb

                    try:
                        # Call mseed_predictor directly via Ray (just like evaluate_cpu does)
                        ref = mseed_predictor.options(num_gpus=0, num_cpus=1).remote(
                            input_dir=timechunk_dir_path,
                            output_dir=self.output_dir,
                            log_queue=self.log_queue,
                            P_threshold=self.P_threshold,
                            S_threshold=self.S_threshold,
                            p_model=self.p_model_filepath,
                            s_model=self.s_model_filepath,
                            number_of_concurrent_station_predictions=predictions,
                            ray_cpus=self.cpu_id_list,
                            use_gpu=True,
                            gpu_id=self.selected_gpus,
                            gpu_memory_limit_mb=gpu_memory_limit_mb,
                            stations2use=stations,
                            timechunk_id=mseed_timechunk_dir_name,
                            waveform_overlap=self.waveform_overlap,
                            total_timechunks=len(self.tasks_picker),
                            number_of_concurrent_timechunk_predictions=1,  # Testing one timechunk at a time
                            total_analysis_time=total_analysis_time,
                            testing_gpu=True,  # Enable test mode
                            test_csv_filepath=csv_filepath,
                            intra_threads=self.intra_threads,
                            inter_threads=self.inter_threads,
                            timechunk_dt=self.timechunk_dt
                        )

                        # Wait for the result
                        log_entry = ray.get(ref)
                        log_queue.put(log_entry)  # Add log entry to the queue

                        # Success - update CSV
                        update_csv(csv_filepath, success=1, error_message="")

                    except Exception as e:
                        # A failure occurred; record it in the results CSV and the log
                        error_msg = f"{type(e).__name__}: {str(e)}"
                        update_csv(csv_filepath, success=0, error_message=error_msg)
                        self.logger.info(f"Trial {trial_num} FAILED: {error_msg}")

                    # Write log entries from the queue to the file
                    while not log_queue.empty():
                        log_entry = log_queue.get()
                        self.logger.info(f"{log_entry}")  # FIX ME

                    remove_output_subdirs(self.output_dir, logger=self.logger)
                    trial_num += 1

                    # RAM cleanup
                    # ===== AFTER RUN (before cleanup) =====
                    _rss = process.memory_info().rss
                    for _ch in process.children(recursive=True):
                        try:
                            _rss += _ch.memory_info().rss
                        except (psutil.NoSuchProcess, psutil.AccessDenied):
                            pass
                    mem_after_run_total_mb = _rss / 1e6
                    delta_run_mb = mem_after_run_total_mb - mem_before_total_mb

                    # updated peak (platform-aware)
                    if resource is not None:
                        _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                        if sys.platform.startswith("linux"):
                            peak_after_mb = _ru / 1024.0
                        elif sys.platform == "darwin":
                            peak_after_mb = _ru / (1024.0 * 1024.0)
                        else:
                            peak_after_mb = mem_after_run_total_mb
                    else:
                        try:
                            peak_after_mb = process.memory_full_info().peak_wset / 1e6
                        except Exception:
                            peak_after_mb = mem_after_run_total_mb

                    self.logger.info(
                        f"[MEM] Baseline: {mem_before_total_mb:.2f} MB | After run: {mem_after_run_total_mb:.2f} MB "
                        f"| Δrun: {delta_run_mb:.2f} MB | Peak≈{max(peak_before_mb, peak_after_mb):.2f} MB"
                    )

                    # ===== CLEANUP =====
                    # drop strong refs so GC matters
                    try: del ref
                    except NameError: pass
                    try: del log_entry
                    except NameError: pass

                    _rss = process.memory_info().rss
                    for _ch in process.children(recursive=True):
                        try:
                            _rss += _ch.memory_info().rss
                        except (psutil.NoSuchProcess, psutil.AccessDenied):
                            pass
                    mem_before_clean_mb = _rss / 1e6

                    gc.collect()
                    time.sleep(0.1)

                    _rss = process.memory_info().rss
                    for _ch in process.children(recursive=True):
                        try:
                            _rss += _ch.memory_info().rss
                        except (psutil.NoSuchProcess, psutil.AccessDenied):
                            pass
                    mem_after_clean_mb = _rss / 1e6

                    freed_mb = mem_before_clean_mb - mem_after_clean_mb
                    self.logger.info(f"[MEM] Freed ~{max(freed_mb, 0):.2f} MB; Post-clean total: {mem_after_clean_mb:.2f} MB\n")
                    self.logger.info("")

        # Stop the log forwarder
        self.log_queue.put(None)  # remember, log_queue is a Ray Queue actor and only exists while Ray is still active (cannot be used after the .shutdown())
        self._log_thread.join(timeout=2)

        ray.shutdown()  # Shut down Ray after all testing
        self.logger.info(f"Ray Successfully Shutdown.")

        self.logger.info(f"Testing complete.")
        self.logger.info(f"")
        self.logger.info(f"Finding Optimal Configurations...")
        # Compute optimal configurations (GPU)
        df = pd.read_csv(csv_filepath)
        optimal_configuration_df, best_overall_usecase_df = find_optimal_configurations_gpu(df)
        optimal_configuration_df.to_csv(f"{self.csv_dir}/optimal_configurations_gpu.csv", index=False)
        best_overall_usecase_df.to_csv(f"{self.csv_dir}/best_overall_usecase_gpu.csv", index=False)
        self.logger.info(f"Optimal Configurations Found. Findings saved to:")
        self.logger.info(f" 1) Optimal GPU/Station/Concurrent Prediction Configurations: {self.csv_dir}/optimal_configurations_gpu.csv")
        self.logger.info(f" 2) Best Overall Usecase Configuration: {self.csv_dir}/best_overall_usecase_gpu.csv")

    def evaluate(self):
        if self.eval_mode == "cpu":
            self.evaluate_cpu()
        elif self.eval_mode == "gpu":
            self.evaluate_gpu()
        else:
            exit()

    def calculate_vram(self):
        """Calculate available VRAM for GPU testing."""
        self.logger.info(f"Utilizing available VRAM...")
        total_vram, available_vram = get_gpu_vram()
        self.logger.info(f"Total VRAM: {total_vram:.2f} GB.")
        self.logger.info(f"Available VRAM: {available_vram:.2f} GB.")

        free_vram = total_vram * 0.9485 if available_vram / total_vram >= 0.9486 else available_vram
        self.logger.info(f"Using up to {round(free_vram, 2)} GB of VRAM.")
        return free_vram * 1024  # Convert to MB

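# Illustrative sketch: how EvaluateSystem might be used to benchmark CPU configurations
# on a small station set. Directory paths, the station cap, the time window, and the
# temp dir are assumed placeholder values; evaluate() dispatches to evaluate_cpu() here.
def _example_cpu_evaluation():
    evaluator = EvaluateSystem(
        eval_mode="cpu",
        input_dir="/data/mseed",                    # assumed per-timechunk mseed folders
        output_dir="/data/eval_out",
        log_filepath="/data/eval_out/evaluate.log",
        csv_dir="/data/eval_out/csv",               # trial results and optimal-config CSVs land here
        p_model_filepath="/models/eqcct_p.h5",      # assumed model paths
        s_model_filepath="/models/eqcct_s.h5",
        stations2use=10,
        cpu_id_list=[0, 1, 2, 3],
        start_time="2024-12-15 12:00:00",
        end_time="2024-12-15 14:00:00",
        timechunk_dt=60,
        waveform_overlap=2,
        tmp_dir="/data/tmp")                        # required so TMPDIR/TEMP/TMP can be set
    evaluator.evaluate()
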
"""
|
|
918
|
+
Finds the optimal CPU configuration based on evaluation results
|
|
919
|
+
"""
|
|
920
|
+
class OptimalCPUConfigurationFinder:
|
|
921
|
+
def __init__(self,
|
|
922
|
+
eval_sys_results_dir: str,
|
|
923
|
+
log_file_path: str):
|
|
924
|
+
|
|
925
|
+
self.eval_sys_results_dir = eval_sys_results_dir
|
|
926
|
+
if not self.eval_sys_results_dir or not os.path.isdir(self.eval_sys_results_dir):
|
|
927
|
+
raise ValueError(f"Error: The provided directory path '{self.eval_sys_results_dir}' is invalid or does not exist.")
|
|
928
|
+
self.log_file_path = log_file_path
|
|
929
|
+
|
|
930
|
+
# Set up main logger and logger queue to retrive queued logs from Raylets to be passed to the main logger
|
|
931
|
+
self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
|
|
932
|
+
self.logger.setLevel(logging.INFO)
|
|
933
|
+
self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
|
|
934
|
+
if not self.logger.handlers: # avoid duplicating inits
|
|
935
|
+
fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
|
936
|
+
# ensure parent dir
|
|
937
|
+
Path(self.log_file_path).parent.mkdir(parents=True, exist_ok=True)
|
|
938
|
+
file_h = logging.FileHandler(self.log_file_path) # Writes logs to file
|
|
939
|
+
stream_h = logging.StreamHandler() # Sends logs to console
|
|
940
|
+
file_h.setFormatter(fmt)
|
|
941
|
+
stream_h.setFormatter(fmt)
|
|
942
|
+
self.logger.addHandler(file_h)
|
|
943
|
+
self.logger.addHandler(stream_h)
|
|
944
|
+
|
|
945
|
+
|
|
946
|
+
def find_best_overall_usecase(self):
|
|
947
|
+
"""Finds the best overall CPU usecase configuation from eval results"""
|
|
948
|
+
file_path = f"{self.eval_sys_results_dir}/best_overall_usecase_cpu.csv"
|
|
949
|
+
if not os.path.exists(file_path):
|
|
950
|
+
raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
|
|
951
|
+
|
|
952
|
+
df_best_overall = pd.read_csv(file_path)
|
|
953
|
+
# best_config_dict = df_best_overall.set_index(df_best_overall.columns[0]).to_dict()[df_best_overall.columns[1]]
|
|
954
|
+
best_config_dict = df_best_overall.to_dict(orient='records')[0]
|
|
955
|
+
|
|
956
|
+
# Extract required values
|
|
957
|
+
num_cpus = best_config_dict.get("Number of CPUs Allocated for Ray to Use")
|
|
958
|
+
waveform_timespace = best_config_dict.get("Total Waveform Analysis Timespace (min)")
|
|
959
|
+
total_num_timechunks = best_config_dict.get("Total Number of Timechunks")
|
|
960
|
+
num_concurrent_timechunks = best_config_dict.get("Concurrent Timechunks Used")
|
|
961
|
+
length_of_timechunks = best_config_dict.get("Length of Timechunk (min)")
|
|
962
|
+
num_concurrent_stations = best_config_dict.get("Number of Concurrent Station Tasks per Timechunk")
|
|
963
|
+
intra_threads = best_config_dict.get("Intra-parallelism Threads")
|
|
964
|
+
inter_threads = best_config_dict.get("Inter-parallelism Threads")
|
|
965
|
+
num_stations = best_config_dict.get("Number of Stations Used")
|
|
966
|
+
total_runtime = best_config_dict.get("Total Run time for Picker (s)")
|
|
967
|
+
|
|
968
|
+
self.logger.info("")
|
|
969
|
+
self.logger.info(f"------- Finding the Best Overall CPU Usecase Configuration Based on Available Trial Data in {self.eval_sys_results_dir} -------")
|
|
970
|
+
self.logger.info(f"CPU(s): {num_cpus}")
|
|
971
|
+
self.logger.info(f"Intra-parallelism Threads: {intra_threads}")
|
|
972
|
+
self.logger.info(f"Inter-parallelism Threads: {inter_threads}")
|
|
973
|
+
self.logger.info(f"Waveform Timespace: {waveform_timespace}")
|
|
974
|
+
self.logger.info(f"Total Number of Stations Used: {num_stations}")
|
|
975
|
+
self.logger.info(f"Total Number of Timechunks: {total_num_timechunks}")
|
|
976
|
+
self.logger.info(f"Length of Timechunks (min): {length_of_timechunks}")
|
|
977
|
+
self.logger.info(f"Concurrent Timechunk Processes: {num_concurrent_timechunks}")
|
|
978
|
+
self.logger.info(f"Concurrent Station Processes Per Timechunk: {num_concurrent_stations}")
|
|
979
|
+
self.logger.info(f"Total Runtime (s): {total_runtime}")
|
|
980
|
+
self.logger.info("")
|
|
981
|
+
|
|
982
|
+
# return int(float(num_cpus)), int(float(intra_threads)), int(float(inter_threads)), int(float(num_concurrent_timechunks)), int(float(num_concurrent_stations)), int(float(num_stations))
|
|
983
|
+
|
|
984
|
+
    def find_optimal_for(self, cpu: int, station_count: int):
        """Finds the optimal configuration for a given number of CPUs and stations."""
        if cpu is None or station_count is None:
            raise ValueError("Error: CPU and station_count must have valid values.")

        file_path = f"{self.eval_sys_results_dir}/optimal_configurations_cpu.csv"
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")

        df_optimal = pd.read_csv(file_path)

        # Convert relevant columns to numeric
        df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
        df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
        df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
        df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")

        filtered_df = df_optimal[
            (df_optimal["Number of CPUs Allocated for Ray to Use"] == cpu) &
            (df_optimal["Number of Stations Used"] == station_count)]

        if filtered_df.empty:
            raise ValueError("No matching configuration found. Please enter a valid entry.")

        # nsmallest(1, ...) keeps only the single row with the smallest "Total Run time for Picker (s)";
        # iloc[0] then turns that one-row selection into a Series.
        best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]

        self.logger.info(f"------- Best CPU-EQCCTPro Configuration for Requested Input Parameters Based on the Available Trial Data in {self.eval_sys_results_dir} -------")
        self.logger.info(f"CPU(s): {cpu}")
        self.logger.info(f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}")
        self.logger.info(f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}")
        self.logger.info(f"Waveform Timespace: {best_config['Total Waveform Analysis Timespace (min)']}")
        self.logger.info(f"Total Number of Stations Used: {station_count}")
        self.logger.info(f"Total Number of Timechunks: {best_config['Total Number of Timechunks']}")
        self.logger.info(f"Length of Timechunks (min): {best_config['Length of Timechunk (min)']}")
        self.logger.info(f"Concurrent Timechunk Processes: {best_config['Concurrent Timechunks Used']}")
        self.logger.info(f"Concurrent Station Processes Per Timechunk: {best_config['Number of Concurrent Station Tasks']}")
        self.logger.info(f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")
        self.logger.info("")

        # return int(float(cpu)), int(float(best_config["Intra-parallelism Threads"])), int(float(best_config["Inter-parallelism Threads"])), int(float(best_config["Concurrent Timechunks Used"])), int(float(best_config["Number of Concurrent Station Tasks"])), int(float(station_count))

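# Hypothetical usage sketch for the CPU find_optimal_for() above (not part of the original
# module). The argument values are placeholders and assume matching rows exist in
# optimal_configurations_cpu.csv inside eval_sys_results_dir:
#
#   cpu_finder.find_optimal_for(cpu=4, station_count=10)
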
class OptimalGPUConfigurationFinder:
    """Finds the optimal GPU configuration based on evaluation system results."""

    def __init__(self,
                 eval_sys_results_dir: str,
                 log_file_path: str):

        self.eval_sys_results_dir = eval_sys_results_dir
        if not self.eval_sys_results_dir or not os.path.isdir(self.eval_sys_results_dir):
            raise ValueError(f"Error: The provided directory path '{self.eval_sys_results_dir}' is invalid or does not exist.")
        self.log_file_path = log_file_path

        # Set up the main logger and logger queue to retrieve queued logs from Raylets and pass them to the main logger
        self.logger = logging.getLogger("eqcctpro")  # We named the logger eqcctpro (can be any name)
        self.logger.setLevel(logging.INFO)
        self.logger.propagate = False  # if True, events logged here would also be passed to the handlers of higher-level (ancestor) loggers, in addition to any handlers attached to this logger
        if not self.logger.handlers:  # avoid attaching duplicate handlers when the class is instantiated more than once
            fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            # ensure the parent directory of the log file exists
            Path(self.log_file_path).parent.mkdir(parents=True, exist_ok=True)
            file_h = logging.FileHandler(self.log_file_path)  # writes logs to file
            stream_h = logging.StreamHandler()  # sends logs to console
            file_h.setFormatter(fmt)
            stream_h.setFormatter(fmt)
            self.logger.addHandler(file_h)
            self.logger.addHandler(stream_h)

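    # Hypothetical usage sketch for the constructor above (not part of the original
    # module). Both paths are placeholders: a results directory produced by a prior
    # evaluation run and a writable log file.
    #
    #   gpu_finder = OptimalGPUConfigurationFinder(
    #       eval_sys_results_dir="/path/to/eval_sys_results",
    #       log_file_path="/path/to/logs/eqcctpro.log",
    #   )
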
    def find_best_overall_usecase(self):
        """Finds the best overall GPU configuration from evaluation results."""
        file_path = f"{self.eval_sys_results_dir}/best_overall_usecase_gpu.csv"
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")

        df = pd.read_csv(file_path)
        if df.empty:
            raise ValueError(f"[{datetime.now()}] Error: '{file_path}' is empty.")

        row = df.iloc[0]  # the single best row written out by the evaluation step

        # Some codepaths use two different column names for concurrency; support both
        conc_col = "Number of Concurrent Station Tasks per Timechunk" \
            if "Number of Concurrent Station Tasks per Timechunk" in df.columns \
            else "Number of Concurrent Station Tasks"

        # Robust GPU parse: accepts [0], (0,), "0", 0, "", None
        num_gpus_list = _parse_gpus_field(row.get("GPUs Used"))
        # Keep as tuple for display/consistency
        num_gpus = tuple(num_gpus_list)

        # Pull/normalize scalars
        num_cpus = row.get("Number of CPUs Allocated for Ray to Use")
        num_concurrent = row.get(conc_col)
        intra_threads = row.get("Intra-parallelism Threads")
        inter_threads = row.get("Inter-parallelism Threads")
        num_stations = row.get("Number of Stations Used")
        total_runtime = row.get("Total Run time for Picker (s)")
        vram_used = row.get("VRAM Used Per Task")

        self.logger.info("")
        self.logger.info(f"------- Finding the Best Overall GPU Usecase Configuration Based on Available Trial Data in {self.eval_sys_results_dir} -------")
        self.logger.info("")
        self.logger.info(f"CPU(s): {num_cpus}")
        self.logger.info(f"GPU ID(s): {num_gpus_list}")
        self.logger.info(f"Concurrent Predictions: {num_concurrent}")
        self.logger.info(f"Intra-parallelism Threads: {intra_threads}")
        self.logger.info(f"Inter-parallelism Threads: {inter_threads}")
        self.logger.info(f"Stations: {num_stations}")
        self.logger.info(f"VRAM Used per Task: {vram_used}")
        self.logger.info(f"Total Runtime (s): {total_runtime}")
        self.logger.info("")
        # return int(float(num_cpus)), int(float(num_concurrent)), int(float(intra_threads)), int(float(inter_threads)), num_gpus, int(float(vram_used)), int(float(num_stations))

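    # Hypothetical usage sketch for find_best_overall_usecase() above (not part of the
    # original module). It assumes `gpu_finder` was constructed as in the earlier sketch
    # and that best_overall_usecase_gpu.csv exists in eval_sys_results_dir; the call
    # logs the fastest GPU configuration recorded there.
    #
    #   gpu_finder.find_best_overall_usecase()
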
    def find_optimal_for(self, num_cpus: int, gpu_list: list, station_count: int):
        """Finds the optimal configuration for a given number of CPUs, GPUs, and stations."""
        if num_cpus is None or station_count is None or gpu_list is None:
            raise ValueError("Error: num_cpus, station_count, and gpu_list must have valid values.")

        file_path = f"{self.eval_sys_results_dir}/optimal_configurations_gpu.csv"
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")

        df_optimal = pd.read_csv(file_path)

        # Convert relevant columns to numeric, handling NaNs
        df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
        df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
        df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
        df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
        df_optimal["VRAM Used Per Task"] = pd.to_numeric(df_optimal["VRAM Used Per Task"], errors="coerce")

        # Convert "GPUs Used" from string representation to list
        df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

        # Convert GPU lists to tuples for comparison
        df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: tuple(x) if isinstance(x, list) else (x,))

        # Ensure gpu_list is in tuple format for comparison
        gpu_list_tuple = tuple(gpu_list) if isinstance(gpu_list, list) else (gpu_list,)

        filtered_df = df_optimal[
            (df_optimal["Number of CPUs Allocated for Ray to Use"] == num_cpus) &
            # Compare each row's GPU tuple against the requested tuple; a direct
            # Series == tuple comparison is interpreted elementwise by pandas and
            # fails when the lengths differ.
            (df_optimal["GPUs Used"].apply(lambda gpus: gpus == gpu_list_tuple)) &
            (df_optimal["Number of Stations Used"] == station_count)
        ]

        if filtered_df.empty:
            raise ValueError("No matching configuration found. Please enter a valid entry.")

        best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]

        self.logger.info(f"------- Best GPU-EQCCTPro Configuration for Requested Input Parameters Based on the Available Trial Data in {self.eval_sys_results_dir} -------")
        self.logger.info(f"CPU(s): {num_cpus}")
        self.logger.info(f"GPU(s): {gpu_list}")
        self.logger.info(f"Concurrent Predictions: {best_config['Number of Concurrent Station Tasks']}")
        self.logger.info(f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}")
        self.logger.info(f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}")
        self.logger.info(f"Stations: {station_count}")
        self.logger.info(f"VRAM Used per Task: {best_config['VRAM Used Per Task']}")
        self.logger.info(f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")

        # return int(float(best_config["Number of CPUs Allocated for Ray to Use"])), \
        #     int(float(best_config["Number of Concurrent Station Tasks"])), \
        #     int(float(best_config["Intra-parallelism Threads"])), \
        #     int(float(best_config["Inter-parallelism Threads"])), \
        #     gpu_list, \
        #     int(float(best_config["VRAM Used Per Task"])), \
        #     int(float(station_count))
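
# Hypothetical usage sketch for OptimalGPUConfigurationFinder.find_optimal_for() above
# (not part of the original module). The CPU count, GPU IDs, and station count are
# placeholders and assume matching rows exist in optimal_configurations_gpu.csv:
#
#   gpu_finder.find_optimal_for(num_cpus=4, gpu_list=[0], station_count=10)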