eqcctpro 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eqcctpro/__init__.py +14 -2
- eqcctpro/eqcct_tf_models.py +407 -0
- eqcctpro/functionality.py +1424 -0
- eqcctpro/parallelization.py +1312 -0
- eqcctpro/seisbench_models.py +279 -0
- eqcctpro/tools.py +968 -0
- eqcctpro-0.7.0.dist-info/METADATA +312 -0
- eqcctpro-0.7.0.dist-info/RECORD +10 -0
- eqcctpro-0.6.2.dist-info/METADATA +0 -541
- eqcctpro-0.6.2.dist-info/RECORD +0 -5
- {eqcctpro-0.6.2.dist-info → eqcctpro-0.7.0.dist-info}/WHEEL +0 -0
- {eqcctpro-0.6.2.dist-info → eqcctpro-0.7.0.dist-info}/top_level.txt +0 -0
eqcctpro/functionality.py
@@ -0,0 +1,1424 @@
"""
functionality.py controls all the functionality of EQCCTPro, specifically how we access mseed_predictor() and parallel_predict.
It is a level of abstraction so we can make the code more concise and cleaner.
"""
import os
import gc
import ray
import sys
import ast
import math
import time
import queue
import psutil
import random
import numbers
import logging
import resource
import threading
import numpy as np
import pandas as pd
from .tools import *
from pathlib import Path
from .parallelization import *
from obspy import UTCDateTime
from ray.util.queue import Queue
from datetime import datetime, timedelta
from .tools import _parse_gpus_field
from logging.handlers import QueueHandler, QueueListener


class RunEQCCTPro():
    """RunEQCCTPro class for running the RunEQCCTPro functions for multiple instances of the class"""
    def __init__(self,  # self is 'this instance' of the class
                 use_gpu: bool,
                 input_dir: str,
                 output_dir: str,
                 log_filepath: str,
                 p_model_filepath: str = None,
                 s_model_filepath: str = None,
                 number_of_concurrent_station_predictions: int = None,
                 number_of_concurrent_timechunk_predictions: int = 1,
                 intra_threads: int = 1,
                 inter_threads: int = 1,
                 P_threshold: float = 0.001,
                 S_threshold: float = 0.02,
                 specific_stations: str = None,
                 csv_dir: str = None,
                 best_usecase_config: bool = False,
                 vram_mb: float = None,
                 selected_gpus: list = None,
                 cpu_id_list: list = [1],
                 start_time: str = None,
                 end_time: str = None,
                 timechunk_dt: int = 1,
                 waveform_overlap: int = 0,
                 tmp_dir: str = None,
                 # SeisBench model parameters
                 model_type: str = 'eqcct',  # 'eqcct' or 'seisbench'
                 seisbench_parent_model: str = None,  # e.g., 'PhaseNet', 'EQTransformer'
                 seisbench_child_model: str = None,  # e.g., 'original', 'stead'
                 Detection_threshold: float = 0.3):  # Detection threshold for SeisBench models

        self.use_gpu = use_gpu  # 'this instance' of the class's object, use_gpu = use_gpu
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.log_filepath = log_filepath
        self.p_model_filepath = p_model_filepath
        self.s_model_filepath = s_model_filepath
        self.number_of_concurrent_station_predictions = number_of_concurrent_station_predictions
        self.number_of_concurrent_timechunk_predictions = number_of_concurrent_timechunk_predictions
        self.intra_threads = intra_threads
        self.inter_threads = inter_threads
        self.P_threshold = P_threshold
        self.S_threshold = S_threshold
        self.specific_stations = specific_stations
        self.csv_dir = csv_dir
        self.best_usecase_config = best_usecase_config
        self.vram_mb = vram_mb
        self.selected_gpus = selected_gpus if selected_gpus is not None else list_gpu_ids()  # a list of the GPU IDs. If not provided, we use all available GPUs
        self.cpu_id_list = cpu_id_list
        self.cpu_count = len(cpu_id_list)
        self.start_time = start_time
        self.end_time = end_time
        self.timechunk_dt = timechunk_dt
        self.waveform_overlap = waveform_overlap
        self.home_tmp_dir = tmp_dir

        # SeisBench model parameters
        self.model_type = model_type.lower()
        self.seisbench_parent_model = seisbench_parent_model
        self.seisbench_child_model = seisbench_child_model
        self.Detection_threshold = Detection_threshold

        # Validate model type and parameters
        if self.model_type not in ['eqcct', 'seisbench']:
            raise ValueError(f"model_type must be 'eqcct' or 'seisbench', got '{model_type}'")

        if self.model_type == 'eqcct':
            if p_model_filepath is None or s_model_filepath is None:
                raise ValueError("For EQCCT model_type, p_model_filepath and s_model_filepath are required")
            if number_of_concurrent_station_predictions is None:
                raise ValueError("number_of_concurrent_station_predictions is required for EQCCT")
        elif self.model_type == 'seisbench':
            if seisbench_parent_model is None or seisbench_child_model is None:
                raise ValueError("For SeisBench model_type, seisbench_parent_model and seisbench_child_model are required")
            if number_of_concurrent_station_predictions is None:
                raise ValueError("number_of_concurrent_station_predictions is required for SeisBench")

        # Ensure that the output_dir exists. If it doesn't, we create it
        os.makedirs(self.output_dir, exist_ok=True)

        # Set up the main logger and a logger queue to retrieve queued logs from Raylets to be passed to the main logger
        self.logger = logging.getLogger("eqcctpro")  # We named the logger eqcctpro (can be any name)
        self.logger.setLevel(logging.INFO)
        self.logger.propagate = False  # if true, events logged to this logger are also passed to the handlers of higher-level (ancestor) loggers, in addition to any handlers attached to this logger
        if not self.logger.handlers:  # avoid duplicating handlers
            fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            file_h = logging.FileHandler(self.log_filepath)  # Writes logs to file
            stream_h = logging.StreamHandler()  # Sends logs to console
            file_h.setFormatter(fmt)
            stream_h.setFormatter(fmt)
            self.logger.addHandler(file_h)
            self.logger.addHandler(stream_h)

        self.logger.info("")
        self.logger.info(f"------- Welcome to EQCCTPro -------")
        self.logger.info("")

        # If the user passed a GPU but no valid VRAM, we need to exit
        if self.use_gpu and not (isinstance(self.vram_mb, numbers.Real) and math.isfinite(self.vram_mb) and self.vram_mb > 0):
            self.logger.error(f"No numerical VRAM passed. Please provide vram_mb (MB per Raylet per GPU) as a positive real number. Exiting...")
            sys.exit(1)

        # We need to ensure that the VRAM specified does not exceed the capabilities of the system; if it does, we need to exit safely before it happens
        if self.use_gpu:
            # Determine model VRAM requirement based on model type
            if self.model_type == 'seisbench':
                from .parallelization import get_seisbench_model_vram_mb
                model_vram_mb = get_seisbench_model_vram_mb(
                    self.seisbench_parent_model,
                    self.seisbench_child_model,
                    default_mb=2000.0  # Default VRAM for SeisBench models
                )
                self.logger.info(f"Using VRAM requirement: {model_vram_mb:.0f} MB for SeisBench model {self.seisbench_parent_model}/{self.seisbench_child_model}")
            else:
                model_vram_mb = 1500.0  # Safety reserve for EQCCT

            check_vram_per_gpu_style(
                selected_gpus=self.selected_gpus,
                get_gpu_vram_fn=lambda gid: get_gpu_vram(gpu_index=gid),
                intended_workers=self.number_of_concurrent_station_predictions * self.number_of_concurrent_timechunk_predictions,
                vram_mb=self.vram_mb,
                model_vram_mb=model_vram_mb,
                safety_cap=0.95,
                eqcct_overhead_gb=0.0,
                logger=self.logger)
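
    # --- Illustrative sketch (editor's note; not part of the packaged 0.7.0 source). ---
    # A minimal CPU-only invocation, assuming the directories, log path, and EQCCT model
    # files below exist (every path here is hypothetical):
    #
    #     runner = RunEQCCTPro(
    #         use_gpu=False,
    #         input_dir="/data/mseed",
    #         output_dir="/data/picks",
    #         log_filepath="/data/eqcctpro.log",
    #         p_model_filepath="/models/eqcct_p.h5",
    #         s_model_filepath="/models/eqcct_s.h5",
    #         number_of_concurrent_station_predictions=4,
    #         cpu_id_list=[0, 1, 2, 3],
    #         start_time="2024-12-15 12:00:00",
    #         end_time="2024-12-15 14:00:00",
    #         timechunk_dt=60,
    #         waveform_overlap=2,
    #     )
    #     runner.run_eqcctpro()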
    # To-Do: merge dt_task_generator and chunk_time into one function and concatenate the objects so we don't have so much stuff running around
    # Generates the dt tasks list
    def dt_task_generator(self):
        # Modifies the times_list values (see chunk_time()) so they are in a format the mseed_predictor can use
        tasks = [[f"({i+1}/{len(self.times_list)})", f"{self.times_list[i][0].strftime(format='%Y%m%dT%H%M%SZ')}_{self.times_list[i][1].strftime(format='%Y%m%dT%H%M%SZ')}"] for i in range(len(self.times_list))]
        self.tasks_picker = tasks

    def chunk_time(self):
        # Creates the timechunks, i.e. from time X to time Y, to generate the dt tasks (timechunk tasks that are run in parallel first at the top level)
        # e.g. [[UTCDateTime(2024, 12, 15, 11, 58), UTCDateTime(2024, 12, 15, 13, 0)], [UTCDateTime(2024, 12, 15, 12, 58), UTCDateTime(2024, 12, 15, 14, 0)]]
        starttime = UTCDateTime(self.start_time) - (self.waveform_overlap * 60)
        endtime = UTCDateTime(self.end_time)

        times_list = []
        start = starttime
        end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
        while start <= endtime:
            if end >= endtime:
                end = endtime
                times_list.append([start, end])
                break
            times_list.append([start, end])
            start = end - (self.waveform_overlap * 60)
            end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)

        self.times_list = times_list
|
|
183
|
+
while True:
|
|
184
|
+
rec = self.log_queue.get() # blocks until a record arrives
|
|
185
|
+
if rec is None: break # sentinel to stop thread
|
|
186
|
+
try:
|
|
187
|
+
self.logger.handle(rec) # routes to file+console handlers
|
|
188
|
+
except Exception:
|
|
189
|
+
# never crash on logging
|
|
190
|
+
self.logger.exception("Failed to handle worker log record")
|
|
191
|
+
|
|
192
|
+
def configure_cpu(self):
|
|
193
|
+
# We need to configure the tf_environ for the CPU configuration that is being inputted
|
|
194
|
+
self.logger.info(f"Running EQCCT over Requested MSeed Files using CPU(s)...")
|
|
195
|
+
skip_tf = (self.model_type != 'eqcct')
|
|
196
|
+
if self.best_usecase_config:
|
|
197
|
+
# We use the best usecase configuration that was found using EvaluateSystem
|
|
198
|
+
result = find_optimal_configuration_cpu(best_overall_usecase=True, eval_sys_results_dir=self.csv_dir)
|
|
199
|
+
if result is None:
|
|
200
|
+
self.logger.info("")
|
|
201
|
+
self.logger.info(f"Error: Could not retrieve an optimal CPU configuration. Please check that the CSV file exists and try again. Exiting...")
|
|
202
|
+
exit() # Exit gracefully
|
|
203
|
+
cpus_to_use, num_concurrent_predictions, intra, inter, station_count = result
|
|
204
|
+
self.logger.info("")
|
|
205
|
+
self.logger.info(f"Using {cpus_to_use} CPUs, {num_concurrent_predictions} Conc. Predictions, {intra} Intra Threads, and {inter} Inter Threads...")
|
|
206
|
+
tf_environ(gpu_id=-1, intra_threads=intra, inter_threads=inter, logger=self.logger, skip_tf=skip_tf)
|
|
207
|
+
else:
|
|
208
|
+
# We pass the requested parameters to the tf_environ
|
|
209
|
+
tf_environ(gpu_id=-1, intra_threads=self.intra_threads, inter_threads=self.inter_threads, logger=self.logger, skip_tf=skip_tf)
|
|
210
|
+
|
|
211
|
+
def configure_gpu(self):
|
|
212
|
+
# We need to configure the tf_environ for the GPU configuration that is being inputted
|
|
213
|
+
self.logger.info(f"Running EQCCT over Requested MSeed Files using GPU(s)...")
|
|
214
|
+
# In the main process (driver), we only set environment variables.
|
|
215
|
+
# We ALWAYS skip TensorFlow initialization here because the main process doesn't run models.
|
|
216
|
+
# This avoids confusing "No GPUs visible" messages if the driver's environment differs from workers.
|
|
217
|
+
skip_tf = True
|
|
218
|
+
if self.best_usecase_config:
|
|
219
|
+
result = find_optimal_configuration_gpu(True, self.csv_dir)
|
|
220
|
+
if result is None:
|
|
221
|
+
self.logger.info("")
|
|
222
|
+
self.logger.error(f"Error: Could not retrieve an optimal GPU configuration. Please check that the CSV file exists and try again. Exiting...")
|
|
223
|
+
exit() # Exit gracefully
|
|
224
|
+
|
|
225
|
+
self.logger.info("")
|
|
226
|
+
cpus_to_use, num_concurrent_predictions, intra, inter, gpus, vram_mb, station_count = result # Unpack values only if result is valid
|
|
227
|
+
self.logger.info(f"Using {cpus_to_use} CPUs, {num_concurrent_predictions} Conc. Predictions, {intra} Intra Threads, {inter} Inter Threads, {gpus} GPU IDs, and {vram_mb} MB VRAM per Task...")
|
|
228
|
+
tf_environ(gpu_id=1, vram_limit_mb=vram_mb, gpus_to_use=gpus, intra_threads=intra, inter_threads=inter, logger=self.logger, skip_tf=skip_tf)
|
|
229
|
+
|
|
230
|
+
else:
|
|
231
|
+
self.logger.info("")
|
|
232
|
+
self.logger.info(f"User requested to use GPU(s): {self.selected_gpus} with {self.vram_mb} MB of VRAM per Raylet (intra-op threads = {self.intra_threads}, inter-op threads = {self.inter_threads})") # Use the selected GPUs
|
|
233
|
+
tf_environ(gpu_id=1, vram_limit_mb=self.vram_mb, gpus_to_use=self.selected_gpus, intra_threads=self.intra_threads, inter_threads=self.inter_threads, logger=self.logger, skip_tf=skip_tf)
|
|
234
|
+
|
|
235
|
+
def eqcctpro_parallelization(self):
|
|
236
|
+
if self.specific_stations is None: # We check if the station dirs are consistent, if not, exit
|
|
237
|
+
statement, specific_stations_list, do_i_exit = check_station_dirs(input_dir=self.input_dir)
|
|
238
|
+
self.logger.info(f"{statement}")
|
|
239
|
+
if do_i_exit: exit()
|
|
240
|
+
|
|
241
|
+
# We want to use a specified amount of stations
|
|
242
|
+
else: specific_stations_list = [station.strip() for station in self.specific_stations.split(',')]
|
|
243
|
+
statement = f"Using {len(specific_stations_list)} selected station(s)."
|
|
244
|
+
self.logger.info(f"{statement}")
|
|
245
|
+
self.logger.info("")
|
|
246
|
+
|
|
247
|
+
# Submit timechunk tasks to mseed_predictor
|
|
248
|
+
tasks_queue = []
|
|
249
|
+
log_queue = queue.Queue() # Create a queue for log entries
|
|
250
|
+
|
|
251
|
+
# Compute total analyis timeframe
|
|
252
|
+
total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")
|
|
253
|
+
|
|
254
|
+
max_pending_tasks = self.number_of_concurrent_timechunk_predictions
|
|
255
|
+
self.logger.info(f"------- Starting EQCCTPro... -------")
|
|
256
|
+
self.logger.info(f"Detailed subprocess information can be found in the log file.")
|
|
257
|
+
self.logger.info("")
|
|
258
|
+
for i in range(len(self.tasks_picker)):
|
|
259
|
+
mseed_timechunk_dir_name = self.tasks_picker[i][1]
|
|
260
|
+
timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
|
|
261
|
+
|
|
262
|
+
# Concurrent Timechunks
|
|
263
|
+
while True:
|
|
264
|
+
if len(tasks_queue) < max_pending_tasks:
|
|
265
|
+
tasks_queue.append(mseed_predictor.options(num_gpus=0, num_cpus=1).remote(input_dir=timechunk_dir_path, output_dir=self.output_dir, log_queue=self.log_queue,
|
|
266
|
+
P_threshold=self.P_threshold, S_threshold=self.S_threshold, p_model=self.p_model_filepath, s_model=self.s_model_filepath,
|
|
267
|
+
number_of_concurrent_station_predictions=self.number_of_concurrent_station_predictions, ray_cpus=self.cpu_id_list, use_gpu=self.use_gpu,
|
|
268
|
+
gpu_id=self.selected_gpus, gpu_memory_limit_mb=self.vram_mb, specific_stations=specific_stations_list,
|
|
269
|
+
timechunk_id=mseed_timechunk_dir_name, waveform_overlap=self.waveform_overlap, total_timechunks=len(self.tasks_picker),
|
|
270
|
+
number_of_concurrent_timechunk_predictions=self.number_of_concurrent_timechunk_predictions, total_analysis_time=total_analysis_time,
|
|
271
|
+
intra_threads=self.intra_threads, inter_threads=self.inter_threads,
|
|
272
|
+
model_type=self.model_type, seisbench_parent_model=self.seisbench_parent_model,
|
|
273
|
+
seisbench_child_model=self.seisbench_child_model, Detection_threshold=self.Detection_threshold))
|
|
274
|
+
break
|
|
275
|
+
|
|
276
|
+
else: # If there are more tasks than maximum, just process them
|
|
277
|
+
tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
|
|
278
|
+
for finished_task in tasks_finished:
|
|
279
|
+
log_entry = ray.get(finished_task)
|
|
280
|
+
log_queue.put(log_entry) # Add log entry to the queue
|
|
281
|
+
|
|
282
|
+
# After adding all the tasks to queue, process what's left
|
|
283
|
+
while tasks_queue:
|
|
284
|
+
tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
|
|
285
|
+
for finished_task in tasks_finished:
|
|
286
|
+
log_entry = ray.get(finished_task)
|
|
287
|
+
self.logger.info(log_entry)
|
|
288
|
+
|
|
289
|
+
# stop log forwarder
|
|
290
|
+
self.log_queue.put(None) # remember, log_queue is a Ray Queue actor, and will only exist while Ray is still active (cannot be after the .shutdown())
|
|
291
|
+
self._log_thread.join(timeout=2)
|
|
292
|
+
|
|
293
|
+
ray.shutdown()
|
|
294
|
+
self.logger.info(f"Ray Successfully Shutdown.")
|
|
295
|
+
self.logger.info("------- Successfully Picked All Waveform(s) from all Timechunk(s) -------")
|
|
296
|
+
# self.logger.info("------- END OF FILE -------")
|
|
297
|
+
|
|
298
|
+
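
    # --- Illustrative note (editor's addition; not part of the packaged 0.7.0 source). ---
    # eqcctpro_parallelization() above uses a simple backpressure pattern: keep at most
    # max_pending_tasks Ray tasks in flight and, once the cap is hit, block on ray.wait()
    # until one finishes before submitting the next. A stripped-down sketch of that pattern:
    #
    #     import ray
    #
    #     @ray.remote
    #     def work(i):
    #         return i * i
    #
    #     ray.init()
    #     pending, max_pending = [], 2
    #     for i in range(10):
    #         while len(pending) >= max_pending:
    #             done, pending = ray.wait(pending, num_returns=1)
    #             _ = ray.get(done[0])
    #         pending.append(work.remote(i))
    #     while pending:
    #         done, pending = ray.wait(pending, num_returns=1)
    #         _ = ray.get(done[0])
    #     ray.shutdown()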
    def run_eqcctpro(self):
        # Set CPU affinity
        process = psutil.Process(os.getpid())
        process.cpu_affinity(self.cpu_id_list)  # Limit process to the given CPU IDs

        self.chunk_time()  # Generates the UTC times for each of the timesets in the given time range
        self.dt_task_generator()  # Generates the task list so we know how many total tasks there are for our given time range

        if self.use_gpu:  # GPU
            self.configure_gpu()
            ray.init(ignore_reinit_error=True, num_gpus=len(self.selected_gpus), num_cpus=len(self.cpu_id_list), logging_level=logging.ERROR, log_to_driver=False, _temp_dir=self.home_tmp_dir)  # Ray initialization using GPUs
            self.log_queue = Queue()  # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
            self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True)  # Background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
            self._log_thread.start()  # Starts the thread
            # Log some important info to the user
            statement = f"Ray Successfully Initialized with {self.selected_gpus} GPU(s) and {len(self.cpu_id_list)} CPU(s) ({list(self.cpu_id_list)} CPU Affinity Binding)."
            self.logger.info(f"{statement}")
            self.logger.info(f"Analyzing {len(self.times_list)} time chunk(s) from {self.start_time} to {self.end_time} (dt={self.timechunk_dt}min, overlap={self.waveform_overlap}min).")

            # Running parallelization
            self.eqcctpro_parallelization()

        else:  # CPU
            self.configure_cpu()
            ray.init(ignore_reinit_error=True, num_cpus=len(self.cpu_id_list), logging_level=logging.ERROR, log_to_driver=False, _temp_dir=self.home_tmp_dir)  # Ray initialization using CPUs
            self.log_queue = Queue()  # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
            self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True)  # Background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
            self._log_thread.start()  # Starts the thread
            # Log some important info to the user
            statement = f"Ray Successfully Initialized with {len(self.cpu_id_list)} CPU(s) ({list(self.cpu_id_list)} CPU Affinity Binding)."
            self.logger.info(f"{statement}")
            self.logger.info(f"Analyzing {len(self.times_list)} time chunk(s) from {self.start_time} to {self.end_time} (dt={self.timechunk_dt}min, overlap={self.waveform_overlap}min).")

            # Running parallelization
            self.eqcctpro_parallelization()


class EvaluateSystem():
    """Evaluate System class for running the evaluation system functions for multiple instances of the class"""
    def __init__(self,
                 eval_mode: str,
                 input_dir: str,
                 output_dir: str,
                 log_filepath: str,
                 csv_dir: str,
                 p_model_filepath: str = None,
                 s_model_filepath: str = None,
                 P_threshold: float = 0.001,
                 S_threshold: float = 0.02,
                 intra_threads: int = 1,
                 inter_threads: int = 1,
                 stations2use: int = None,
                 cpu_id_list: list = [1],
                 cpu_test_step_size: int = 1,
                 starting_amount_of_stations: int = 1,
                 station_list_step_size: int = 1,
                 min_cpu_amount: int = 1,
                 min_conc_stations: int = 1,
                 conc_station_tasks_step_size: int = 1,
                 max_vram_mb: float = None,
                 gpu_vram_safety_cap: float = 0.90,
                 selected_gpus: list = None,
                 start_time: str = None,
                 end_time: str = None,
                 conc_timechunk_tasks_step_size: int = 1,
                 timechunk_dt: int = 1,
                 waveform_overlap: int = 0,
                 tmp_dir: str = None,
                 # SeisBench model parameters
                 model_type: str = 'eqcct',  # 'eqcct' or 'seisbench'
                 seisbench_parent_model: str = None,
                 seisbench_child_model: str = None,
                 Detection_threshold: float = 0.3):

        valid_modes = {"cpu", "gpu"}
        if eval_mode not in valid_modes:
            raise ValueError(f"Invalid mode '{eval_mode}'. Choose either 'cpu' or 'gpu'.")

        self.eval_mode = eval_mode.lower()
        self.intra_threads = intra_threads
        self.inter_threads = inter_threads
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.log_filepath = log_filepath
        self.csv_dir = csv_dir
        self.P_threshold = P_threshold
        self.S_threshold = S_threshold
        self.p_model_filepath = p_model_filepath
        self.s_model_filepath = s_model_filepath
        self.stations2use = stations2use
        self.cpu_id_list = cpu_id_list
        self.vram_mb = max_vram_mb
        self.gpu_vram_safety_cap = gpu_vram_safety_cap
        self.selected_gpus = selected_gpus
        self.use_gpu = True if self.eval_mode == 'gpu' else False
        self.cpu_count = len(cpu_id_list)
        self.cpu_test_step_size = cpu_test_step_size
        self.starting_amount_of_stations = starting_amount_of_stations
        self.station_list_step_size = station_list_step_size
        self.min_cpu_amount = min_cpu_amount
        self.min_conc_stations = min_conc_stations  # default is = 1
        self.conc_station_tasks_step_size = conc_station_tasks_step_size  # default is = 1
        self.stations2use_list = list(range(1, 11)) + list(range(15, 50, 5)) if stations2use is None else generate_station_list(self.starting_amount_of_stations, stations2use, self.station_list_step_size,)
        self.start_time = start_time
        self.end_time = end_time
        self.conc_timechunk_tasks_step_size = conc_timechunk_tasks_step_size
        self.timechunk_dt = timechunk_dt
        self.waveform_overlap = waveform_overlap
        self.home_tmp_dir = tmp_dir

        # SeisBench model parameters
        self.model_type = model_type.lower()
        self.seisbench_parent_model = seisbench_parent_model
        self.seisbench_child_model = seisbench_child_model
        self.Detection_threshold = Detection_threshold

        # Validate model type and parameters
        if self.model_type not in ['eqcct', 'seisbench']:
            raise ValueError(f"model_type must be 'eqcct' or 'seisbench', got '{model_type}'")

        if self.model_type == 'eqcct':
            if p_model_filepath is None or s_model_filepath is None:
                raise ValueError("For EQCCT model_type, p_model_filepath and s_model_filepath are required")
        elif self.model_type == 'seisbench':
            if seisbench_parent_model is None or seisbench_child_model is None:
                raise ValueError("For SeisBench model_type, seisbench_parent_model and seisbench_child_model are required")

        # Ensure that the output_dir exists. If it doesn't, we create it
        os.makedirs(self.output_dir, exist_ok=True)

        # Set up the main logger and a logger queue to retrieve queued logs from Raylets to be passed to the main logger
        self.logger = logging.getLogger("eqcctpro")  # We named the logger eqcctpro (can be any name)
        self.logger.setLevel(logging.INFO)
        self.logger.propagate = False  # if true, events logged to this logger are also passed to the handlers of higher-level (ancestor) loggers, in addition to any handlers attached to this logger
        if not self.logger.handlers:  # avoid duplicating handlers
            fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            file_h = logging.FileHandler(self.log_filepath)  # Writes logs to file
            stream_h = logging.StreamHandler()  # Sends logs to console
            file_h.setFormatter(fmt)
            stream_h.setFormatter(fmt)
            self.logger.addHandler(file_h)
            self.logger.addHandler(stream_h)

        self.logger.info("")
        self.logger.info(f"------- Welcome to EQCCTPro's EvaluateSystem Functionality -------")
        self.logger.info("")
        # Set up temp dir
        import tempfile
        tempfile.tempdir = self.home_tmp_dir

        os.environ['TMPDIR'] = self.home_tmp_dir
        os.environ['TEMP'] = self.home_tmp_dir
        os.environ['TMP'] = self.home_tmp_dir
        self.logger.info(f"Successfully set up temp files to be stored at {self.home_tmp_dir}")

        # We need to ensure that the VRAM specified does not exceed the capabilities of the system; if it does, we need to exit safely before it happens
        self.chunk_time()
        intended_workers = int(len(self.stations2use_list)) * int(len(self.times_list) // 2)
        if self.eval_mode == 'gpu':
            if not self.selected_gpus:
                raise ValueError("selected_gpus must be set in GPU mode.")
            self.chunk_time()
            intended_workers = int(len(self.stations2use_list)) * int(len(self.times_list) // 2)

            # Determine model VRAM requirement based on model type
            if self.model_type == 'seisbench':
                from .parallelization import get_seisbench_model_vram_mb
                model_vram_mb = get_seisbench_model_vram_mb(
                    self.seisbench_parent_model,
                    self.seisbench_child_model,
                    default_mb=2000.0
                )
            else:
                model_vram_mb = 3000.0  # Default for EQCCT

            per_gpu_free_mb = [get_gpu_vram(gpu_index=g)[1] * 1024.0 for g in self.selected_gpus]  # free_gb -> MB
            plan = evaluate_vram_capacity(
                intended_workers=intended_workers,
                vram_per_worker_mb=float(self.vram_mb),
                per_gpu_free_mb=per_gpu_free_mb,
                model_vram_mb=model_vram_mb,
                safety_cap=self.gpu_vram_safety_cap,
                eqcct_overhead_gb=1.1,
            )
            if not plan.ok_aggregate:
                unit = plan.per_worker_mb + plan.overhead_mb
                raise RuntimeError(
                    f"Insufficient aggregate VRAM. Cap={plan.aggregate_cap_mb:.0f} MB, "
                    f"Need={plan.aggregate_need_mb:.0f} MB (= {plan.model_vram_mb:.0f}×{len(self.selected_gpus)} + "
                    f"{plan.intended_workers}×{unit:.0f})."
                )
            self.logger.info(
                f"VRAM budget OK. Need {plan.aggregate_need_mb:.0f} MB ≤ Cap {plan.aggregate_cap_mb:.0f} MB "
                f"across {len(self.selected_gpus)} GPU(s)."
            )
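
    # --- Illustrative sketch (editor's note; not part of the packaged 0.7.0 source). ---
    # A minimal CPU-mode evaluation run, assuming the hypothetical paths below exist:
    #
    #     evaluator = EvaluateSystem(
    #         eval_mode='cpu',
    #         input_dir="/data/mseed",
    #         output_dir="/data/eval_out",
    #         log_filepath="/data/eval.log",
    #         csv_dir="/data/eval_csv",
    #         p_model_filepath="/models/eqcct_p.h5",
    #         s_model_filepath="/models/eqcct_s.h5",
    #         cpu_id_list=[0, 1, 2, 3],
    #         stations2use=10,
    #         start_time="2024-12-15 12:00:00",
    #         end_time="2024-12-15 14:00:00",
    #         timechunk_dt=60,
    #         tmp_dir="/tmp/eqcctpro",
    #     )
    #     evaluator.evaluate_cpu()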
    def _generate_stations_list(self):
        """Generates station list"""
        if self.stations2use is None:
            return list(range(1, 11)) + list(range(15, 50, 5))
        return generate_station_list(self.stations2use, self.starting_amount_of_stations, self.station_list_step_size)

    # def _prepare_environment(self):
    #     """Removes 'output_dir' so that there are no conflicts in the save for a clean output return"""
    #     remove_directory(self.output_dir)

    def chunk_time(self):
        starttime = UTCDateTime(self.start_time) - (self.waveform_overlap * 60)
        endtime = UTCDateTime(self.end_time)

        times_list = []
        start = starttime
        end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
        while start <= endtime:
            if end >= endtime:
                end = endtime
                times_list.append([start, end])
                break
            times_list.append([start, end])
            start = end - (self.waveform_overlap * 60)
            end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)

        self.times_list = times_list

    def _drain_worker_logs(self):
        while True:
            rec = self.log_queue.get()  # blocks until a record arrives
            if rec is None: break  # sentinel to stop thread
            try:
                self.logger.handle(rec)  # routes to file+console handlers
            except Exception:
                # never crash on logging
                self.logger.exception("Failed to handle worker log record")

    def dt_task_generator(self):
        tasks = [[f"({i+1}/{len(self.times_list)})", f"{self.times_list[i][0].strftime(format='%Y%m%dT%H%M%SZ')}_{self.times_list[i][1].strftime(format='%Y%m%dT%H%M%SZ')}"] for i in range(len(self.times_list))]
        self.tasks_picker = tasks

    def _trial_key(self, *, num_cpus: int, stations: int, predictions: int, gpu_memory_limit_mb, timechunks: int, model: str, gpus: list = None) -> str:
        # Use the provided gpus parameter if available, otherwise fall back to self.selected_gpus
        if gpus is not None:
            gpus_to_use = gpus
        else:
            gpus_to_use = self.selected_gpus if self.selected_gpus is not None else []
        gpus_norm = tuple(sorted(int(x) for x in gpus_to_use))
        vram_norm = int(round(float(gpu_memory_limit_mb))) if gpu_memory_limit_mb not in ("", None) else ""
        return f"cpus={int(num_cpus)}|gpus={gpus_norm}|stations={int(stations)}|pred={int(predictions)}|timechunks={int(timechunks)}|vram={vram_norm}|model={model}"
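
    # --- Illustrative note (editor's addition; not part of the packaged 0.7.0 source). ---
    # Example of the key format produced by _trial_key() above, for a two-GPU trial:
    #
    #     self._trial_key(num_cpus=8, stations=10, predictions=4,
    #                     gpu_memory_limit_mb=4096, timechunks=1,
    #                     model="eqcct", gpus=[0, 1])
    #     # -> "cpus=8|gpus=(0, 1)|stations=10|pred=4|timechunks=1|vram=4096|model=eqcct"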
    def _load_existing_trial_keys(self, csv_path: str) -> set[str]:
        if not os.path.exists(csv_path):
            return set()
        try:
            df = pd.read_csv(csv_path, keep_default_na=False)
        except Exception:
            return set()

        keys = set()
        for _, row in df.iterrows():
            try:
                num_cpus = int(float(row.get("Number of CPUs Allocated for Ray to Use", 0) or 0))
                stations = int(float(row.get("Number of Stations Used", 0) or 0))
                predictions = int(float(row.get("Number of Concurrent Station Tasks", 0) or 0))
                timechunks = int(float(row.get("Concurrent Timechunks Used", 0) or 0))
                vram = row.get("Inference Actor Memory Limit (MB)", "")
                vram_mb = float(vram) if vram not in ("", None) else ""
                # Parse GPUs from this specific row (don't overwrite self.selected_gpus)
                row_gpus = _parse_gpus_field(row.get("GPUs Used")) or []
                # Extract model info
                model = row.get("Model Used", "eqcct")  # Default to eqcct for legacy rows
                # Pass the row's info directly to _trial_key
                keys.add(self._trial_key(num_cpus=num_cpus, stations=stations, predictions=predictions,
                                         gpu_memory_limit_mb=vram_mb, timechunks=timechunks, model=model, gpus=row_gpus))
            except Exception:
                continue
        return keys

    def evaluate_cpu(self):
        """Evaluate system parallelization using CPUs"""
        statement = "Evaluating System Parallelization Capability using CPU"
        self.logger.info(f"{statement}")

        os.makedirs(self.csv_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)

        # Create the test results CSV
        csv_filepath = f"{self.csv_dir}/cpu_test_results.csv"
        prepare_csv(csv_file_path=csv_filepath, logger=self.logger)
        planned_keys = self._load_existing_trial_keys(csv_filepath)

        self.chunk_time()
        self.dt_task_generator()

        trial_num = 1
        log_queue = queue.Queue()  # Create a queue for log entries
        total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")

        if self.min_cpu_amount > len(self.cpu_id_list):
            # The code won't execute because the minimum CPU amount is greater than len(cpu_id_list),
            # and the rest of the code depends on that length for generating cpu_count
            print(f"CPU ID List provided has fewer CPUs than the minimum requested ({len(self.cpu_id_list)} vs. {self.min_cpu_amount}). Exiting...")
            quit()

        with open(self.log_filepath, mode="a+", buffering=1) as log:
            for i in range(self.min_cpu_amount, self.cpu_count+1, self.cpu_test_step_size):
                # Set CPU affinity and initialize Ray
                cpus_to_use = self.cpu_id_list[:i]
                process = psutil.Process(os.getpid())
                process.cpu_affinity(cpus_to_use)  # Limit process to the given CPU IDs

                ray.init(ignore_reinit_error=True, num_cpus=len(cpus_to_use), logging_level=logging.FATAL, log_to_driver=False, _temp_dir=self.home_tmp_dir)
                self.log_queue = Queue()  # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
                self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True)  # Background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
                self._log_thread.start()  # Starts the thread
                self.logger.info(f"Ray Successfully Initialized with {len(cpus_to_use)} CPU(s) ({list(cpus_to_use)} CPU Affinity Binding).")

                timechunks_list = []
                timechunk = 1
                step = self.conc_timechunk_tasks_step_size  # Use the class attribute
                while timechunk <= len(self.tasks_picker):
                    timechunks_list.append(timechunk)
                    if timechunk == 1:
                        timechunk += 1
                    else:
                        timechunk += step

                if len(self.tasks_picker) not in timechunks_list:
                    timechunks_list.append(len(self.tasks_picker))
                # sets are collections of multiple items stored in a single variable;
                # they cannot hold duplicates and are unordered
                timechunks_list = sorted(list(set(timechunks_list)))
                # Determine model name for the trial key
                if self.model_type == 'seisbench':
                    trial_model = f"{self.seisbench_parent_model}/{self.seisbench_child_model}"
                else:
                    trial_model = "eqcct"

                for timechunks in timechunks_list:
                    tested_concurrency = set()  # Reset for each CPU / timechunk configuration
                    for num_stations in self.stations2use_list:
                        # Use a 20% step size for concurrency testing as requested
                        # This tests 20%, 40%, 60%, 80%, and 100% of the current total stations
                        step = max(1, int(num_stations * 0.2))
                        concurrent_predictions_list = sorted(list(set(range(step, num_stations + 1, step))))
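
                        # --- Illustrative note (editor's addition; not part of the packaged 0.7.0 source). ---
                        # For num_stations = 10: step = max(1, int(10 * 0.2)) = 2, so
                        # concurrent_predictions_list = [2, 4, 6, 8, 10] (i.e., 20%, 40%, 60%, 80%, 100%).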
                        # Efficiency optimization: only test concurrency values we haven't seen yet
                        # for this CPU/Timechunk combo to save compute time.
                        new_concurrent_values = [x for x in concurrent_predictions_list if x not in tested_concurrency]
                        if not new_concurrent_values:
                            continue  # All concurrency values already tested
                        for num_concurrent_predictions in new_concurrent_values:
                            tested_concurrency.add(num_concurrent_predictions)
                            key = self._trial_key(
                                num_cpus=len(cpus_to_use),
                                stations=num_stations,
                                predictions=num_concurrent_predictions,
                                gpu_memory_limit_mb="",  # CPU eval has no per-task VRAM cap
                                timechunks=timechunks,
                                model=trial_model
                            )
                            if key in planned_keys:
                                self.logger.info(f"[SKIP] Already tested: {key}")
                                continue
                            planned_keys.add(key)

                            mseed_timechunk_dir_name = self.tasks_picker[timechunks-1][1]
                            timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
                            max_pending_tasks = timechunks

                            self.logger.info("")
                            self.logger.info(f"------- Trial Number: {trial_num} -------")
                            self.logger.info(f"CPU(s): {len(cpus_to_use)}")
                            self.logger.info(f"Conc. Timechunks Being Analyzed: {timechunks} / Total Timechunks to be Analyzed: {len(self.tasks_picker)}")
                            self.logger.info(f"Total Amount of Stations to be Processed in Current Trial: {num_stations} / Number of Stations Being Processed Concurrently: {num_concurrent_predictions} / Total Overall Trial Station Count: {max(self.stations2use_list)}")

                            # Concurrent Timechunks
                            tasks_queue = []
                            log_queue = queue.Queue()  # Create a queue for log entries

                            # ===== RAM Baseline (before launching worker) =====
                            _rss = process.memory_info().rss
                            for _ch in process.children(recursive=True):
                                try:
                                    _rss += _ch.memory_info().rss
                                except (psutil.NoSuchProcess, psutil.AccessDenied):
                                    pass
                            mem_before_total_mb = _rss / 1e6

                            # peak before (platform-aware)
                            if resource is not None:  # Linux/macOS
                                _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                                if sys.platform.startswith("linux"):
                                    peak_before_mb = _ru / 1024.0  # ru_maxrss in KB on Linux
                                elif sys.platform == "darwin":
                                    peak_before_mb = _ru / (1024.0 * 1024.0)  # ru_maxrss in bytes on macOS
                                else:
                                    peak_before_mb = mem_before_total_mb  # safe fallback
                            else:  # Windows: no 'resource'
                                try:
                                    peak_before_mb = process.memory_full_info().peak_wset / 1e6
                                except Exception:
                                    peak_before_mb = mem_before_total_mb

                            try:
                                while True:
                                    if len(tasks_queue) < max_pending_tasks:
                                        tasks_queue.append(mseed_predictor.options(num_gpus=0, num_cpus=1).remote(
                                            input_dir=timechunk_dir_path, output_dir=self.output_dir, log_queue=self.log_queue,
                                            P_threshold=self.P_threshold, S_threshold=self.S_threshold, p_model=self.p_model_filepath, s_model=self.s_model_filepath,
                                            number_of_concurrent_station_predictions=num_concurrent_predictions, ray_cpus=cpus_to_use, use_gpu=self.use_gpu,
                                            gpu_id=self.selected_gpus, gpu_memory_limit_mb=self.vram_mb, stations2use=num_stations,
                                            timechunk_id=mseed_timechunk_dir_name, waveform_overlap=self.waveform_overlap, total_timechunks=len(self.tasks_picker),
                                            number_of_concurrent_timechunk_predictions=max_pending_tasks, total_analysis_time=total_analysis_time, testing_gpu=False,
                                            test_csv_filepath=csv_filepath, intra_threads=self.intra_threads, inter_threads=self.inter_threads, timechunk_dt=self.timechunk_dt,
                                            model_type=self.model_type, seisbench_parent_model=self.seisbench_parent_model,
                                            seisbench_child_model=self.seisbench_child_model, Detection_threshold=self.Detection_threshold))

                                        break

                                    else:
                                        tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
                                        for finished_task in tasks_finished:
                                            log_entry = ray.get(finished_task)
                                            log_queue.put(log_entry)  # Add log entry to the queue

                                # After adding all the tasks to the queue, process what's left
                                while tasks_queue:
                                    tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
                                    for finished_task in tasks_finished:
                                        log_entry = ray.get(finished_task)
                                        log_queue.put(log_entry)  # Add log entry to the queue

                                update_csv(csv_filepath, success=1, error_message="")
                            except Exception as e:
                                # A failure occurred; need to add it to the log
                                error_msg = f"{type(e).__name__}: {str(e)}"
                                update_csv(csv_filepath, success=0, error_message=error_msg)
                                self.logger.error(f"Trial {trial_num} FAILED: {error_msg}")

                            # Write log entries from the queue to the file
                            while not log_queue.empty():
                                log_entry = log_queue.get()

                            remove_output_subdirs(self.output_dir, logger=self.logger)
                            trial_num += 1

                            # RAM cleanup
                            # ===== AFTER RUN (before cleanup) =====
                            _rss = process.memory_info().rss
                            for _ch in process.children(recursive=True):
                                try:
                                    _rss += _ch.memory_info().rss
                                except (psutil.NoSuchProcess, psutil.AccessDenied):
                                    pass
                            mem_after_run_total_mb = _rss / 1e6
                            delta_run_mb = mem_after_run_total_mb - mem_before_total_mb

                            # updated peak (platform-aware)
                            if resource is not None:
                                _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                                if sys.platform.startswith("linux"):
                                    peak_after_mb = _ru / 1024.0
                                elif sys.platform == "darwin":
                                    peak_after_mb = _ru / (1024.0 * 1024.0)
                                else:
                                    peak_after_mb = mem_after_run_total_mb
                            else:
                                try:
                                    peak_after_mb = process.memory_full_info().peak_wset / 1e6
                                except Exception:
                                    peak_after_mb = mem_after_run_total_mb

                            self.logger.info("")
                            self.logger.info(
                                f"[MEM] Baseline: {mem_before_total_mb:.2f} MB | After run: {mem_after_run_total_mb:.2f} MB "
                                f"| Δrun: {delta_run_mb:.2f} MB | Peak≈{max(peak_before_mb, peak_after_mb):.2f} MB"
                            )

                            # ===== CLEANUP =====
                            # drop strong refs so GC matters
                            try: del ref
                            except NameError: pass
                            try: del log_entry
                            except NameError: pass

                            _rss = process.memory_info().rss
                            for _ch in process.children(recursive=True):
                                try:
                                    _rss += _ch.memory_info().rss
                                except (psutil.NoSuchProcess, psutil.AccessDenied):
                                    pass
                            mem_before_clean_mb = _rss / 1e6

                            gc.collect()
                            time.sleep(0.1)

                            _rss = process.memory_info().rss
                            for _ch in process.children(recursive=True):
                                try:
                                    _rss += _ch.memory_info().rss
                                except (psutil.NoSuchProcess, psutil.AccessDenied):
                                    pass
                            mem_after_clean_mb = _rss / 1e6

                            freed_mb = mem_before_clean_mb - mem_after_clean_mb
                            self.logger.info(f"[MEM] Freed ~{max(freed_mb, 0):.2f} MB; Post-clean total: {mem_after_clean_mb:.2f} MB")  # To-Do: improve the Freed metric (for CPU and GPU)
                            self.logger.info("")

                        # tested_concurrency.update([x for x in concurrent_predictions_list if x <= num_stations])
                        tested_concurrency.update(new_concurrent_values)

                # stop log forwarder
                self.log_queue.put(None)  # remember, log_queue is a Ray Queue actor and only exists while Ray is still active (cannot be used after the .shutdown())
                self._log_thread.join(timeout=2)

                ray.shutdown()  # Shutdown Ray after processing all timechunks for this CPU count
                self.logger.info(f"Ray Successfully Shutdown.")

        self.logger.info(f"Testing complete.")
        self.logger.info(f"")
        self.logger.info(f"Finding Optimal Configurations...")
        # Compute optimal configurations (CPU)
        df = pd.read_csv(csv_filepath)
        optimal_configuration_df, best_overall_usecase_df = find_optimal_configurations_cpu(df)
        optimal_configuration_df.to_csv(f"{self.csv_dir}/optimal_configurations_cpu.csv", index=False)
        best_overall_usecase_df.to_csv(f"{self.csv_dir}/best_overall_usecase_cpu.csv", index=False)
        self.logger.info(f"Optimal Configurations Found. Findings saved to:")
        self.logger.info(f"  1) Optimal CPU/Station/Concurrent Prediction Configurations: {self.csv_dir}/optimal_configurations_cpu.csv")
        self.logger.info(f"  2) Best Overall Usecase Configuration: {self.csv_dir}/best_overall_usecase_cpu.csv")
def evaluate_gpu(self):
|
|
828
|
+
"""Evaluate system parallelization using GPUs"""
|
|
829
|
+
statement = "Evaluating System Parallelization Capability using GPUs"
|
|
830
|
+
self.logger.info(f"{statement}")
|
|
831
|
+
|
|
832
|
+
os.makedirs(self.csv_dir, exist_ok=True)
|
|
833
|
+
os.makedirs(self.output_dir, exist_ok=True)
|
|
834
|
+
|
|
835
|
+
# Create test results csv
|
|
836
|
+
csv_filepath = f"{self.csv_dir}/gpu_test_results.csv"
|
|
837
|
+
prepare_csv(csv_file_path=csv_filepath, logger=self.logger)
|
|
838
|
+
|
|
839
|
+
# Normalize existing CSV to ensure consistent "GPUs Used" formatting and quoting
|
|
840
|
+
if os.path.exists(csv_filepath):
|
|
841
|
+
from .tools import normalize_gpu_csv_quoting
|
|
842
|
+
normalize_gpu_csv_quoting(csv_filepath)
|
|
843
|
+
self.logger.info("Normalized existing CSV entries for consistent 'GPUs Used' formatting.")
|
|
844
|
+
|
|
845
|
+
planned_keys = self._load_existing_trial_keys(csv_filepath)
|
|
846
|
+
|
|
847
|
+
# Log summary of existing trials
|
|
848
|
+
if planned_keys:
|
|
849
|
+
self.logger.info(f"Loaded {len(planned_keys)} existing trial(s) from CSV. These will be skipped.")
|
|
850
|
+
# Count trials by GPU configuration
|
|
851
|
+
gpu_counts = {}
|
|
852
|
+
for key in planned_keys:
|
|
853
|
+
# Extract GPU info from key (format: cpus=X|gpus=(...)|...)
|
|
854
|
+
if "gpus=" in key:
|
|
855
|
+
gpu_part = key.split("gpus=")[1].split("|")[0]
|
|
856
|
+
gpu_counts[gpu_part] = gpu_counts.get(gpu_part, 0) + 1
|
|
857
|
+
if gpu_counts:
|
|
858
|
+
self.logger.info("Existing trials by GPU configuration:")
|
|
859
|
+
for gpu_config, count in sorted(gpu_counts.items()):
|
|
860
|
+
self.logger.info(f" {gpu_config}: {count} trial(s)")
|
|
861
|
+
else:
|
|
862
|
+
self.logger.info("No existing trials found in CSV. Starting fresh evaluation.")
|
|
863
|
+
|
|
864
|
+
# Calculate these at the start
|
|
865
|
+
self.chunk_time()
|
|
866
|
+
self.dt_task_generator()
|
|
867
|
+
|
|
868
|
+
trial_num = 1
|
|
869
|
+
log_queue = queue.Queue() # Create a queue for log entries
|
|
870
|
+
total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")
|
|
871
|
+
|
|
872
|
+
# Track statistics
|
|
873
|
+
trials_skipped = 0
|
|
874
|
+
trials_run = 0
|
|
875
|
+
|
|
876
|
+
if self.min_cpu_amount > len(self.cpu_id_list):
|
|
877
|
+
# Code won't execute because the minimum CPU amount of > the len(cpu id list)
|
|
878
|
+
# In which the rest of the code is dependent on the len for generating cpu_count
|
|
879
|
+
print(f"CPU ID List provided has less CPUs than the minimum requested ({len(self.cpu_id_list)} vs. {self.min_cpu_amount}). Exiting...")
|
|
880
|
+
quit()
|
|
881
|
+
|
|
882
|
+
for gpu in range(len(self.selected_gpus)):
|
|
883
|
+
for cpu in range(self.min_cpu_amount, len(self.cpu_id_list)+1, self.cpu_test_step_size):
|
|
884
|
+
# Set CPU affinity and initialize Ray
|
|
885
|
+
cpus_to_use = self.cpu_id_list[:cpu] # 'cpu' is a count (e.g., 1, 2, 3). Slicing [:cpu] gets that many CPU IDs, we want :cpu bc we are using 0 index counting so 0-20 exclusive = 0-19 IDs = 20 CPUs to use explicitely. ([:n is exclusive])
|
|
886
|
+
gpus_to_use = self.selected_gpus[:gpu+1] # 'gpu' is an index (0, 1, 2). We need +1 to include the current GPU.
|
|
887
|
+
# Set CPU affinity
|
|
888
|
+
process = psutil.Process(os.getpid())
|
|
889
|
+
process.cpu_affinity(cpus_to_use) # Limit process to the given CPU IDs
|
|
890
|
+
|
|
891
|
+
# VRAM budget per GPU (MB). If vram_mb is provided, treat it as an explicit per-GPU budget override.
|
|
892
|
+
free_vram_mb = float(self.vram_mb) if self.vram_mb else self.calculate_vram()
|
|
893
|
+
self.logger.info("")
|
|
894
|
+
self.logger.info("=" * 80)
|
|
895
|
+
self.logger.info(f"Testing Using {len(gpus_to_use)} GPU(s) with IDs {gpus_to_use} and {len(cpus_to_use)} CPU(s)")
|
|
896
|
+
self.logger.info("=" * 80)
|
|
897
|
+
|
|
898
|
+
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
|
899
|
+
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpus_to_use))
|
|
900
|
+
# Initialize Ray with GPUs
|
|
901
|
+
ray.init(ignore_reinit_error=True, num_gpus=len(gpus_to_use), num_cpus=len(cpus_to_use),
|
|
902
|
+
logging_level=logging.FATAL, log_to_driver=False, _temp_dir=self.home_tmp_dir)
|
|
903
|
+
self.log_queue = Queue() # Create a Ray-safe queue to recieve LogRecord objects from workers so we can write them to file
|
|
904
|
+
self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True) # Creates background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
|
|
905
|
+
self._log_thread.start() # Starts the thread
|
|
906
|
+
self.logger.info(f"Ray Successfully Initialized with {len(gpus_to_use)} GPU(s) and {len(cpus_to_use)} CPU(s) ({list(cpus_to_use)} CPU Affinity Binding).")
|
|
907
|
+
self.logger.info(f"Trials will evalute GPU(s) performance against Iterative Total Station Tasks ({self.stations2use_list}) with Varying Concurrent Predictions.")
|
|
908
|
+
self.logger.info("")
|
|
909
|
+
|
|
910
|
+
# Efficiency optimization: track tested configurations for this GPU/CPU combo
|
|
911
|
+
tested_gpu_configs = set()
|
|
912
|
+
|
|
913
|
+
# Determine model name for trial key
|
|
914
|
+
if self.model_type == 'seisbench':
|
|
915
|
+
trial_model = f"{self.seisbench_parent_model}/{self.seisbench_child_model}"
|
|
916
|
+
else:
|
|
917
|
+
trial_model = "eqcct"
|
|
918
|
+
|
|
919
|
+
for stations in self.stations2use_list:
|
|
920
|
+
# Use a 20% step size for concurrency testing as requested
|
|
921
|
+
step = max(1, int(stations * 0.2))
|
|
922
|
+
concurrent_predictions_list = sorted(list(set(range(step, stations + 1, step))))
|
|
923
|
+
|
|
924
|
+
self.logger.info(f"Evaluating GPU(s) against {stations} TOTAL STATION(s) with 20% STEP CONCURRENT STATION PREDICTIONS: {concurrent_predictions_list}")
|
|
925
|
+
for predictions in concurrent_predictions_list:
|
|
926
|
+
vram_per_task_mb = free_vram_mb / predictions
|
|
927
|
+
step_size = vram_per_task_mb * 0.2
|
|
928
|
+
vram_steps = np.arange(step_size, vram_per_task_mb + step_size, step_size)
|
|
929
|
+
|
|
930
|
+
# Determine minimum VRAM filter based on model type
|
|
931
|
+
if self.model_type == 'seisbench':
|
|
932
|
+
from .parallelization import get_seisbench_model_vram_mb
|
|
933
|
+
min_vram = get_seisbench_model_vram_mb(
|
|
934
|
+
self.seisbench_parent_model,
|
|
935
|
+
self.seisbench_child_model,
|
|
936
|
+
default_mb=2000.0
|
|
937
|
+
)
|
|
938
|
+
else:
|
|
939
|
+
min_vram = 3000.0 # EQCCT minimum
|
|
940
|
+
|
|
941
|
+
vram_steps = vram_steps[vram_steps >= min_vram]
|
|
942
|
+
|
|
943
|
+
# We are defining the hard upper limit on how much VRAM (GPU memory) TensorFlow is allowed to use inside each ModelActor process
|
|
944
|
+
# This includes 1) Loading the model weights into GPU memory during init and 2) Usage of the ModelActor (predictions, etc.)
|
|
945
|
+
# If the actor tries to allocate more memory than available, then a OOME will occur inside that actor only
|
|
946
|
+
# Good for testing OOME prevention while not letting actors steal available memory from other actors
|
|
947
|
+
# LSS: It is purely the VRAM ceiling for each shared inference actor that handles the actual model predictions
|
|
948
|
+
for gpu_memory_limit_mb in vram_steps:
|
|
949
|
+
gpu_memory_limit_mb = int(round(float(gpu_memory_limit_mb)))
|
|
950
|
+
|
|
951
|
+
# Efficiency check: avoid redundant tests for same (concurrency, vram)
|
|
952
|
+
config_key = (predictions, gpu_memory_limit_mb)
|
|
953
|
+
if config_key in tested_gpu_configs:
|
|
954
|
+
continue
|
|
955
|
+
tested_gpu_configs.add(config_key)
|
|
956
|
+
|
|
957
|
+
key = self._trial_key(
|
|
958
|
+
num_cpus=len(cpus_to_use),
|
|
959
|
+
stations=stations,
|
|
960
|
+
predictions=predictions,
|
|
961
|
+
gpu_memory_limit_mb=gpu_memory_limit_mb,
|
|
962
|
+
timechunks=1, # your GPU eval is explicitly "one timechunk at a time"
|
|
963
|
+
model=trial_model,
|
|
964
|
+
gpus=gpus_to_use, # Pass the actual GPUs being used in this iteration
|
|
965
|
+
)
|
|
966
|
+
if key in planned_keys:
|
|
967
|
+
self.logger.info(f"[SKIP] Already tested: {key}")
|
|
968
|
+
trials_skipped += 1
|
|
969
|
+
continue
|
|
970
|
+
planned_keys.add(key)
|
|
971
|
+
trials_run += 1
|
|
972
|
+
self.logger.info("")
|
|
973
|
+
self.logger.info(f"------- Trial Number: {trial_num} -------")
|
|
974
|
+
# self.logger.info(f"VRAM Limited to {gpu_memory_limit_mb:.2f} MB per Parallel Task")
|
|
975
|
+
|
|
976
|
+
# Get the first timechunk for testing
|
|
977
|
+
mseed_timechunk_dir_name = self.tasks_picker[0][1]
|
|
978
|
+
timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
|
|
979
|
+
|
|
980
|
+
self.logger.info(f"Stations: {stations}")
|
|
981
|
+
self.logger.info(f"Concurrent Station Predictions: {predictions}")
|
|
982
|
+
self.logger.info(f"VRAM per Parallel Task: {gpu_memory_limit_mb:.2f} MB")
|
|
983
|
+
self.logger.info("")
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
# ===== Baseline RAM consumption (before launching worker) =====
|
|
987
|
+
_rss = process.memory_info().rss
|
|
988
|
+
for _ch in process.children(recursive=True):
|
|
989
|
+
try:
|
|
990
|
+
_rss += _ch.memory_info().rss
|
|
991
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
992
|
+
pass
|
|
993
|
+
mem_before_total_mb = _rss / 1e6
|
|
994
|
+
|
|
995
|
+
# peak before (platform-aware)
|
|
996
|
+
if resource is not None: # Linux/macOS
|
|
997
|
+
_ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
998
|
+
if sys.platform.startswith("linux"):
|
|
999
|
+
peak_before_mb = _ru / 1024.0 # ru_maxrss in KB on Linux
|
|
1000
|
+
elif sys.platform == "darwin":
|
|
1001
|
+
peak_before_mb = _ru / (1024.0 * 1024.0) # ru_maxrss in bytes on macOS
|
|
1002
|
+
else:
|
|
1003
|
+
peak_before_mb = mem_before_total_mb # safe fallback
|
|
1004
|
+
else: # Windows: no 'resource'
|
|
1005
|
+
try:
|
|
1006
|
+
peak_before_mb = process.memory_full_info().peak_wset / 1e6
|
|
1007
|
+
except Exception:
|
|
1008
|
+
peak_before_mb = mem_before_total_mb
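The measurement above (summed RSS of the parent process and its children, plus a platform-aware peak) is repeated after the run and around cleanup below. A hypothetical helper consolidating that pattern could look like the following sketch; the function names are assumptions and not part of the package.

# Hypothetical helpers mirroring the repeated memory-measurement pattern in this method.
import sys
import psutil

try:
    import resource  # not available on Windows
except ImportError:
    resource = None

def total_rss_mb(process: psutil.Process) -> float:
    """Resident set size of the process tree, in MB."""
    rss = process.memory_info().rss
    for child in process.children(recursive=True):
        try:
            rss += child.memory_info().rss
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    return rss / 1e6

def peak_rss_mb(process: psutil.Process, fallback_mb: float) -> float:
    """Platform-aware peak RSS of the current process, in MB."""
    if resource is not None:  # Linux/macOS
        ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if sys.platform.startswith("linux"):
            return ru / 1024.0             # ru_maxrss is reported in KB on Linux
        if sys.platform == "darwin":
            return ru / (1024.0 * 1024.0)  # ru_maxrss is reported in bytes on macOS
        return fallback_mb
    try:
        return process.memory_full_info().peak_wset / 1e6  # Windows
    except Exception:
        return fallback_mb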
|
|
1009
|
+
|
|
1010
|
+
try: # To Do: Add Concurrent Timechunks Testing for GPU/CPU too, reference eqcctpro_parallelization()
|
|
1011
|
+
# Call mseed_predictor directly via Ray (just like evaluate_cpu does)
|
|
1012
|
+
ref = mseed_predictor.options(num_gpus=0, num_cpus=1).remote(
|
|
1013
|
+
input_dir=timechunk_dir_path,
|
|
1014
|
+
output_dir=self.output_dir,
|
|
1015
|
+
log_queue=self.log_queue,
|
|
1016
|
+
P_threshold=self.P_threshold,
|
|
1017
|
+
S_threshold=self.S_threshold,
|
|
1018
|
+
p_model=self.p_model_filepath,
|
|
1019
|
+
s_model=self.s_model_filepath,
|
|
1020
|
+
number_of_concurrent_station_predictions=predictions,
|
|
1021
|
+
ray_cpus=cpus_to_use,
|
|
1022
|
+
use_gpu=self.use_gpu,
|
|
1023
|
+
gpu_id=gpus_to_use,
|
|
1024
|
+
gpu_memory_limit_mb=gpu_memory_limit_mb,
|
|
1025
|
+
stations2use=stations,
|
|
1026
|
+
timechunk_id=mseed_timechunk_dir_name,
|
|
1027
|
+
waveform_overlap=self.waveform_overlap,
|
|
1028
|
+
total_timechunks=len(self.tasks_picker),
|
|
1029
|
+
number_of_concurrent_timechunk_predictions=1, # Testing one timechunk at a time
|
|
1030
|
+
total_analysis_time=total_analysis_time,
|
|
1031
|
+
testing_gpu=True, # Enable test mode
|
|
1032
|
+
test_csv_filepath=csv_filepath,
|
|
1033
|
+
intra_threads=self.intra_threads,
|
|
1034
|
+
inter_threads=self.inter_threads,
|
|
1035
|
+
timechunk_dt=self.timechunk_dt,
|
|
1036
|
+
model_type=self.model_type, seisbench_parent_model=self.seisbench_parent_model,
|
|
1037
|
+
seisbench_child_model=self.seisbench_child_model, Detection_threshold=self.Detection_threshold
|
|
1038
|
+
)
|
|
1039
|
+
|
|
1040
|
+
# Wait for result
|
|
1041
|
+
log_entry = ray.get(ref)
|
|
1042
|
+
log_queue.put(log_entry) # Add log entry to the queue
|
|
1043
|
+
|
|
1044
|
+
# Success - update CSV
|
|
1045
|
+
update_csv(csv_filepath, success=1, error_message="")
|
|
1046
|
+
|
|
1047
|
+
except Exception as e:
|
|
1048
|
+
# Failure occurred, need to add to log
|
|
1049
|
+
error_msg = f"{type(e).__name__}: {str(e)}"
|
|
1050
|
+
update_csv(csv_filepath, success=0, error_message=error_msg)
|
|
1051
|
+
self.logger.info(f"Trial {trial_num} FAILED: {error_msg}")
|
|
1052
|
+
|
|
1053
|
+
# Write log entries from the queue to the file
|
|
1054
|
+
while not log_queue.empty():
|
|
1055
|
+
log_entry = log_queue.get()
|
|
1056
|
+
self.logger.info(f"{log_entry}") # FIX ME
|
|
1057
|
+
|
|
1058
|
+
remove_output_subdirs(self.output_dir, logger=self.logger)
|
|
1059
|
+
trial_num += 1
|
|
1060
|
+
|
|
1061
|
+
# RAM cleanup
|
|
1062
|
+
# ===== AFTER RUN (before cleanup) =====
|
|
1063
|
+
_rss = process.memory_info().rss
|
|
1064
|
+
for _ch in process.children(recursive=True):
|
|
1065
|
+
try:
|
|
1066
|
+
_rss += _ch.memory_info().rss
|
|
1067
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
1068
|
+
pass
|
|
1069
|
+
mem_after_run_total_mb = _rss / 1e6
|
|
1070
|
+
delta_run_mb = mem_after_run_total_mb - mem_before_total_mb
|
|
1071
|
+
|
|
1072
|
+
# updated peak (platform-aware)
|
|
1073
|
+
if resource is not None:
|
|
1074
|
+
_ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
1075
|
+
if sys.platform.startswith("linux"):
|
|
1076
|
+
peak_after_mb = _ru / 1024.0
|
|
1077
|
+
elif sys.platform == "darwin":
|
|
1078
|
+
peak_after_mb = _ru / (1024.0 * 1024.0)
|
|
1079
|
+
else:
|
|
1080
|
+
peak_after_mb = mem_after_run_total_mb
|
|
1081
|
+
else:
|
|
1082
|
+
try:
|
|
1083
|
+
peak_after_mb = process.memory_full_info().peak_wset / 1e6
|
|
1084
|
+
except Exception:
|
|
1085
|
+
peak_after_mb = mem_after_run_total_mb
|
|
1086
|
+
|
|
1087
|
+
self.logger.info(
|
|
1088
|
+
f"[MEM] Baseline: {mem_before_total_mb:.2f} MB | After run: {mem_after_run_total_mb:.2f} MB "
|
|
1089
|
+
f"| Δrun: {delta_run_mb:.2f} MB | Peak≈{max(peak_before_mb, peak_after_mb):.2f} MB"
|
|
1090
|
+
)
|
|
1091
|
+
|
|
1092
|
+
# ===== CLEANUP =====
|
|
1093
|
+
# drop strong refs so GC matters
|
|
1094
|
+
try: del ref
|
|
1095
|
+
except NameError: pass
|
|
1096
|
+
try: del log_entry
|
|
1097
|
+
except NameError: pass
|
|
1098
|
+
|
|
1099
|
+
_rss = process.memory_info().rss
|
|
1100
|
+
for _ch in process.children(recursive=True):
|
|
1101
|
+
try:
|
|
1102
|
+
_rss += _ch.memory_info().rss
|
|
1103
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
1104
|
+
pass
|
|
1105
|
+
mem_before_clean_mb = _rss / 1e6
|
|
1106
|
+
|
|
1107
|
+
gc.collect()
|
|
1108
|
+
time.sleep(0.1)
|
|
1109
|
+
|
|
1110
|
+
_rss = process.memory_info().rss
|
|
1111
|
+
for _ch in process.children(recursive=True):
|
|
1112
|
+
try:
|
|
1113
|
+
_rss += _ch.memory_info().rss
|
|
1114
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
1115
|
+
pass
|
|
1116
|
+
mem_after_clean_mb = _rss / 1e6
|
|
1117
|
+
|
|
1118
|
+
freed_mb = mem_before_clean_mb - mem_after_clean_mb
|
|
1119
|
+
self.logger.info(f"[MEM] Freed ~{max(freed_mb, 0):.2f} MB; Post-clean total: {mem_after_clean_mb:.2f} MB\n")
|
|
1120
|
+
self.logger.info("")
|
|
1121
|
+
|
|
1122
|
+
# stop log forwarder
|
|
1123
|
+
self.log_queue.put(None) # note: log_queue is a Ray Queue actor and only exists while Ray is active (it cannot be used after ray.shutdown())
|
|
1124
|
+
self._log_thread.join(timeout=2)
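For context, the forwarder being stopped here is a consumer thread that drains the Ray queue into the main logger until it receives the None sentinel pushed above. Its actual setup is elsewhere in this file; the sketch below only illustrates the pattern, and the names used are assumptions.

# Illustrative log-forwarder pattern (names are assumptions, not the package's code).
import logging
import threading
from ray.util.queue import Queue

def start_log_forwarder(log_queue: Queue, logger: logging.Logger) -> threading.Thread:
    def _forward():
        while True:
            entry = log_queue.get(block=True)  # blocks until a worker pushes a message
            if entry is None:                  # sentinel: stop forwarding
                break
            logger.info(entry)
    thread = threading.Thread(target=_forward, daemon=True)
    thread.start()
    return thread

# Shutdown then mirrors the two lines above: log_queue.put(None); thread.join(timeout=2)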
|
|
1125
|
+
|
|
1126
|
+
ray.shutdown() # Shutdown Ray after all testing
|
|
1127
|
+
self.logger.info(f"Ray Successfully Shutdown.")
|
|
1128
|
+
|
|
1129
|
+
self.logger.info(f"Testing complete.")
|
|
1130
|
+
self.logger.info(f"")
|
|
1131
|
+
self.logger.info(f"Trial Summary: {trials_run} new trial(s) executed, {trials_skipped} trial(s) skipped (already in CSV)")
|
|
1132
|
+
self.logger.info(f"Finding Optimal Configurations...")
|
|
1133
|
+
self.logger.info(f"Recalculating optimal configurations from all trial data (including new trials)...")
|
|
1134
|
+
# Compute optimal configurations (GPU)
|
|
1135
|
+
df = pd.read_csv(csv_filepath)
|
|
1136
|
+
optimal_configuration_df, best_overall_usecase_df = find_optimal_configurations_gpu(df)
|
|
1137
|
+
# Overwrite existing files with recalculated optimal configurations
|
|
1138
|
+
optimal_configuration_df.to_csv(f"{self.csv_dir}/optimal_configurations_gpu.csv", index=False)
|
|
1139
|
+
best_overall_usecase_df.to_csv(f"{self.csv_dir}/best_overall_usecase_gpu.csv", index=False)
|
|
1140
|
+
self.logger.info(f"Optimal Configurations Found. Findings saved to:")
|
|
1141
|
+
self.logger.info(f" 1) Optimal GPU/Station/Concurrent Prediction Configurations: {self.csv_dir}/optimal_configurations_gpu.csv")
|
|
1142
|
+
self.logger.info(f" 2) Best Overall Usecase Configuration: {self.csv_dir}/best_overall_usecase_gpu.csv")
|
|
1143
|
+
|
|
1144
|
+
def evaluate(self):
|
|
1145
|
+
if self.eval_mode == "cpu":
|
|
1146
|
+
self.evaluate_cpu()
|
|
1147
|
+
elif self.eval_mode == "gpu":
|
|
1148
|
+
self.evaluate_gpu()
|
|
1149
|
+
else:
|
|
1150
|
+
raise ValueError(f"Invalid eval_mode {self.eval_mode!r}; expected 'cpu' or 'gpu'.")
|
|
1151
|
+
|
|
1152
|
+
def calculate_vram(self):
|
|
1153
|
+
cap = float(self.gpu_vram_safety_cap)
|
|
1154
|
+
if not (0.0 < cap <= 0.99):
|
|
1155
|
+
raise ValueError(f"gpu_vram_safety_cap must be in (0, 0.99], got {cap}.")
|
|
1156
|
+
|
|
1157
|
+
gpus = self.selected_gpus if self.selected_gpus else list_gpu_ids()
|
|
1158
|
+
if not gpus:
|
|
1159
|
+
raise RuntimeError("No GPUs detected for VRAM calculation.")
|
|
1160
|
+
|
|
1161
|
+
per_gpu_budget_mb = []
|
|
1162
|
+
for gid in gpus:
|
|
1163
|
+
total_gb, free_gb = get_gpu_vram(gpu_index=gid)
|
|
1164
|
+
total_mb = float(total_gb) * 1024.0
|
|
1165
|
+
free_mb = float(free_gb) * 1024.0
|
|
1166
|
+
|
|
1167
|
+
# hard cap vs physical total, plus never exceed currently free memory
|
|
1168
|
+
budget_mb = min(total_mb * cap, free_mb)
|
|
1169
|
+
per_gpu_budget_mb.append(budget_mb)
|
|
1170
|
+
|
|
1171
|
+
self.logger.info(
|
|
1172
|
+
f"GPU {gid}: total={total_gb:.2f} GB, free={free_gb:.2f} GB, "
|
|
1173
|
+
f"budget={budget_mb/1024.0:.2f} GB (cap={cap:.2f})"
|
|
1174
|
+
)
|
|
1175
|
+
|
|
1176
|
+
budget_mb_min = float(min(per_gpu_budget_mb))
|
|
1177
|
+
self.logger.info(f"Using per-GPU VRAM budget = {budget_mb_min:.0f} MB (min across selected GPUs).")
|
|
1178
|
+
return budget_mb_min
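To make the budgeting rule concrete: with cap = 0.90, a 24 GB GPU with 20 GB currently free gets min(24 GB x 0.90, 20 GB) = 20 GB ≈ 20480 MB, and the returned budget is the minimum of such values across the selected GPUs. The numbers below are illustrative only.

# Worked example of the per-GPU budget rule above (all numbers are illustrative).
cap = 0.90
gpus = [(24.0, 20.0), (24.0, 11.5)]  # (total_gb, free_gb) for each selected GPU
budgets_mb = [min(total_gb * 1024.0 * cap, free_gb * 1024.0) for total_gb, free_gb in gpus]
budget_mb_min = min(budgets_mb)      # min(20480.0, 11776.0) -> 11776.0 MB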
|
|
1179
|
+
|
|
1180
|
+
|
|
1181
|
+
"""
|
|
1182
|
+
Finds the optimal CPU configuration based on evaluation results
|
|
1183
|
+
"""
|
|
1184
|
+
class OptimalCPUConfigurationFinder:
|
|
1185
|
+
def __init__(self,
|
|
1186
|
+
eval_sys_results_dir: str,
|
|
1187
|
+
log_file_path: str):
|
|
1188
|
+
|
|
1189
|
+
self.eval_sys_results_dir = eval_sys_results_dir
|
|
1190
|
+
if not self.eval_sys_results_dir or not os.path.isdir(self.eval_sys_results_dir):
|
|
1191
|
+
raise ValueError(f"Error: The provided directory path '{self.eval_sys_results_dir}' is invalid or does not exist.")
|
|
1192
|
+
self.log_file_path = log_file_path
|
|
1193
|
+
|
|
1194
|
+
# Set up the main logger and logger queue to retrieve queued logs from Raylets to be passed to the main logger
|
|
1195
|
+
self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
|
|
1196
|
+
self.logger.setLevel(logging.INFO)
|
|
1197
|
+
self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
|
|
1198
|
+
if not self.logger.handlers: # avoid duplicating inits
|
|
1199
|
+
fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
|
1200
|
+
# ensure parent dir
|
|
1201
|
+
Path(self.log_file_path).parent.mkdir(parents=True, exist_ok=True)
|
|
1202
|
+
file_h = logging.FileHandler(self.log_file_path) # Writes logs to file
|
|
1203
|
+
stream_h = logging.StreamHandler() # Sends logs to console
|
|
1204
|
+
file_h.setFormatter(fmt)
|
|
1205
|
+
stream_h.setFormatter(fmt)
|
|
1206
|
+
self.logger.addHandler(file_h)
|
|
1207
|
+
self.logger.addHandler(stream_h)
|
|
1208
|
+
|
|
1209
|
+
|
|
1210
|
+
def find_best_overall_usecase(self):
|
|
1211
|
+
"""Finds the best overall CPU usecase configuation from eval results"""
|
|
1212
|
+
file_path = f"{self.eval_sys_results_dir}/best_overall_usecase_cpu.csv"
|
|
1213
|
+
if not os.path.exists(file_path):
|
|
1214
|
+
raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
|
|
1215
|
+
|
|
1216
|
+
df_best_overall = pd.read_csv(file_path)
|
|
1217
|
+
# best_config_dict = df_best_overall.set_index(df_best_overall.columns[0]).to_dict()[df_best_overall.columns[1]]
|
|
1218
|
+
best_config_dict = df_best_overall.to_dict(orient='records')[0]
|
|
1219
|
+
|
|
1220
|
+
# Extract required values
|
|
1221
|
+
num_cpus = best_config_dict.get("Number of CPUs Allocated for Ray to Use")
|
|
1222
|
+
waveform_timespace = best_config_dict.get("Total Waveform Analysis Timespace (min)")
|
|
1223
|
+
total_num_timechunks = best_config_dict.get("Total Number of Timechunks")
|
|
1224
|
+
num_concurrent_timechunks = best_config_dict.get("Concurrent Timechunks Used")
|
|
1225
|
+
length_of_timechunks = best_config_dict.get("Length of Timechunk (min)")
|
|
1226
|
+
num_concurrent_stations = best_config_dict.get("Number of Concurrent Station Tasks per Timechunk")
|
|
1227
|
+
intra_threads = best_config_dict.get("Intra-parallelism Threads")
|
|
1228
|
+
inter_threads = best_config_dict.get("Inter-parallelism Threads")
|
|
1229
|
+
num_stations = best_config_dict.get("Number of Stations Used")
|
|
1230
|
+
total_runtime = best_config_dict.get("Total Run time for Picker (s)")
|
|
1231
|
+
model_used = best_config_dict.get("Model Used")
|
|
1232
|
+
|
|
1233
|
+
self.logger.info("")
|
|
1234
|
+
self.logger.info(f"------- Finding the Best Overall CPU Usecase Configuration Based on Available Trial Data in {self.eval_sys_results_dir} -------")
|
|
1235
|
+
self.logger.info(f"Model Used: {model_used}")
|
|
1236
|
+
self.logger.info(f"CPU(s): {num_cpus}")
|
|
1237
|
+
self.logger.info(f"Intra-parallelism Threads: {intra_threads}")
|
|
1238
|
+
self.logger.info(f"Inter-parallelism Threads: {inter_threads}")
|
|
1239
|
+
self.logger.info(f"Waveform Timespace: {waveform_timespace}")
|
|
1240
|
+
self.logger.info(f"Total Number of Stations Used: {num_stations}")
|
|
1241
|
+
self.logger.info(f"Total Number of Timechunks: {total_num_timechunks}")
|
|
1242
|
+
self.logger.info(f"Length of Timechunks (min): {length_of_timechunks}")
|
|
1243
|
+
self.logger.info(f"Concurrent Timechunk Processes: {num_concurrent_timechunks}")
|
|
1244
|
+
self.logger.info(f"Concurrent Station Processes Per Timechunk: {num_concurrent_stations}")
|
|
1245
|
+
self.logger.info(f"Total Runtime (s): {total_runtime}")
|
|
1246
|
+
self.logger.info("")
|
|
1247
|
+
|
|
1248
|
+
# return int(float(num_cpus)), int(float(intra_threads)), int(float(inter_threads)), int(float(num_concurrent_timechunks)), int(float(num_concurrent_stations)), int(float(num_stations))
|
|
1249
|
+
|
|
1250
|
+
def find_optimal_for(self, cpu: int, station_count: int):
|
|
1251
|
+
"""Finds the optimal configuration for a given number of CPUs and stations."""
|
|
1252
|
+
if cpu is None or station_count is None:
|
|
1253
|
+
raise ValueError("Error: CPU and station_count must have valid values.")
|
|
1254
|
+
|
|
1255
|
+
file_path = f"{self.eval_sys_results_dir}/optimal_configurations_cpu.csv"
|
|
1256
|
+
if not os.path.exists(file_path):
|
|
1257
|
+
raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
|
|
1258
|
+
|
|
1259
|
+
df_optimal = pd.read_csv(file_path)
|
|
1260
|
+
|
|
1261
|
+
# Convert relevant columns to numeric
|
|
1262
|
+
df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
|
|
1263
|
+
df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
|
|
1264
|
+
df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
|
|
1265
|
+
df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
|
|
1266
|
+
|
|
1267
|
+
filtered_df = df_optimal[
|
|
1268
|
+
(df_optimal["Number of CPUs Allocated for Ray to Use"] == cpu) &
|
|
1269
|
+
(df_optimal["Number of Stations Used"] == station_count)]
|
|
1270
|
+
|
|
1271
|
+
if filtered_df.empty:
|
|
1272
|
+
raise ValueError("No matching configuration found. Please enter a valid entry.")
|
|
1273
|
+
|
|
1274
|
+
# Finds for the "Total Run time for Picker (s)" the row with the smallest value and the '1' is to say I only want
|
|
1275
|
+
# only the single fastest row.
|
|
1276
|
+
# iloc[0] then selects that row from the DataFrame by positional index and returns it as a Series.
|
|
1277
|
+
best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]
|
|
1278
|
+
|
|
1279
|
+
self.logger.info(f"------- Best CPU-EQCCTPro Configuration for Requested Input Parameters Based on the available Trial Data in {self.eval_sys_results_dir} -------")
|
|
1280
|
+
self.logger.info(f"Model Used: {best_config.get('Model Used')}")
|
|
1281
|
+
self.logger.info(f"CPU(s): {cpu}")
|
|
1282
|
+
self.logger.info(f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}")
|
|
1283
|
+
self.logger.info(f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}")
|
|
1284
|
+
self.logger.info(f"Waveform Timespace: {best_config['Total Waveform Analysis Timespace (min)']}")
|
|
1285
|
+
self.logger.info(f"Total Number of Stations Used: {station_count}")
|
|
1286
|
+
self.logger.info(f"Total Number of Timechunks: {best_config['Total Number of Timechunks']}")
|
|
1287
|
+
self.logger.info(f"Length of Timechunks (min): {best_config['Length of Timechunk (min)']}")
|
|
1288
|
+
self.logger.info(f"Concurrent Timechunk Processes: {best_config['Concurrent Timechunks Used']}")
|
|
1289
|
+
self.logger.info(f"Concurrent Station Processes Per Timechunk: {best_config['Number of Concurrent Station Tasks']}")
|
|
1290
|
+
self.logger.info(f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")
|
|
1291
|
+
self.logger.info("")
|
|
1292
|
+
|
|
1293
|
+
# return int(float(cpu)), int(float(best_config["Intra-parallelism Threads"])), int(float(best_config["Inter-parallelism Threads"])), int(float(best_config["Concurrent Timechunks Used"])), int(float(best_config["Number of Concurrent Station Tasks"])), int(float(station_count))
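A hedged usage sketch for the class above; the directory and log-file paths are placeholders, and the CSVs must already have been produced by the evaluation step.

# Hypothetical usage of OptimalCPUConfigurationFinder (paths are placeholders).
finder = OptimalCPUConfigurationFinder(
    eval_sys_results_dir="/path/to/eval_results",  # must contain the *_cpu.csv files
    log_file_path="/path/to/logs/finder.log",
)
finder.find_best_overall_usecase()                  # logs the single best configuration on record
finder.find_optimal_for(cpu=8, station_count=20)    # best recorded configuration for 8 CPUs / 20 stations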
|
|
1294
|
+
|
|
1295
|
+
|
|
1296
|
+
class OptimalGPUConfigurationFinder:
|
|
1297
|
+
"""Finds the optimal GPU configuration based on evaluation system results."""
|
|
1298
|
+
|
|
1299
|
+
def __init__(self,
|
|
1300
|
+
eval_sys_results_dir: str,
|
|
1301
|
+
log_file_path: str):
|
|
1302
|
+
|
|
1303
|
+
self.eval_sys_results_dir = eval_sys_results_dir
|
|
1304
|
+
if not self.eval_sys_results_dir or not os.path.isdir(self.eval_sys_results_dir):
|
|
1305
|
+
raise ValueError(f"Error: The provided directory path '{self.eval_sys_results_dir}' is invalid or does not exist.")
|
|
1306
|
+
self.log_file_path = log_file_path
|
|
1307
|
+
|
|
1308
|
+
# Set up the main logger and logger queue to retrieve queued logs from Raylets to be passed to the main logger
|
|
1309
|
+
self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
|
|
1310
|
+
self.logger.setLevel(logging.INFO)
|
|
1311
|
+
self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
|
|
1312
|
+
if not self.logger.handlers: # avoid duplicating inits
|
|
1313
|
+
fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
|
1314
|
+
# ensure parent dir
|
|
1315
|
+
Path(self.log_file_path).parent.mkdir(parents=True, exist_ok=True)
|
|
1316
|
+
file_h = logging.FileHandler(self.log_file_path) # Writes logs to file
|
|
1317
|
+
stream_h = logging.StreamHandler() # Sends logs to console
|
|
1318
|
+
file_h.setFormatter(fmt)
|
|
1319
|
+
stream_h.setFormatter(fmt)
|
|
1320
|
+
self.logger.addHandler(file_h)
|
|
1321
|
+
self.logger.addHandler(stream_h)
|
|
1322
|
+
|
|
1323
|
+
def find_best_overall_usecase(self):
|
|
1324
|
+
"""Finds the best overall GPU configuration from evaluation results."""
|
|
1325
|
+
file_path = f"{self.eval_sys_results_dir}/best_overall_usecase_gpu.csv"
|
|
1326
|
+
if not os.path.exists(file_path):
|
|
1327
|
+
raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
|
|
1328
|
+
|
|
1329
|
+
df = pd.read_csv(file_path)
|
|
1330
|
+
if df.empty:
|
|
1331
|
+
raise ValueError(f"[{datetime.now()}] Error: '{file_path}' is empty.")
|
|
1332
|
+
|
|
1333
|
+
row = df.iloc[0] # the single best row written out by the evaluator
|
|
1334
|
+
|
|
1335
|
+
# Some codepaths use two different column names for concurrency; support both
|
|
1336
|
+
conc_col = "Number of Concurrent Station Tasks per Timechunk" \
|
|
1337
|
+
if "Number of Concurrent Station Tasks per Timechunk" in df.columns \
|
|
1338
|
+
else "Number of Concurrent Station Tasks"
|
|
1339
|
+
|
|
1340
|
+
# Robust GPU parse: accepts [0], (0,), "0", 0, "", None
|
|
1341
|
+
num_gpus_list = _parse_gpus_field(row.get("GPUs Used"))
|
|
1342
|
+
# Keep as tuple for display/consistency
|
|
1343
|
+
num_gpus = tuple(num_gpus_list)
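_parse_gpus_field is imported from tools.py and its implementation is not shown in this diff. Based on the comment above, a robust parser of that kind might look like the following sketch, which is an assumption rather than the package's actual helper.

# Hypothetical parser for the "GPUs Used" CSV field; accepts [0], (0,), "0", 0, "", None.
import ast

def parse_gpus_field(value):
    if value is None or (isinstance(value, float) and value != value):  # None or NaN
        return []
    if isinstance(value, (list, tuple)):
        return [int(v) for v in value]
    text = str(value).strip()
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)  # handles "[0, 1]", "(0,)", "0"
    except (ValueError, SyntaxError):
        return []
    if isinstance(parsed, (list, tuple)):
        return [int(v) for v in parsed]
    return [int(parsed)]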
|
|
1344
|
+
|
|
1345
|
+
# Pull/normalize scalars
|
|
1346
|
+
num_cpus = row.get("Number of CPUs Allocated for Ray to Use")
|
|
1347
|
+
num_concurrent = row.get(conc_col)
|
|
1348
|
+
intra_threads = row.get("Intra-parallelism Threads")
|
|
1349
|
+
inter_threads = row.get("Inter-parallelism Threads")
|
|
1350
|
+
num_stations = row.get("Number of Stations Used")
|
|
1351
|
+
total_runtime = row.get("Total Run time for Picker (s)")
|
|
1352
|
+
vram_used = row.get("Inference Actor Memory Limit (MB)")
|
|
1353
|
+
model_used = row.get("Model Used")
|
|
1354
|
+
|
|
1355
|
+
self.logger.info("")
|
|
1356
|
+
self.logger.info(f"------- Finding the Best Overall GPU Usecase Configuration Based on Available Trial Data in {self.eval_sys_results_dir} -------")
|
|
1357
|
+
self.logger.info("")
|
|
1358
|
+
self.logger.info(f"Model Used: {model_used}")
|
|
1359
|
+
self.logger.info(f"CPU(s): {num_cpus}")
|
|
1360
|
+
self.logger.info(f"GPU ID(s): {num_gpus_list}")
|
|
1361
|
+
self.logger.info(f"Concurrent Predictions: {num_concurrent}")
|
|
1362
|
+
self.logger.info(f"Intra-parallelism Threads: {intra_threads}")
|
|
1363
|
+
self.logger.info(f"Inter-parallelism Threads: {inter_threads}")
|
|
1364
|
+
self.logger.info(f"Stations: {num_stations}")
|
|
1365
|
+
self.logger.info(f"Inference Actor Memory Limit (MB): {vram_used}")
|
|
1366
|
+
self.logger.info(f"Total Runtime (s): {total_runtime}")
|
|
1367
|
+
self.logger.info("")
|
|
1368
|
+
# return int(float(num_cpus)), int(float(num_concurrent_predictions)), int(float(intra_threads)), int(float(inter_threads)), num_gpus, int(float(vram_used)), int(float(num_stations))
|
|
1369
|
+
|
|
1370
|
+
def find_optimal_for(self, num_cpus: int, gpu_list: list, station_count: int):
|
|
1371
|
+
"""Finds the optimal configuration for a given number of CPUs, GPUs, and stations."""
|
|
1372
|
+
if num_cpus is None or station_count is None or gpu_list is None:
|
|
1373
|
+
raise ValueError("Error: num_cpus, station_count, and gpu_list must have valid values.")
|
|
1374
|
+
|
|
1375
|
+
file_path = f"{self.eval_sys_results_dir}/optimal_configurations_gpu.csv"
|
|
1376
|
+
if not os.path.exists(file_path):
|
|
1377
|
+
raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
|
|
1378
|
+
|
|
1379
|
+
df_optimal = pd.read_csv(file_path)
|
|
1380
|
+
|
|
1381
|
+
# Convert relevant columns to numeric, handling NaNs
|
|
1382
|
+
df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
|
|
1383
|
+
df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
|
|
1384
|
+
df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
|
|
1385
|
+
df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
|
|
1386
|
+
df_optimal["Inference Actor Memory Limit (MB)"] = pd.to_numeric(df_optimal["Inference Actor Memory Limit (MB)"], errors="coerce")
|
|
1387
|
+
|
|
1388
|
+
# Convert "GPUs Used" from string representation to list
|
|
1389
|
+
df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
|
|
1390
|
+
|
|
1391
|
+
# Convert GPU lists to tuples for comparison
|
|
1392
|
+
df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: tuple(x) if isinstance(x, list) else (x,))
|
|
1393
|
+
|
|
1394
|
+
# Ensure gpu_list is in tuple format for comparison
|
|
1395
|
+
gpu_list_tuple = tuple(gpu_list) if isinstance(gpu_list, list) else (gpu_list,)
|
|
1396
|
+
|
|
1397
|
+
filtered_df = df_optimal[
|
|
1398
|
+
(df_optimal["Number of CPUs Allocated for Ray to Use"] == num_cpus) &
|
|
1399
|
+
(df_optimal["GPUs Used"] == gpu_list_tuple) &
|
|
1400
|
+
(df_optimal["Number of Stations Used"] == station_count)
|
|
1401
|
+
]
|
|
1402
|
+
|
|
1403
|
+
if filtered_df.empty:
|
|
1404
|
+
raise ValueError("No matching configuration found. Please enter a valid entry.")
|
|
1405
|
+
|
|
1406
|
+
best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]
|
|
1407
|
+
|
|
1408
|
+
self.logger.info(f"------- Best GPU-EQCCTPro Configuration for Requested Input Parameters Based on the Available Trial Data in {self.eval_sys_results_dir} -------")
|
|
1409
|
+
self.logger.info(f"CPU(s): {num_cpus}")
|
|
1410
|
+
self.logger.info(f"GPU(s): {gpu_list}")
|
|
1411
|
+
self.logger.info(f"Concurrent Predictions: {best_config['Number of Concurrent Station Tasks']}")
|
|
1412
|
+
self.logger.info(f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}")
|
|
1413
|
+
self.logger.info(f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}")
|
|
1414
|
+
self.logger.info(f"Stations: {station_count}")
|
|
1415
|
+
self.logger.info(f"Inference Actor Memory Limit (MB): {best_config['Inference Actor Memory Limit (MB)']}")
|
|
1416
|
+
self.logger.info(f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")
|
|
1417
|
+
|
|
1418
|
+
# return int(float(best_config["Number of CPUs Allocated for Ray to Use"])), \
|
|
1419
|
+
# int(float(best_config["Number of Concurrent Station Tasks"])), \
|
|
1420
|
+
# int(float(best_config["Intra-parallelism Threads"])), \
|
|
1421
|
+
# int(float(best_config["Inter-parallelism Threads"])), \
|
|
1422
|
+
# gpu_list, \
|
|
1423
|
+
# int(float(best_config["Inference Actor Memory Limit (MB)"])), \
|
|
1424
|
+
# int(float(station_count))
|
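A hedged usage sketch for OptimalGPUConfigurationFinder, mirroring the CPU example earlier; the paths and the GPU list are placeholders.

# Hypothetical usage of OptimalGPUConfigurationFinder (paths and GPU IDs are placeholders).
finder = OptimalGPUConfigurationFinder(
    eval_sys_results_dir="/path/to/eval_results",  # must contain the *_gpu.csv files
    log_file_path="/path/to/logs/finder.log",
)
finder.find_best_overall_usecase()                                    # best GPU configuration on record
finder.find_optimal_for(num_cpus=8, gpu_list=[0], station_count=20)   # best match for 8 CPUs, GPU 0, 20 stations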