eqcctpro 0.4.6__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1424 @@
1
+ """
2
+ functionality.py controls all the functionality of EQCCTPro, specifically how we access mseed_predictor() and parallel_predict.
3
+ It is a layer of abstraction that keeps the code concise and clean.
4
+ """
5
+ import os
6
+ import gc
7
+ import ray
8
+ import sys
9
+ import ast
10
+ import math
11
+ import queue
12
+ import psutil
13
+ import random
14
+ import numbers
15
+ import logging
16
+ import time
+ try:
+     import resource  # POSIX-only; unavailable on Windows
+ except ImportError:
+     resource = None  # the platform-aware memory checks below fall back when resource is None
17
+ import threading
18
+ from .tools import *
19
+ from pathlib import Path
20
+ from .parallelization import *
21
+ from obspy import UTCDateTime
22
+ from ray.util.queue import Queue
23
+ from datetime import datetime, timedelta
24
+ from .tools import _parse_gpus_field
25
+ from logging.handlers import QueueHandler, QueueListener
26
+
27
+
28
+ class RunEQCCTPro():
29
+ """RunEQCCTPro class for running the RunEQCCTPro functions for multiple instances of the class"""
30
+ def __init__(self, # self is 'this instance' of the class
31
+ use_gpu: bool,
32
+ input_dir: str,
33
+ output_dir: str,
34
+ log_filepath: str,
35
+ p_model_filepath: str = None,
36
+ s_model_filepath: str = None,
37
+ number_of_concurrent_station_predictions: int = None,
38
+ number_of_concurrent_timechunk_predictions: int = 1,
39
+ intra_threads: int = 1,
40
+ inter_threads: int = 1,
41
+ P_threshold: float = 0.001,
42
+ S_threshold: float = 0.02,
43
+ specific_stations: str = None,
44
+ csv_dir: str = None,
45
+ best_usecase_config: bool = False,
46
+ vram_mb: float = None,
47
+ selected_gpus: list = None,
48
+ cpu_id_list: list = [1],
49
+ start_time:str = None,
50
+ end_time:str = None,
51
+ timechunk_dt:int = 1,
52
+ waveform_overlap:int = 0,
53
+ tmp_dir:str = None,
54
+ # SeisBench model parameters
55
+ model_type: str = 'eqcct', # 'eqcct' or 'seisbench'
56
+ seisbench_parent_model: str = None, # e.g., 'PhaseNet', 'EQTransformer'
57
+ seisbench_child_model: str = None, # e.g., 'original', 'stead'
58
+ Detection_threshold: float = 0.3): # Detection threshold for SeisBench models
59
+
60
+ self.use_gpu = use_gpu # on 'this instance' of the class, set use_gpu = use_gpu
61
+ self.input_dir = input_dir
62
+ self.output_dir = output_dir
63
+ self.log_filepath = log_filepath
64
+ self.p_model_filepath = p_model_filepath
65
+ self.s_model_filepath = s_model_filepath
66
+ self.number_of_concurrent_station_predictions = number_of_concurrent_station_predictions
67
+ self.number_of_concurrent_timechunk_predictions = number_of_concurrent_timechunk_predictions
68
+ self.intra_threads = intra_threads
69
+ self.inter_threads = inter_threads
70
+ self.P_threshold = P_threshold
71
+ self.S_threshold = S_threshold
72
+ self.specific_stations = specific_stations
73
+ self.csv_dir = csv_dir
74
+ self.best_usecase_config = best_usecase_config
75
+ self.vram_mb = vram_mb
76
+ self.selected_gpus = selected_gpus if selected_gpus is not None else list_gpu_ids() # a list of the GPU IDs. If not provided, we use all available GPUs
77
+ self.cpu_id_list = cpu_id_list
78
+ self.cpu_count = len(cpu_id_list)
79
+ self.start_time = start_time
80
+ self.end_time = end_time
81
+ self.timechunk_dt = timechunk_dt
82
+ self.waveform_overlap = waveform_overlap
83
+ self.home_tmp_dir = tmp_dir
84
+
85
+ # SeisBench model parameters
86
+ self.model_type = model_type.lower()
87
+ self.seisbench_parent_model = seisbench_parent_model
88
+ self.seisbench_child_model = seisbench_child_model
89
+ self.Detection_threshold = Detection_threshold
90
+
91
+ # Validate model type and parameters
92
+ if self.model_type not in ['eqcct', 'seisbench']:
93
+ raise ValueError(f"model_type must be 'eqcct' or 'seisbench', got '{model_type}'")
94
+
95
+ if self.model_type == 'eqcct':
96
+ if p_model_filepath is None or s_model_filepath is None:
97
+ raise ValueError("For EQCCT model_type, p_model_filepath and s_model_filepath are required")
98
+ if number_of_concurrent_station_predictions is None:
99
+ raise ValueError("number_of_concurrent_station_predictions is required for EQCCT")
100
+ elif self.model_type == 'seisbench':
101
+ if seisbench_parent_model is None or seisbench_child_model is None:
102
+ raise ValueError("For SeisBench model_type, seisbench_parent_model and seisbench_child_model are required")
103
+ if number_of_concurrent_station_predictions is None:
104
+ raise ValueError("number_of_concurrent_station_predictions is required for SeisBench")
105
+
106
+ # Ensures that the output_dir exists. If it doesn't, we create it
107
+ os.makedirs(self.output_dir, exist_ok=True)
108
+
109
+ # Set up main logger and logger queue to retrieve queued logs from Raylets to be passed to the main logger
110
+ self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
111
+ self.logger.setLevel(logging.INFO)
112
+ self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
113
+ if not self.logger.handlers: # avoid duplicating inits
114
+ fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
115
+ file_h = logging.FileHandler(self.log_filepath) # Writes logs to file
116
+ stream_h = logging.StreamHandler() # Sends logs to console
117
+ file_h.setFormatter(fmt)
118
+ stream_h.setFormatter(fmt)
119
+ self.logger.addHandler(file_h)
120
+ self.logger.addHandler(stream_h)
121
+
122
+ self.logger.info("")
123
+ self.logger.info(f"------- Welcome to EQCCTPro -------")
124
+ self.logger.info("")
125
+
126
+ # If the user passed a GPU but no valid VRAM, need to exit
127
+ if self.use_gpu and not (isinstance(self.vram_mb, numbers.Real) and math.isfinite(self.vram_mb) and self.vram_mb > 0):
128
+ self.logger.error(f"No numerical VRAM passed. Please provide vram_mb (MB per Raylet per GPU) as a positive real number. Exiting...")
129
+ sys.exit(1)
130
+
131
+ # We need to ensure that the vram specified does not exceed the capabilities of the system, if not, we need to exit safely before it happens
132
+ if self.use_gpu:
133
+ # Determine model VRAM requirement based on model type
134
+ if self.model_type == 'seisbench':
135
+ from .parallelization import get_seisbench_model_vram_mb
136
+ model_vram_mb = get_seisbench_model_vram_mb(
137
+ self.seisbench_parent_model,
138
+ self.seisbench_child_model,
139
+ default_mb=2000.0 # Default VRAM for SeisBench models
140
+ )
141
+ self.logger.info(f"Using VRAM requirement: {model_vram_mb:.0f} MB for SeisBench model {self.seisbench_parent_model}/{self.seisbench_child_model}")
142
+ else:
143
+ model_vram_mb = 1500.0 # Safety reserve for EQCCT
144
+
145
+ check_vram_per_gpu_style(
146
+ selected_gpus=self.selected_gpus,
147
+ get_gpu_vram_fn=lambda gid: get_gpu_vram(gpu_index=gid),
148
+ intended_workers=self.number_of_concurrent_station_predictions * self.number_of_concurrent_timechunk_predictions,
149
+ vram_mb=self.vram_mb,
150
+ model_vram_mb=model_vram_mb,
151
+ safety_cap=0.95,
152
+ eqcct_overhead_gb=0.0,
153
+ logger=self.logger)
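+ # Illustrative budget check (assuming the same aggregate-need formula reported by
+ # evaluate_vram_capacity later in this module): with 2 concurrent station predictions x
+ # 2 concurrent timechunk predictions = 4 workers, vram_mb=4000 and model_vram_mb=1500 on a
+ # single GPU, the aggregate need is roughly 1500*1 + 4*4000 = 17500 MB, which must fit
+ # within safety_cap (0.95) of the GPU's free VRAM for the check to pass.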
154
+
155
+ # To-Do: merge dt_task_generator and chunk_time into one function and concatenate the objects so we don't have so much state floating around
156
+ # Generates the dt tasks list
157
+ def dt_task_generator(self):
158
+ # Modifies the times_list values (see chunk_time()) so it can be in a format the mseed_predictor can use
159
+ tasks = [[f"({i+1}/{len(self.times_list)})", f"{self.times_list[i][0].strftime(format='%Y%m%dT%H%M%SZ')}_{self.times_list[i][1].strftime(format='%Y%m%dT%H%M%SZ')}"] for i in range((len(self.times_list)))]
160
+ self.tasks_picker = tasks
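+ # Example output (for the two-chunk times_list shown in chunk_time() below):
+ #   [["(1/2)", "20241215T115800Z_20241215T130000Z"],
+ #    ["(2/2)", "20241215T125800Z_20241215T140000Z"]]
+ # The second element of each task doubles as the timechunk directory name expected under input_dir.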
161
+
162
+ def chunk_time(self):
163
+ # Creates the timechunks, i.e. from X specific time to Y specific time, to generate the dt tasks (timechunk tasks that are run in parallel first at the top level)
164
+ # EX. [[UTCDateTime(2024, 12, 15, 11, 58), UTCDateTime(2024, 12, 15, 13, 0)], [UTCDateTime(2024, 12, 15, 12, 58), UTCDateTime(2024, 12, 15, 14, 0)]]
165
+ starttime = UTCDateTime(self.start_time) - (self.waveform_overlap * 60)
166
+ endtime = UTCDateTime(self.end_time)
167
+
168
+ times_list = []
169
+ start = starttime
170
+ end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
171
+ while start <= endtime:
172
+ if end >= endtime:
173
+ end = endtime
174
+ times_list.append([start, end])
175
+ break
176
+ times_list.append([start, end])
177
+ start = end - (self.waveform_overlap * 60)
178
+ end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
179
+
180
+ self.times_list = times_list
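+ # Worked example (matches the EX. above): start_time="2024-12-15 12:00:00",
+ # end_time="2024-12-15 14:00:00", timechunk_dt=60 and waveform_overlap=2 give windows
+ # [11:58-13:00] and [12:58-14:00]; each chunk re-reads waveform_overlap minutes of the
+ # previous chunk so arrivals near chunk boundaries are still covered.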
181
+
182
+ def _drain_worker_logs(self):
183
+ while True:
184
+ rec = self.log_queue.get() # blocks until a record arrives
185
+ if rec is None: break # sentinel to stop thread
186
+ try:
187
+ self.logger.handle(rec) # routes to file+console handlers
188
+ except Exception:
189
+ # never crash on logging
190
+ self.logger.exception("Failed to handle worker log record")
191
+
192
+ def configure_cpu(self):
193
+ # We need to configure the tf_environ for the CPU configuration that is being inputted
194
+ self.logger.info(f"Running EQCCT over Requested MSeed Files using CPU(s)...")
195
+ skip_tf = (self.model_type != 'eqcct')
196
+ if self.best_usecase_config:
197
+ # We use the best usecase configuration that was found using EvaluateSystem
198
+ result = find_optimal_configuration_cpu(best_overall_usecase=True, eval_sys_results_dir=self.csv_dir)
199
+ if result is None:
200
+ self.logger.info("")
201
+ self.logger.info(f"Error: Could not retrieve an optimal CPU configuration. Please check that the CSV file exists and try again. Exiting...")
202
+ exit() # Exit gracefully
203
+ cpus_to_use, num_concurrent_predictions, intra, inter, station_count = result
204
+ self.logger.info("")
205
+ self.logger.info(f"Using {cpus_to_use} CPUs, {num_concurrent_predictions} Conc. Predictions, {intra} Intra Threads, and {inter} Inter Threads...")
206
+ tf_environ(gpu_id=-1, intra_threads=intra, inter_threads=inter, logger=self.logger, skip_tf=skip_tf)
207
+ else:
208
+ # We pass the requested parameters to the tf_environ
209
+ tf_environ(gpu_id=-1, intra_threads=self.intra_threads, inter_threads=self.inter_threads, logger=self.logger, skip_tf=skip_tf)
210
+
211
+ def configure_gpu(self):
212
+ # We need to configure the tf_environ for the GPU configuration that is being inputted
213
+ self.logger.info(f"Running EQCCT over Requested MSeed Files using GPU(s)...")
214
+ # In the main process (driver), we only set environment variables.
215
+ # We ALWAYS skip TensorFlow initialization here because the main process doesn't run models.
216
+ # This avoids confusing "No GPUs visible" messages if the driver's environment differs from workers.
217
+ skip_tf = True
218
+ if self.best_usecase_config:
219
+ result = find_optimal_configuration_gpu(True, self.csv_dir)
220
+ if result is None:
221
+ self.logger.info("")
222
+ self.logger.error(f"Error: Could not retrieve an optimal GPU configuration. Please check that the CSV file exists and try again. Exiting...")
223
+ exit() # Exit gracefully
224
+
225
+ self.logger.info("")
226
+ cpus_to_use, num_concurrent_predictions, intra, inter, gpus, vram_mb, station_count = result # Unpack values only if result is valid
227
+ self.logger.info(f"Using {cpus_to_use} CPUs, {num_concurrent_predictions} Conc. Predictions, {intra} Intra Threads, {inter} Inter Threads, {gpus} GPU IDs, and {vram_mb} MB VRAM per Task...")
228
+ tf_environ(gpu_id=1, vram_limit_mb=vram_mb, gpus_to_use=gpus, intra_threads=intra, inter_threads=inter, logger=self.logger, skip_tf=skip_tf)
229
+
230
+ else:
231
+ self.logger.info("")
232
+ self.logger.info(f"User requested to use GPU(s): {self.selected_gpus} with {self.vram_mb} MB of VRAM per Raylet (intra-op threads = {self.intra_threads}, inter-op threads = {self.inter_threads})") # Use the selected GPUs
233
+ tf_environ(gpu_id=1, vram_limit_mb=self.vram_mb, gpus_to_use=self.selected_gpus, intra_threads=self.intra_threads, inter_threads=self.inter_threads, logger=self.logger, skip_tf=skip_tf)
234
+
235
+ def eqcctpro_parallelization(self):
236
+ if self.specific_stations is None: # We check if the station dirs are consistent, if not, exit
237
+ statement, specific_stations_list, do_i_exit = check_station_dirs(input_dir=self.input_dir)
238
+ self.logger.info(f"{statement}")
239
+ if do_i_exit: exit()
240
+
241
+ # We want to use a specified amount of stations
242
+ else: specific_stations_list = [station.strip() for station in self.specific_stations.split(',')]
243
+ statement = f"Using {len(specific_stations_list)} selected station(s)."
244
+ self.logger.info(f"{statement}")
245
+ self.logger.info("")
246
+
247
+ # Submit timechunk tasks to mseed_predictor
248
+ tasks_queue = []
249
+ log_queue = queue.Queue() # Create a queue for log entries
250
+
251
+ # Compute total analysis timeframe
252
+ total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")
253
+
254
+ max_pending_tasks = self.number_of_concurrent_timechunk_predictions
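+ # Bounded-submission loop below: at most max_pending_tasks timechunk tasks are in flight at once;
+ # once the limit is reached, ray.wait() blocks for one task to finish before the next is submitted.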
255
+ self.logger.info(f"------- Starting EQCCTPro... -------")
256
+ self.logger.info(f"Detailed subprocess information can be found in the log file.")
257
+ self.logger.info("")
258
+ for i in range(len(self.tasks_picker)):
259
+ mseed_timechunk_dir_name = self.tasks_picker[i][1]
260
+ timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
261
+
262
+ # Concurrent Timechunks
263
+ while True:
264
+ if len(tasks_queue) < max_pending_tasks:
265
+ tasks_queue.append(mseed_predictor.options(num_gpus=0, num_cpus=1).remote(input_dir=timechunk_dir_path, output_dir=self.output_dir, log_queue=self.log_queue,
266
+ P_threshold=self.P_threshold, S_threshold=self.S_threshold, p_model=self.p_model_filepath, s_model=self.s_model_filepath,
267
+ number_of_concurrent_station_predictions=self.number_of_concurrent_station_predictions, ray_cpus=self.cpu_id_list, use_gpu=self.use_gpu,
268
+ gpu_id=self.selected_gpus, gpu_memory_limit_mb=self.vram_mb, specific_stations=specific_stations_list,
269
+ timechunk_id=mseed_timechunk_dir_name, waveform_overlap=self.waveform_overlap, total_timechunks=len(self.tasks_picker),
270
+ number_of_concurrent_timechunk_predictions=self.number_of_concurrent_timechunk_predictions, total_analysis_time=total_analysis_time,
271
+ intra_threads=self.intra_threads, inter_threads=self.inter_threads,
272
+ model_type=self.model_type, seisbench_parent_model=self.seisbench_parent_model,
273
+ seisbench_child_model=self.seisbench_child_model, Detection_threshold=self.Detection_threshold))
274
+ break
275
+
276
+ else: # If there are more tasks than maximum, just process them
277
+ tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
278
+ for finished_task in tasks_finished:
279
+ log_entry = ray.get(finished_task)
280
+ log_queue.put(log_entry) # Add log entry to the queue
281
+
282
+ # After adding all the tasks to queue, process what's left
283
+ while tasks_queue:
284
+ tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
285
+ for finished_task in tasks_finished:
286
+ log_entry = ray.get(finished_task)
287
+ self.logger.info(log_entry)
288
+
289
+ # stop log forwarder
290
+ self.log_queue.put(None) # remember, log_queue is a Ray Queue actor, and will only exist while Ray is still active (cannot be after the .shutdown())
291
+ self._log_thread.join(timeout=2)
292
+
293
+ ray.shutdown()
294
+ self.logger.info(f"Ray Successfully Shutdown.")
295
+ self.logger.info("------- Successfully Picked All Waveform(s) from all Timechunk(s) -------")
296
+ # self.logger.info("------- END OF FILE -------")
297
+
298
+ def run_eqcctpro(self):
299
+ # Set CPU affinity
300
+ process = psutil.Process(os.getpid())
301
+ process.cpu_affinity(self.cpu_id_list) # Limit process to the given CPU IDs
302
+
303
+ self.chunk_time() # Generates the UTC times for each of the timesets in the given time range
304
+ self.dt_task_generator() # Generates the task list so can know how many total tasks there are for our given time range
305
+
306
+ if self.use_gpu: # GPU
307
+ self.configure_gpu()
308
+ ray.init(ignore_reinit_error=True, num_gpus=len(self.selected_gpus), num_cpus=len(self.cpu_id_list), logging_level=logging.ERROR, log_to_driver=False, _temp_dir=self.home_tmp_dir) # Ray initialization using GPUs
309
+ self.log_queue = Queue() # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
310
+ self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True) # Creates background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
311
+ self._log_thread.start() # Starts the thread
312
+ # Log some important info for the user
313
+ statement = f"Ray Successfully Initialized with {self.selected_gpus} GPU(s) and {len(self.cpu_id_list)} CPU(s) ({list(self.cpu_id_list)} CPU Affinity Binding)."
314
+ self.logger.info(f"{statement}")
315
+ self.logger.info(f"Analyzing {len(self.times_list)} time chunk(s) from {self.start_time} to {self.end_time} (dt={self.timechunk_dt}min, overlap={self.waveform_overlap}min).")
316
+
317
+ # Running parallelization
318
+ self.eqcctpro_parallelization()
319
+
320
+ else: # CPU
321
+ self.configure_cpu()
322
+ ray.init(ignore_reinit_error=True, num_cpus=len(self.cpu_id_list), logging_level=logging.ERROR, log_to_driver=False, _temp_dir=self.home_tmp_dir) # Ray initialization using CPUs
323
+ self.log_queue = Queue() # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
324
+ self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True) # Creates background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
325
+ self._log_thread.start() # Starts the thread
326
+ # Log some important info for the user
327
+ statement = f"Ray Successfully Initialized with {len(self.cpu_id_list)} CPU(s) ({list(self.cpu_id_list)} CPU Affinity Binding)."
328
+ self.logger.info(f"{statement}")
329
+ self.logger.info(f"Analyzing {len(self.times_list)} time chunk(s) from {self.start_time} to {self.end_time} (dt={self.timechunk_dt}min, overlap={self.waveform_overlap}min).")
330
+
331
+ # Running parallelization
332
+ self.eqcctpro_parallelization()
333
+
334
+ class EvaluateSystem():
335
+ """Evaluate System class for running the evaluation system functions for multiple instances of the class"""
336
+ def __init__(self,
337
+ eval_mode: str,
338
+ input_dir: str,
339
+ output_dir: str,
340
+ log_filepath: str,
341
+ csv_dir: str,
342
+ p_model_filepath: str = None,
343
+ s_model_filepath: str = None,
344
+ P_threshold: float = 0.001,
345
+ S_threshold: float = 0.02,
346
+ intra_threads: int = 1,
347
+ inter_threads: int = 1,
348
+ stations2use:int = None,
349
+ cpu_id_list:list = [1],
350
+ cpu_test_step_size:int = 1,
351
+ starting_amount_of_stations: int = 1,
352
+ station_list_step_size: int = 1,
353
+ min_cpu_amount: int = 1,
354
+ min_conc_stations: int = 1,
355
+ conc_station_tasks_step_size: int = 1,
356
+ max_vram_mb:float = None,
357
+ gpu_vram_safety_cap:float = 0.90,
358
+ selected_gpus:list = None,
359
+ start_time:str = None,
360
+ end_time:str = None,
361
+ conc_timechunk_tasks_step_size: int = 1,
362
+ timechunk_dt:int = 1,
363
+ waveform_overlap:int = 0,
364
+ tmp_dir:str = None,
365
+ # SeisBench model parameters
366
+ model_type: str = 'eqcct', # 'eqcct' or 'seisbench'
367
+ seisbench_parent_model: str = None,
368
+ seisbench_child_model: str = None,
369
+ Detection_threshold: float = 0.3):
370
+
371
+ valid_modes = {"cpu", "gpu"}
372
+ if eval_mode.lower() not in valid_modes:
373
+ raise ValueError(f"Invalid mode '{eval_mode}'. Choose either 'cpu' or 'gpu'.")
374
+
375
+ self.eval_mode = eval_mode.lower()
376
+ self.intra_threads = intra_threads
377
+ self.inter_threads = inter_threads
378
+ self.input_dir = input_dir
379
+ self.output_dir = output_dir
380
+ self.log_filepath = log_filepath
381
+ self.csv_dir = csv_dir
382
+ self.P_threshold = P_threshold
383
+ self.S_threshold = S_threshold
384
+ self.p_model_filepath = p_model_filepath
385
+ self.s_model_filepath = s_model_filepath
386
+ self.stations2use = stations2use
387
+ self.cpu_id_list = cpu_id_list
388
+ self.vram_mb = max_vram_mb
389
+ self.gpu_vram_safety_cap = gpu_vram_safety_cap
390
+ self.selected_gpus = selected_gpus
391
+ self.use_gpu = True if self.eval_mode == 'gpu' else False
392
+ self.cpu_count = len(cpu_id_list)
393
+ self.cpu_test_step_size = cpu_test_step_size
394
+ self.starting_amount_of_stations = starting_amount_of_stations
395
+ self.station_list_step_size = station_list_step_size
396
+ self.min_cpu_amount = min_cpu_amount
397
+ self.min_conc_stations = min_conc_stations # default is = 1
398
+ self.conc_station_tasks_step_size = conc_station_tasks_step_size # default is = 1
399
+ self.stations2use_list = list(range(1, 11)) + list(range(15, 50, 5)) if stations2use is None else generate_station_list(self.starting_amount_of_stations, stations2use, self.station_list_step_size,)
400
+ self.start_time = start_time
401
+ self.end_time = end_time
402
+ self.conc_timechunk_tasks_step_size = conc_timechunk_tasks_step_size
403
+ self.timechunk_dt = timechunk_dt
404
+ self.waveform_overlap = waveform_overlap
405
+ self.home_tmp_dir = tmp_dir
406
+
407
+ # SeisBench model parameters
408
+ self.model_type = model_type.lower()
409
+ self.seisbench_parent_model = seisbench_parent_model
410
+ self.seisbench_child_model = seisbench_child_model
411
+ self.Detection_threshold = Detection_threshold
412
+
413
+ # Validate model type and parameters
414
+ if self.model_type not in ['eqcct', 'seisbench']:
415
+ raise ValueError(f"model_type must be 'eqcct' or 'seisbench', got '{model_type}'")
416
+
417
+ if self.model_type == 'eqcct':
418
+ if p_model_filepath is None or s_model_filepath is None:
419
+ raise ValueError("For EQCCT model_type, p_model_filepath and s_model_filepath are required")
420
+ elif self.model_type == 'seisbench':
421
+ if seisbench_parent_model is None or seisbench_child_model is None:
422
+ raise ValueError("For SeisBench model_type, seisbench_parent_model and seisbench_child_model are required")
423
+
424
+ # Ensures that the output_dir exists. If it doesn't, we create it
425
+ os.makedirs(self.output_dir, exist_ok=True)
426
+
427
+ # Set up main logger and logger queue to retrieve queued logs from Raylets to be passed to the main logger
428
+ self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
429
+ self.logger.setLevel(logging.INFO)
430
+ self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
431
+ if not self.logger.handlers: # avoid duplicating inits
432
+ fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
433
+ file_h = logging.FileHandler(self.log_filepath) # Writes logs to file
434
+ stream_h = logging.StreamHandler() # Sends logs to console
435
+ file_h.setFormatter(fmt)
436
+ stream_h.setFormatter(fmt)
437
+ self.logger.addHandler(file_h)
438
+ self.logger.addHandler(stream_h)
439
+
440
+ self.logger.info("")
441
+ self.logger.info(f"------- Welcome to EQCCTPro's EvaluateSystem Functionality -------")
442
+ self.logger.info("")
443
+ # Set up temp dir
444
+ import tempfile
445
+ tempfile.tempdir = self.home_tmp_dir # tempfile.tempdir (not .tempfile) is the module-level default temp directory
446
+
447
+ os.environ['TMPDIR'] = self.home_tmp_dir
448
+ os.environ['TEMP'] = self.home_tmp_dir
449
+ os.environ['TMP'] = self.home_tmp_dir
450
+ self.logger.info(f"Successfully set up temp files to be stored at {self.home_tmp_dir}")
451
+
452
+ # We need to ensure that the vram specified does not exceed the capabilities of the system; if not, we need to exit safely before it happens
453
+ self.chunk_time()
454
+ intended_workers = int(len(self.stations2use_list)) * int(len(self.times_list) // 2)
455
+ if self.eval_mode == 'gpu':
456
+ if not self.selected_gpus:
457
+ raise ValueError("selected_gpus must be set in GPU mode.")
458
+ self.chunk_time()
459
+ intended_workers = int(len(self.stations2use_list)) * int(len(self.times_list) // 2)
460
+
461
+ # Determine model VRAM requirement based on model type
462
+ if self.model_type == 'seisbench':
463
+ from .parallelization import get_seisbench_model_vram_mb
464
+ model_vram_mb = get_seisbench_model_vram_mb(
465
+ self.seisbench_parent_model,
466
+ self.seisbench_child_model,
467
+ default_mb=2000.0
468
+ )
469
+ else:
470
+ model_vram_mb = 3000.0 # Default for EQCCT
471
+
472
+ per_gpu_free_mb = [get_gpu_vram(gpu_index=g)[1] * 1024.0 for g in self.selected_gpus] # free_gb -> MB
473
+ plan = evaluate_vram_capacity(
474
+ intended_workers=intended_workers,
475
+ vram_per_worker_mb=float(self.vram_mb),
476
+ per_gpu_free_mb=per_gpu_free_mb,
477
+ model_vram_mb=model_vram_mb,
478
+ safety_cap=self.gpu_vram_safety_cap,
479
+ eqcct_overhead_gb=1.1,
480
+ )
481
+ if not plan.ok_aggregate:
482
+ unit = plan.per_worker_mb + plan.overhead_mb
483
+ raise RuntimeError(
484
+ f"Insufficient aggregate VRAM. Cap={plan.aggregate_cap_mb:.0f} MB, "
485
+ f"Need={plan.aggregate_need_mb:.0f} MB (= {plan.model_vram_mb:.0f}×{len(self.selected_gpus)} + "
486
+ f"{plan.intended_workers}×{unit:.0f})."
487
+ )
488
+ self.logger.info(
489
+ f"VRAM budget OK. Need {plan.aggregate_need_mb:.0f} MB ≤ Cap {plan.aggregate_cap_mb:.0f} MB "
490
+ f"across {len(self.selected_gpus)} GPU(s)."
491
+ )
492
+
493
+ def _generate_stations_list(self):
494
+ """Generates station list"""
495
+ if self.stations2use is None:
496
+ return list(range(1, 11)) + list(range(15, 50, 5))
497
+ return generate_station_list(self.starting_amount_of_stations, self.stations2use, self.station_list_step_size)
498
+
499
+ # def _prepare_environment(self):
500
+ # """Removed 'output_dir' so that there is no conflicts in the save for a clean output return"""
501
+ # remove_directory(self.output_dir)
502
+
503
+ def chunk_time(self):
504
+ starttime = UTCDateTime(self.start_time) - (self.waveform_overlap * 60)
505
+ endtime = UTCDateTime(self.end_time)
506
+
507
+ times_list = []
508
+ start = starttime
509
+ end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
510
+ while start <= endtime:
511
+ if end >= endtime:
512
+ end = endtime
513
+ times_list.append([start, end])
514
+ break
515
+ times_list.append([start, end])
516
+ start = end - (self.waveform_overlap * 60)
517
+ end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
518
+
519
+ self.times_list = times_list
520
+
521
+ def _drain_worker_logs(self):
522
+ while True:
523
+ rec = self.log_queue.get() # blocks until a record arrives
524
+ if rec is None: break # sentinel to stop thread
525
+ try:
526
+ self.logger.handle(rec) # routes to file+console handlers
527
+ except Exception:
528
+ # never crash on logging
529
+ self.logger.exception("Failed to handle worker log record")
530
+
531
+ def dt_task_generator(self):
532
+ tasks = [[f"({i+1}/{len(self.times_list)})", f"{self.times_list[i][0].strftime(format='%Y%m%dT%H%M%SZ')}_{self.times_list[i][1].strftime(format='%Y%m%dT%H%M%SZ')}"] for i in range((len(self.times_list)))]
533
+ self.tasks_picker = tasks
534
+
535
+ def _trial_key(self, *, num_cpus: int, stations: int, predictions: int, gpu_memory_limit_mb, timechunks: int, model: str, gpus: list = None) -> str:
536
+ # Use provided gpus parameter if available, otherwise fall back to self.selected_gpus
537
+ if gpus is not None:
538
+ gpus_to_use = gpus
539
+ else:
540
+ gpus_to_use = self.selected_gpus if self.selected_gpus is not None else []
541
+ gpus_norm = tuple(sorted(int(x) for x in gpus_to_use))
542
+ vram_norm = int(round(float(gpu_memory_limit_mb))) if gpu_memory_limit_mb not in ("", None) else ""
543
+ return f"cpus={int(num_cpus)}|gpus={gpus_norm}|stations={int(stations)}|pred={int(predictions)}|timechunks={int(timechunks)}|vram={vram_norm}|model={model}"
544
+
545
+ def _load_existing_trial_keys(self, csv_path: str) -> set[str]:
546
+ if not os.path.exists(csv_path):
547
+ return set()
548
+ try:
549
+ df = pd.read_csv(csv_path, keep_default_na=False)
550
+ except Exception:
551
+ return set()
552
+
553
+ keys = set()
554
+ for _, row in df.iterrows():
555
+ try:
556
+ num_cpus = int(float(row.get("Number of CPUs Allocated for Ray to Use", 0) or 0))
557
+ stations = int(float(row.get("Number of Stations Used", 0) or 0))
558
+ predictions = int(float(row.get("Number of Concurrent Station Tasks", 0) or 0))
559
+ timechunks = int(float(row.get("Concurrent Timechunks Used", 0) or 0))
560
+ vram = row.get("Inference Actor Memory Limit (MB)", "")
561
+ vram_mb = float(vram) if vram not in ("", None) else ""
562
+ # Parse GPUs from this specific row (don't overwrite self.selected_gpus)
563
+ row_gpus = _parse_gpus_field(row.get("GPUs Used")) or []
564
+ # Extract model info
565
+ model = row.get("Model Used", "eqcct") # Default to eqcct for legacy rows
566
+ # Pass the row's info directly to _trial_key
567
+ keys.add(self._trial_key(num_cpus=num_cpus, stations=stations, predictions=predictions,
568
+ gpu_memory_limit_mb=vram_mb, timechunks=timechunks, model=model, gpus=row_gpus))
569
+ except Exception:
570
+ continue
571
+ return keys
572
+
573
+ def evaluate_cpu(self):
574
+ """Evaluate system parallelization using CPUs"""
575
+ statement = "Evaluating System Parallelization Capability using CPU"
576
+ self.logger.info(f"{statement}")
577
+
578
+ os.makedirs(self.csv_dir, exist_ok=True)
579
+ os.makedirs(self.output_dir, exist_ok=True)
580
+
581
+ # Create test results csv
582
+ csv_filepath = f"{self.csv_dir}/cpu_test_results.csv"
583
+ prepare_csv(csv_file_path=csv_filepath, logger=self.logger)
584
+ planned_keys = self._load_existing_trial_keys(csv_filepath)
585
+
586
+ self.chunk_time()
587
+ self.dt_task_generator()
588
+
589
+ trial_num = 1
590
+ log_queue = queue.Queue() # Create a queue for log entries
591
+ total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")
592
+
593
+ if self.min_cpu_amount > len(self.cpu_id_list):
594
+ # Cannot proceed: the minimum CPU amount requested exceeds the number of CPU IDs provided,
595
+ # and the rest of the code depends on len(cpu_id_list) to derive cpu_count
596
+ print(f"CPU ID List provided has less CPUs than the minimum requested ({len(self.cpu_id_list)} vs. {self.min_cpu_amount}). Exiting...")
597
+ quit()
598
+
599
+ with open(self.log_filepath, mode="a+", buffering=1) as log:
600
+ for i in range(self.min_cpu_amount, self.cpu_count+1, self.cpu_test_step_size):
601
+ # Set CPU affinity and initialize Ray
602
+ cpus_to_use = self.cpu_id_list[:i]
603
+ process = psutil.Process(os.getpid())
604
+ process.cpu_affinity(cpus_to_use) # Limit process to the given CPU IDs
605
+
606
+ ray.init(ignore_reinit_error=True, num_cpus=len(cpus_to_use), logging_level=logging.FATAL, log_to_driver=False, _temp_dir=self.home_tmp_dir)
607
+ self.log_queue = Queue() # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
608
+ self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True) # Creates background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
609
+ self._log_thread.start() # Starts the thread
610
+ self.logger.info(f"Ray Successfully Initialized with {len(cpus_to_use)} CPU(s) ({list(cpus_to_use)} CPU Affinity Binding).")
611
+
612
+ timechunks_list = []
613
+ timechunk = 1
614
+ step = self.conc_timechunk_tasks_step_size # Use the class attribute
615
+ while timechunk <= len(self.tasks_picker):
616
+ timechunks_list.append(timechunk)
617
+ if timechunk == 1:
618
+ timechunk += 1
619
+ else:
620
+ timechunk += step
621
+
622
+ if len(self.tasks_picker) not in timechunks_list:
623
+ timechunks_list.append(len(self.tasks_picker))
624
+ # a set stores multiple unique items in a single variable;
625
+ # it is unordered and cannot contain duplicates, which removes repeated timechunk counts
626
+ timechunks_list = sorted(list(set(timechunks_list)))
627
+ # Determine model name for trial key
628
+ if self.model_type == 'seisbench':
629
+ trial_model = f"{self.seisbench_parent_model}/{self.seisbench_child_model}"
630
+ else:
631
+ trial_model = "eqcct"
632
+
633
+ for timechunks in timechunks_list:
634
+ tested_concurrency = set() # Reset for each cpu / timechunk configuration
635
+ for num_stations in self.stations2use_list:
636
+ # Use a 20% step size for concurrency testing as requested
637
+ # This tests 20%, 40%, 60%, 80%, and 100% of the current total stations
638
+ step = max(1, int(num_stations * 0.2))
639
+ concurrent_predictions_list = sorted(list(set(range(step, num_stations + 1, step))))
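+ # Example: num_stations=10 gives step=2 and concurrency values [2, 4, 6, 8, 10];
+ # num_stations=7 gives step=1 and [1, 2, 3, 4, 5, 6, 7].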
640
+
641
+ # Efficiency optimization: only test concurrency values we haven't seen yet
642
+ # for this CPU/Timechunk combo to save compute time.
643
+ new_concurrent_values = [x for x in concurrent_predictions_list if x not in tested_concurrency]
644
+ if not new_concurrent_values:
645
+ continue # All concurrency values already tested
646
+ for num_concurrent_predictions in new_concurrent_values:
647
+ tested_concurrency.add(num_concurrent_predictions)
648
+ key = self._trial_key(
649
+ num_cpus=len(cpus_to_use),
650
+ stations=num_stations,
651
+ predictions=num_concurrent_predictions,
652
+ gpu_memory_limit_mb="", # CPU eval has no per-task VRAM cap
653
+ timechunks=timechunks,
654
+ model=trial_model
655
+ )
656
+ if key in planned_keys:
657
+ self.logger.info(f"[SKIP] Already tested: {key}")
658
+ continue
659
+ planned_keys.add(key)
660
+
661
+ mseed_timechunk_dir_name = self.tasks_picker[timechunks-1][1]
662
+ timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
663
+ max_pending_tasks = timechunks
664
+
665
+ self.logger.info("")
666
+ self.logger.info(f"------- Trial Number: {trial_num} -------")
667
+ self.logger.info(f"CPU(s): {len(cpus_to_use)}")
668
+ self.logger.info(f"Conc. Timechunks Being Analyzed: {timechunks} / Total Timechunks to be Analyzed: {len(self.tasks_picker)}")
669
+ self.logger.info(f"Total Amount of Stations to be Processed in Current Trial: {num_stations} / Number of Stations Being Processed Concurrently: {num_concurrent_predictions} / Total Overall Trial Station Count: {max(self.stations2use_list)}")
670
+
671
+ # Concurrent Timechunks
672
+ tasks_queue = []
673
+ log_queue = queue.Queue() # Create a queue for log entries
674
+
675
+
676
+ # ===== RAM Baseline (before launching worker) =====
677
+ _rss = process.memory_info().rss
678
+ for _ch in process.children(recursive=True):
679
+ try:
680
+ _rss += _ch.memory_info().rss
681
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
682
+ pass
683
+ mem_before_total_mb = _rss / 1e6
684
+
685
+ # peak before (platform-aware)
686
+ if resource is not None: # Linux/macOS
687
+ _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
688
+ if sys.platform.startswith("linux"):
689
+ peak_before_mb = _ru / 1024.0 # ru_maxrss in KB on Linux
690
+ elif sys.platform == "darwin":
691
+ peak_before_mb = _ru / (1024.0 * 1024.0) # ru_maxrss in bytes on macOS
692
+ else:
693
+ peak_before_mb = mem_before_total_mb # safe fallback
694
+ else: # Windows: no 'resource'
695
+ try:
696
+ peak_before_mb = process.memory_full_info().peak_wset / 1e6
697
+ except Exception:
698
+ peak_before_mb = mem_before_total_mb
699
+
700
+ try:
701
+ while True:
702
+ if len(tasks_queue) < max_pending_tasks:
703
+ tasks_queue.append(mseed_predictor.options(num_gpus=0, num_cpus=1).remote(input_dir=timechunk_dir_path, output_dir=self.output_dir, log_queue=self.log_queue,
704
+ P_threshold=self.P_threshold, S_threshold=self.S_threshold, p_model=self.p_model_filepath, s_model=self.s_model_filepath,
705
+ number_of_concurrent_station_predictions=num_concurrent_predictions, ray_cpus=cpus_to_use, use_gpu=self.use_gpu,
706
+ gpu_id=self.selected_gpus, gpu_memory_limit_mb=self.vram_mb, stations2use=num_stations,
707
+ timechunk_id=mseed_timechunk_dir_name, waveform_overlap=self.waveform_overlap, total_timechunks=len(self.tasks_picker),
708
+ number_of_concurrent_timechunk_predictions=max_pending_tasks, total_analysis_time=total_analysis_time, testing_gpu=False,
709
+ test_csv_filepath=csv_filepath, intra_threads=self.intra_threads, inter_threads=self.inter_threads, timechunk_dt=self.timechunk_dt,
710
+ model_type=self.model_type, seisbench_parent_model=self.seisbench_parent_model,
711
+ seisbench_child_model=self.seisbench_child_model, Detection_threshold=self.Detection_threshold))
712
+
713
+ break
714
+
715
+ else:
716
+ tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
717
+ for finished_task in tasks_finished:
718
+ log_entry = ray.get(finished_task)
719
+ log_queue.put(log_entry) # Add log entry to the queue
720
+
721
+ # After adding all the tasks to queue, process what's left
722
+ while tasks_queue:
723
+ tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
724
+ for finished_task in tasks_finished:
725
+ log_entry = ray.get(finished_task)
726
+ log_queue.put(log_entry) # Add log entry to the queue
727
+
728
+ update_csv(csv_filepath, success=1, error_message="")
729
+ except Exception as e:
730
+ # Failure occured, need to add to log
731
+ error_msg = f"{type(e).__name__}: {str(e)}"
732
+ update_csv(csv_filepath, success=0, error_message=error_msg)
733
+ self.logger.error(f"Trial {trial_num} FAILED: {error_msg}")
734
+
735
+ # Write log entries from the queue to the file
736
+ while not log_queue.empty():
737
+ log_entry = log_queue.get()
+ self.logger.info(f"{log_entry}")
738
+
739
+ remove_output_subdirs(self.output_dir, logger=self.logger)
740
+ trial_num += 1
741
+
742
+ # RAM cleanup
743
+ # ===== AFTER RUN (before cleanup) =====
744
+ _rss = process.memory_info().rss
745
+ for _ch in process.children(recursive=True):
746
+ try:
747
+ _rss += _ch.memory_info().rss
748
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
749
+ pass
750
+ mem_after_run_total_mb = _rss / 1e6
751
+ delta_run_mb = mem_after_run_total_mb - mem_before_total_mb
752
+
753
+ # updated peak (platform-aware)
754
+ if resource is not None:
755
+ _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
756
+ if sys.platform.startswith("linux"):
757
+ peak_after_mb = _ru / 1024.0
758
+ elif sys.platform == "darwin":
759
+ peak_after_mb = _ru / (1024.0 * 1024.0)
760
+ else:
761
+ peak_after_mb = mem_after_run_total_mb
762
+ else:
763
+ try:
764
+ peak_after_mb = process.memory_full_info().peak_wset / 1e6
765
+ except Exception:
766
+ peak_after_mb = mem_after_run_total_mb
767
+
768
+ self.logger.info("")
769
+ self.logger.info(
770
+ f"[MEM] Baseline: {mem_before_total_mb:.2f} MB | After run: {mem_after_run_total_mb:.2f} MB "
771
+ f"| Δrun: {delta_run_mb:.2f} MB | Peak≈{max(peak_before_mb, peak_after_mb):.2f} MB"
772
+ )
773
+
774
+ # ===== CLEANUP =====
775
+ # drop strong refs so GC matters
776
+ try: del ref
777
+ except NameError: pass
778
+ try: del log_entry
779
+ except NameError: pass
780
+
781
+ _rss = process.memory_info().rss
782
+ for _ch in process.children(recursive=True):
783
+ try:
784
+ _rss += _ch.memory_info().rss
785
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
786
+ pass
787
+ mem_before_clean_mb = _rss / 1e6
788
+
789
+ gc.collect()
790
+ time.sleep(0.1)
791
+
792
+ _rss = process.memory_info().rss
793
+ for _ch in process.children(recursive=True):
794
+ try:
795
+ _rss += _ch.memory_info().rss
796
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
797
+ pass
798
+ mem_after_clean_mb = _rss / 1e6
799
+
800
+ freed_mb = mem_before_clean_mb - mem_after_clean_mb
801
+ self.logger.info(f"[MEM] Freed ~{max(freed_mb, 0):.2f} MB; Post-clean total: {mem_after_clean_mb:.2f} MB") # To-Do: Fix the Freed so its beeter (for cpu and gpu)
802
+ self.logger.info("")
803
+
804
+ # tested_concurrency.update([x for x in concurrent_predictions_list if x <= num_stations])
805
+ tested_concurrency.update(new_concurrent_values)
806
+
807
+ # stop log forwarder
808
+ self.log_queue.put(None) # remember, log_queue is a Ray Queue actor, and will only exist while Ray is still active (cannot be after the .shutdown())
809
+ self._log_thread.join(timeout=2)
810
+
811
+ ray.shutdown() # Shutdown Ray after processing all timechunks for this CPU count
812
+ self.logger.info(f"Ray Successfully Shutdown.")
813
+
814
+
815
+ self.logger.info(f"Testing complete.")
816
+ self.logger.info(f"")
817
+ self.logger.info(f"Finding Optimal Configurations...")
818
+ # Compute optimal configurations (CPU)
819
+ df = pd.read_csv(csv_filepath)
820
+ optimal_configuration_df, best_overall_usecase_df = find_optimal_configurations_cpu(df)
821
+ optimal_configuration_df.to_csv(f"{self.csv_dir}/optimal_configurations_cpu.csv", index=False)
822
+ best_overall_usecase_df.to_csv(f"{self.csv_dir}/best_overall_usecase_cpu.csv", index=False)
823
+ self.logger.info(f"Optimal Configurations Found. Findings saved to:")
824
+ self.logger.info(f" 1) Optimal CPU/Station/Concurrent Prediction Configurations: {self.csv_dir}/optimal_configurations_cpu.csv")
825
+ self.logger.info(f" 2) Best Overall Usecase Configuration: {self.csv_dir}/best_overall_usecase_cpu.csv")
826
+
827
+ def evaluate_gpu(self):
828
+ """Evaluate system parallelization using GPUs"""
829
+ statement = "Evaluating System Parallelization Capability using GPUs"
830
+ self.logger.info(f"{statement}")
831
+
832
+ os.makedirs(self.csv_dir, exist_ok=True)
833
+ os.makedirs(self.output_dir, exist_ok=True)
834
+
835
+ # Create test results csv
836
+ csv_filepath = f"{self.csv_dir}/gpu_test_results.csv"
837
+ prepare_csv(csv_file_path=csv_filepath, logger=self.logger)
838
+
839
+ # Normalize existing CSV to ensure consistent "GPUs Used" formatting and quoting
840
+ if os.path.exists(csv_filepath):
841
+ from .tools import normalize_gpu_csv_quoting
842
+ normalize_gpu_csv_quoting(csv_filepath)
843
+ self.logger.info("Normalized existing CSV entries for consistent 'GPUs Used' formatting.")
844
+
845
+ planned_keys = self._load_existing_trial_keys(csv_filepath)
846
+
847
+ # Log summary of existing trials
848
+ if planned_keys:
849
+ self.logger.info(f"Loaded {len(planned_keys)} existing trial(s) from CSV. These will be skipped.")
850
+ # Count trials by GPU configuration
851
+ gpu_counts = {}
852
+ for key in planned_keys:
853
+ # Extract GPU info from key (format: cpus=X|gpus=(...)|...)
854
+ if "gpus=" in key:
855
+ gpu_part = key.split("gpus=")[1].split("|")[0]
856
+ gpu_counts[gpu_part] = gpu_counts.get(gpu_part, 0) + 1
857
+ if gpu_counts:
858
+ self.logger.info("Existing trials by GPU configuration:")
859
+ for gpu_config, count in sorted(gpu_counts.items()):
860
+ self.logger.info(f" {gpu_config}: {count} trial(s)")
861
+ else:
862
+ self.logger.info("No existing trials found in CSV. Starting fresh evaluation.")
863
+
864
+ # Calculate these at the start
865
+ self.chunk_time()
866
+ self.dt_task_generator()
867
+
868
+ trial_num = 1
869
+ log_queue = queue.Queue() # Create a queue for log entries
870
+ total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")
871
+
872
+ # Track statistics
873
+ trials_skipped = 0
874
+ trials_run = 0
875
+
876
+ if self.min_cpu_amount > len(self.cpu_id_list):
877
+ # Cannot proceed: the minimum CPU amount requested exceeds the number of CPU IDs provided,
878
+ # and the rest of the code depends on len(cpu_id_list) to derive cpu_count
879
+ print(f"CPU ID List provided has less CPUs than the minimum requested ({len(self.cpu_id_list)} vs. {self.min_cpu_amount}). Exiting...")
880
+ quit()
881
+
882
+ for gpu in range(len(self.selected_gpus)):
883
+ for cpu in range(self.min_cpu_amount, len(self.cpu_id_list)+1, self.cpu_test_step_size):
884
+ # Set CPU affinity and initialize Ray
885
+ cpus_to_use = self.cpu_id_list[:cpu] # 'cpu' is a count (e.g., 1, 2, 3); slicing [:cpu] takes that many CPU IDs. Because slicing is end-exclusive, [:20] yields the IDs at indices 0-19, i.e. exactly 20 CPUs.
886
+ gpus_to_use = self.selected_gpus[:gpu+1] # 'gpu' is an index (0, 1, 2). We need +1 to include the current GPU.
887
+ # Set CPU affinity
888
+ process = psutil.Process(os.getpid())
889
+ process.cpu_affinity(cpus_to_use) # Limit process to the given CPU IDs
890
+
891
+ # VRAM budget per GPU (MB). If vram_mb is provided, treat it as an explicit per-GPU budget override.
892
+ free_vram_mb = float(self.vram_mb) if self.vram_mb else self.calculate_vram()
893
+ self.logger.info("")
894
+ self.logger.info("=" * 80)
895
+ self.logger.info(f"Testing Using {len(gpus_to_use)} GPU(s) with IDs {gpus_to_use} and {len(cpus_to_use)} CPU(s)")
896
+ self.logger.info("=" * 80)
897
+
898
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
899
+ os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpus_to_use))
900
+ # Initialize Ray with GPUs
901
+ ray.init(ignore_reinit_error=True, num_gpus=len(gpus_to_use), num_cpus=len(cpus_to_use),
902
+ logging_level=logging.FATAL, log_to_driver=False, _temp_dir=self.home_tmp_dir)
903
+ self.log_queue = Queue() # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
904
+ self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True) # Creates background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
905
+ self._log_thread.start() # Starts the thread
906
+ self.logger.info(f"Ray Successfully Initialized with {len(gpus_to_use)} GPU(s) and {len(cpus_to_use)} CPU(s) ({list(cpus_to_use)} CPU Affinity Binding).")
907
+ self.logger.info(f"Trials will evalute GPU(s) performance against Iterative Total Station Tasks ({self.stations2use_list}) with Varying Concurrent Predictions.")
908
+ self.logger.info("")
909
+
910
+ # Efficiency optimization: track tested configurations for this GPU/CPU combo
911
+ tested_gpu_configs = set()
912
+
913
+ # Determine model name for trial key
914
+ if self.model_type == 'seisbench':
915
+ trial_model = f"{self.seisbench_parent_model}/{self.seisbench_child_model}"
916
+ else:
917
+ trial_model = "eqcct"
918
+
919
+ for stations in self.stations2use_list:
920
+ # Use a 20% step size for concurrency testing as requested
921
+ step = max(1, int(stations * 0.2))
922
+ concurrent_predictions_list = sorted(list(set(range(step, stations + 1, step))))
923
+
924
+ self.logger.info(f"Evaluating GPU(s) against {stations} TOTAL STATION(s) with 20% STEP CONCURRENT STATION PREDICTIONS: {concurrent_predictions_list}")
925
+ for predictions in concurrent_predictions_list:
926
+ vram_per_task_mb = free_vram_mb / predictions
927
+ step_size = vram_per_task_mb * 0.2
928
+ vram_steps = np.arange(step_size, vram_per_task_mb + step_size, step_size)
929
+
930
+ # Determine minimum VRAM filter based on model type
931
+ if self.model_type == 'seisbench':
932
+ from .parallelization import get_seisbench_model_vram_mb
933
+ min_vram = get_seisbench_model_vram_mb(
934
+ self.seisbench_parent_model,
935
+ self.seisbench_child_model,
936
+ default_mb=2000.0
937
+ )
938
+ else:
939
+ min_vram = 3000.0 # EQCCT minimum
940
+
941
+ vram_steps = vram_steps[vram_steps >= min_vram]
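+ # Illustrative example: with free_vram_mb=20000 and predictions=4, vram_per_task_mb=5000 and
+ # step_size=1000, so np.arange yields [1000, 2000, 3000, 4000, 5000]; filtering by the EQCCT
+ # minimum of 3000 MB leaves [3000, 4000, 5000] as the per-actor VRAM limits to test.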
942
+
943
+ # We are defining the hard upper limit on how much VRAM (GPU memory) TensorFlow is allowed to use inside each ModelActor process
944
+ # This includes 1) Loading the model weights into GPU memory during init and 2) Usage of the ModelActor (predictions, etc.)
945
+ # If the actor tries to allocate more memory than available, then an OOME (out-of-memory error) will occur inside that actor only
946
+ # Good for testing OOME prevention while not letting actors steal available memory from other actors
947
+ # In short: it is purely the VRAM ceiling for each shared inference actor that handles the actual model predictions
948
+ for gpu_memory_limit_mb in vram_steps:
949
+ gpu_memory_limit_mb = int(round(float(gpu_memory_limit_mb)))
950
+
951
+ # Efficiency check: avoid redundant tests for same (concurrency, vram)
952
+ config_key = (predictions, gpu_memory_limit_mb)
953
+ if config_key in tested_gpu_configs:
954
+ continue
955
+ tested_gpu_configs.add(config_key)
956
+
957
+ key = self._trial_key(
958
+ num_cpus=len(cpus_to_use),
959
+ stations=stations,
960
+ predictions=predictions,
961
+ gpu_memory_limit_mb=gpu_memory_limit_mb,
962
+ timechunks=1, # the GPU eval explicitly tests one timechunk at a time
963
+ model=trial_model,
964
+ gpus=gpus_to_use, # Pass the actual GPUs being used in this iteration
965
+ )
966
+ if key in planned_keys:
967
+ self.logger.info(f"[SKIP] Already tested: {key}")
968
+ trials_skipped += 1
969
+ continue
970
+ planned_keys.add(key)
971
+ trials_run += 1
972
+ self.logger.info("")
973
+ self.logger.info(f"------- Trial Number: {trial_num} -------")
974
+ # self.logger.info(f"VRAM Limited to {gpu_memory_limit_mb:.2f} MB per Parallel Task")
975
+
976
+ # Get the first timechunk for testing
977
+ mseed_timechunk_dir_name = self.tasks_picker[0][1]
978
+ timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
979
+
980
+ self.logger.info(f"Stations: {stations}")
981
+ self.logger.info(f"Concurrent Station Predictions: {predictions}")
982
+ self.logger.info(f"VRAM per Parallel Task: {gpu_memory_limit_mb:.2f} MB")
983
+ self.logger.info("")
984
+
985
+
986
+ # ===== Baseline RAM consumption (before launching worker) =====
987
+ _rss = process.memory_info().rss
988
+ for _ch in process.children(recursive=True):
989
+ try:
990
+ _rss += _ch.memory_info().rss
991
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
992
+ pass
993
+ mem_before_total_mb = _rss / 1e6
994
+
995
+ # peak before (platform-aware)
996
+ if resource is not None: # Linux/macOS
997
+ _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
998
+ if sys.platform.startswith("linux"):
999
+ peak_before_mb = _ru / 1024.0 # ru_maxrss in KB on Linux
1000
+ elif sys.platform == "darwin":
1001
+ peak_before_mb = _ru / (1024.0 * 1024.0) # ru_maxrss in bytes on macOS
1002
+ else:
1003
+ peak_before_mb = mem_before_total_mb # safe fallback
1004
+ else: # Windows: no 'resource'
1005
+ try:
1006
+ peak_before_mb = process.memory_full_info().peak_wset / 1e6
1007
+ except Exception:
1008
+ peak_before_mb = mem_before_total_mb
1009
+
1010
+ try: # To Do: Add Concurrent Timechunks Testing for GPU/CPU too, reference eqcctpro_parallelization()
1011
+ # Call mseed_predictor directly via Ray (just like evaluate_cpu does)
1012
+ ref = mseed_predictor.options(num_gpus=0, num_cpus=1).remote(
1013
+ input_dir=timechunk_dir_path,
1014
+ output_dir=self.output_dir,
1015
+ log_queue=self.log_queue,
1016
+ P_threshold=self.P_threshold,
1017
+ S_threshold=self.S_threshold,
1018
+ p_model=self.p_model_filepath,
1019
+ s_model=self.s_model_filepath,
1020
+ number_of_concurrent_station_predictions=predictions,
1021
+ ray_cpus=cpus_to_use,
1022
+ use_gpu=self.use_gpu,
1023
+ gpu_id=gpus_to_use,
1024
+ gpu_memory_limit_mb=gpu_memory_limit_mb,
1025
+ stations2use=stations,
1026
+ timechunk_id=mseed_timechunk_dir_name,
1027
+ waveform_overlap=self.waveform_overlap,
1028
+ total_timechunks=len(self.tasks_picker),
1029
+ number_of_concurrent_timechunk_predictions=1, # Testing one timechunk at a time
1030
+ total_analysis_time=total_analysis_time,
1031
+ testing_gpu=True, # Enable test mode
1032
+ test_csv_filepath=csv_filepath,
1033
+ intra_threads=self.intra_threads,
1034
+ inter_threads=self.inter_threads,
1035
+ timechunk_dt=self.timechunk_dt,
1036
+ model_type=self.model_type, seisbench_parent_model=self.seisbench_parent_model,
1037
+ seisbench_child_model=self.seisbench_child_model, Detection_threshold=self.Detection_threshold
1038
+ )
1039
+
1040
+ # Wait for result
1041
+ log_entry = ray.get(ref)
1042
+ log_queue.put(log_entry) # Add log entry to the queue
1043
+
1044
+ # Success - update CSV
1045
+ update_csv(csv_filepath, success=1, error_message="")
1046
+
1047
+ except Exception as e:
1048
+ # Failure occurred, need to add to log
1049
+ error_msg = f"{type(e).__name__}: {str(e)}"
1050
+ update_csv(csv_filepath, success=0, error_message=error_msg)
1051
+ self.logger.info(f"Trial {trial_num} FAILED: {error_msg}")
1052
+
1053
+ # Write log entries from the queue to the file
1054
+ while not log_queue.empty():
1055
+ log_entry = log_queue.get()
1056
+ self.logger.info(f"{log_entry}") # FIX ME
1057
+
1058
+ remove_output_subdirs(self.output_dir, logger=self.logger)
1059
+ trial_num += 1
1060
+
1061
+ # RAM cleanup
1062
+ # ===== AFTER RUN (before cleanup) =====
1063
+ _rss = process.memory_info().rss
1064
+ for _ch in process.children(recursive=True):
1065
+ try:
1066
+ _rss += _ch.memory_info().rss
1067
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
1068
+ pass
1069
+ mem_after_run_total_mb = _rss / 1e6
1070
+ delta_run_mb = mem_after_run_total_mb - mem_before_total_mb
1071
+
1072
+ # updated peak (platform-aware)
1073
+ if resource is not None:
1074
+ _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
1075
+ if sys.platform.startswith("linux"):
1076
+ peak_after_mb = _ru / 1024.0
1077
+ elif sys.platform == "darwin":
1078
+ peak_after_mb = _ru / (1024.0 * 1024.0)
1079
+ else:
1080
+ peak_after_mb = mem_after_run_total_mb
1081
+ else:
1082
+ try:
1083
+ peak_after_mb = process.memory_full_info().peak_wset / 1e6
1084
+ except Exception:
1085
+ peak_after_mb = mem_after_run_total_mb
1086
+
1087
+ self.logger.info(
1088
+ f"[MEM] Baseline: {mem_before_total_mb:.2f} MB | After run: {mem_after_run_total_mb:.2f} MB "
1089
+ f"| Δrun: {delta_run_mb:.2f} MB | Peak≈{max(peak_before_mb, peak_after_mb):.2f} MB"
1090
+ )
1091
+
1092
+ # ===== CLEANUP =====
1093
+ # drop strong refs so GC matters
1094
+ try: del ref
1095
+ except NameError: pass
1096
+ try: del log_entry
1097
+ except NameError: pass
1098
+
1099
+ _rss = process.memory_info().rss
1100
+ for _ch in process.children(recursive=True):
1101
+ try:
1102
+ _rss += _ch.memory_info().rss
1103
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
1104
+ pass
1105
+ mem_before_clean_mb = _rss / 1e6
1106
+
1107
+ gc.collect()
1108
+ time.sleep(0.1)
1109
+
1110
+ _rss = process.memory_info().rss
1111
+ for _ch in process.children(recursive=True):
1112
+ try:
1113
+ _rss += _ch.memory_info().rss
1114
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
1115
+ pass
1116
+ mem_after_clean_mb = _rss / 1e6
1117
+
1118
+ freed_mb = mem_before_clean_mb - mem_after_clean_mb
1119
+ self.logger.info(f"[MEM] Freed ~{max(freed_mb, 0):.2f} MB; Post-clean total: {mem_after_clean_mb:.2f} MB\n")
1120
+ self.logger.info("")
1121
+
1122
+ # stop log forwarder
1123
+ self.log_queue.put(None) # remember, log_queue is a Ray Queue actor, and will only exist while Ray is still active (cannot be after the .shutdown())
1124
+ self._log_thread.join(timeout=2)
1125
+
1126
+ ray.shutdown() # Shutdown Ray after all testing
1127
+ self.logger.info(f"Ray Successfully Shutdown.")
1128
+
1129
+ self.logger.info(f"Testing complete.")
1130
+ self.logger.info(f"")
1131
+ self.logger.info(f"Trial Summary: {trials_run} new trial(s) executed, {trials_skipped} trial(s) skipped (already in CSV)")
1132
+ self.logger.info(f"Finding Optimal Configurations...")
1133
+ self.logger.info(f"Recalculating optimal configurations from all trial data (including new trials)...")
1134
+ # Compute optimal configurations (GPU)
1135
+ df = pd.read_csv(csv_filepath)
1136
+ optimal_configuration_df, best_overall_usecase_df = find_optimal_configurations_gpu(df)
1137
+ # Overwrite existing files with recalculated optimal configurations
1138
+ optimal_configuration_df.to_csv(f"{self.csv_dir}/optimal_configurations_gpu.csv", index=False)
1139
+ best_overall_usecase_df.to_csv(f"{self.csv_dir}/best_overall_usecase_gpu.csv", index=False)
1140
+ self.logger.info(f"Optimal Configurations Found. Findings saved to:")
1141
+ self.logger.info(f" 1) Optimal GPU/Station/Concurrent Prediction Configurations: {self.csv_dir}/optimal_configurations_gpu.csv")
1142
+ self.logger.info(f" 2) Best Overall Usecase Configuration: {self.csv_dir}/best_overall_usecase_gpu.csv")
1143
+
1144
+ def evaluate(self):
1145
+ if self.eval_mode == "cpu":
1146
+ self.evaluate_cpu()
1147
+ elif self.eval_mode == "gpu":
1148
+ self.evaluate_gpu()
1149
+ else:
1150
+ exit()
1151
+
1152
+ def calculate_vram(self):
1153
+ cap = float(self.gpu_vram_safety_cap)
1154
+ if not (0.0 < cap <= 0.99):
1155
+ raise ValueError(f"gpu_vram_safety_cap must be in (0, 0.99], got {cap}.")
1156
+
1157
+ gpus = self.selected_gpus if self.selected_gpus else list_gpu_ids()
1158
+ if not gpus:
1159
+ raise RuntimeError("No GPUs detected for VRAM calculation.")
1160
+
1161
+ per_gpu_budget_mb = []
1162
+ for gid in gpus:
1163
+ total_gb, free_gb = get_gpu_vram(gpu_index=gid)
1164
+ total_mb = float(total_gb) * 1024.0
1165
+ free_mb = float(free_gb) * 1024.0
1166
+
1167
+ # hard cap vs physical total, plus never exceed currently free memory
1168
+ budget_mb = min(total_mb * cap, free_mb)
1169
+ per_gpu_budget_mb.append(budget_mb)
1170
+
1171
+ self.logger.info(
1172
+ f"GPU {gid}: total={total_gb:.2f} GB, free={free_gb:.2f} GB, "
1173
+ f"budget={budget_mb/1024.0:.2f} GB (cap={cap:.2f})"
1174
+ )
1175
+
1176
+ budget_mb_min = float(min(per_gpu_budget_mb))
1177
+ self.logger.info(f"Using per-GPU VRAM budget = {budget_mb_min:.0f} MB (min across selected GPUs).")
1178
+ return budget_mb_min
1179
+
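To make the budget arithmetic in calculate_vram concrete, a worked example with assumed numbers for two GPUs and cap = 0.85:

    cap = 0.85
    gpus_gb = [(24.0, 23.0), (24.0, 10.0)]      # assumed (total_gb, free_gb) for two GPUs
    budgets_mb = [min(total * 1024.0 * cap, free * 1024.0) for total, free in gpus_gb]
    # -> [20889.6, 10240.0]: the cap binds on the mostly-free GPU, free memory binds on the busy one
    per_gpu_budget_mb = min(budgets_mb)          # 10240.0 MB; the most constrained GPU sets the budget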
1180
+
1181
+ """
1182
+ Finds the optimal CPU configuration based on evaluation results
1183
+ """
1184
+ class OptimalCPUConfigurationFinder:
1185
+ def __init__(self,
1186
+ eval_sys_results_dir: str,
1187
+ log_file_path: str):
1188
+
1189
+ self.eval_sys_results_dir = eval_sys_results_dir
1190
+ if not self.eval_sys_results_dir or not os.path.isdir(self.eval_sys_results_dir):
1191
+ raise ValueError(f"Error: The provided directory path '{self.eval_sys_results_dir}' is invalid or does not exist.")
1192
+ self.log_file_path = log_file_path
1193
+
1194
+ # Set up the main 'eqcctpro' logger with file and console handlers
1195
+ self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
1196
+ self.logger.setLevel(logging.INFO)
1197
+ self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
1198
+ if not self.logger.handlers: # avoid duplicating inits
1199
+ fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
1200
+ # ensure parent dir
1201
+ Path(self.log_file_path).parent.mkdir(parents=True, exist_ok=True)
1202
+ file_h = logging.FileHandler(self.log_file_path) # Writes logs to file
1203
+ stream_h = logging.StreamHandler() # Sends logs to console
1204
+ file_h.setFormatter(fmt)
1205
+ stream_h.setFormatter(fmt)
1206
+ self.logger.addHandler(file_h)
1207
+ self.logger.addHandler(stream_h)
1208
+
1209
+
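The handler setup in __init__ above is repeated verbatim in the GPU finder further below (and resembles the setup in RunEQCCTPro). One way such repetition could be factored out, sketched with a hypothetical helper name:

    import logging
    from pathlib import Path

    def build_eqcctpro_logger(log_file_path: str) -> logging.Logger:
        """Return the shared 'eqcctpro' logger with file and console handlers attached exactly once."""
        logger = logging.getLogger("eqcctpro")
        logger.setLevel(logging.INFO)
        logger.propagate = False
        if not logger.handlers:  # guard against duplicate handlers on repeated construction
            fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            Path(log_file_path).parent.mkdir(parents=True, exist_ok=True)
            for handler in (logging.FileHandler(log_file_path), logging.StreamHandler()):
                handler.setFormatter(fmt)
                logger.addHandler(handler)
        return logger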
1210
+ def find_best_overall_usecase(self):
1211
+ """Finds the best overall CPU usecase configuation from eval results"""
1212
+ file_path = f"{self.eval_sys_results_dir}/best_overall_usecase_cpu.csv"
1213
+ if not os.path.exists(file_path):
1214
+ raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
1215
+
1216
+ df_best_overall = pd.read_csv(file_path)
1217
+ # best_config_dict = df_best_overall.set_index(df_best_overall.columns[0]).to_dict()[df_best_overall.columns[1]]
1218
+ best_config_dict = df_best_overall.to_dict(orient='records')[0]
1219
+
1220
+ # Extract required values
1221
+ num_cpus = best_config_dict.get("Number of CPUs Allocated for Ray to Use")
1222
+ waveform_timespace = best_config_dict.get("Total Waveform Analysis Timespace (min)")
1223
+ total_num_timechunks = best_config_dict.get("Total Number of Timechunks")
1224
+ num_concurrent_timechunks = best_config_dict.get("Concurrent Timechunks Used")
1225
+ length_of_timechunks = best_config_dict.get("Length of Timechunk (min)")
1226
+ num_concurrent_stations = best_config_dict.get("Number of Concurrent Station Tasks per Timechunk")
1227
+ intra_threads = best_config_dict.get("Intra-parallelism Threads")
1228
+ inter_threads = best_config_dict.get("Inter-parallelism Threads")
1229
+ num_stations = best_config_dict.get("Number of Stations Used")
1230
+ total_runtime = best_config_dict.get("Total Run time for Picker (s)")
1231
+ model_used = best_config_dict.get("Model Used")
1232
+
1233
+ self.logger.info("")
1234
+ self.logger.info(f"------- Finding the Best Overall CPU Usecase Configuration Based on Available Trial Data in {self.eval_sys_results_dir} -------")
1235
+ self.logger.info(f"Model Used: {model_used}")
1236
+ self.logger.info(f"CPU(s): {num_cpus}")
1237
+ self.logger.info(f"Intra-parallelism Threads: {intra_threads}")
1238
+ self.logger.info(f"Inter-parallelism Threads: {inter_threads}")
1239
+ self.logger.info(f"Waveform Timespace: {waveform_timespace}")
1240
+ self.logger.info(f"Total Number of Stations Used: {num_stations}")
1241
+ self.logger.info(f"Total Number of Timechunks: {total_num_timechunks}")
1242
+ self.logger.info(f"Length of Timechunks (min): {length_of_timechunks}")
1243
+ self.logger.info(f"Concurrent Timechunk Processes: {num_concurrent_timechunks}")
1244
+ self.logger.info(f"Concurrent Station Processes Per Timechunk: {num_concurrent_stations}")
1245
+ self.logger.info(f"Total Runtime (s): {total_runtime}")
1246
+ self.logger.info("")
1247
+
1248
+ # return int(float(num_cpus)), int(float(intra_threads)), int(float(inter_threads)), int(float(num_concurrent_timechunks)), int(float(num_concurrent_stations)), int(float(num_stations))
1249
+
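A hedged usage sketch for the method above, assuming the class is importable from eqcctpro.functionality (it may also be re-exported at the package root) and that best_overall_usecase_cpu.csv already exists in the results directory; the paths are placeholders:

    from eqcctpro.functionality import OptimalCPUConfigurationFinder

    finder = OptimalCPUConfigurationFinder(
        eval_sys_results_dir="/path/to/eval_results",   # placeholder: directory holding the result CSVs
        log_file_path="/path/to/logs/eqcctpro.log",     # placeholder: where to write the log
    )
    finder.find_best_overall_usecase()   # logs the fastest overall CPU trial configuration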
1250
+ def find_optimal_for(self, cpu: int, station_count: int):
1251
+ """Finds the optimal configuration for a given number of CPUs and stations."""
1252
+ if cpu is None or station_count is None:
1253
+ raise ValueError("Error: CPU and station_count must have valid values.")
1254
+
1255
+ file_path = f"{self.eval_sys_results_dir}/optimal_configurations_cpu.csv"
1256
+ if not os.path.exists(file_path):
1257
+ raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
1258
+
1259
+ df_optimal = pd.read_csv(file_path)
1260
+
1261
+ # Convert relevant columns to numeric
1262
+ df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
1263
+ df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
1264
+ df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
1265
+ df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
1266
+
1267
+ filtered_df = df_optimal[
1268
+ (df_optimal["Number of CPUs Allocated for Ray to Use"] == cpu) &
1269
+ (df_optimal["Number of Stations Used"] == station_count)]
1270
+
1271
+ if filtered_df.empty:
1272
+ raise ValueError("No matching configuration found. Please enter a valid entry.")
1273
+
1274
+ # Finds for the "Total Run time for Picker (s)" the row with the smallest value and the '1' is to say I only want
1275
+ # only the single row where the smallest runtime is
1276
+ # iloc gets the selection of data from a numerical index from the df and turns that access point into a Series
1277
+ best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]
1278
+
1279
+ self.logger.info(f"------- Best CPU-EQCCTPro Configuration for Requested Input Parameters Based on the available Trial Data in {self.eval_sys_results_dir} -------")
1280
+ self.logger.info(f"Model Used: {best_config.get('Model Used')}")
1281
+ self.logger.info(f"CPU(s): {cpu}")
1282
+ self.logger.info(f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}")
1283
+ self.logger.info(f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}")
1284
+ self.logger.info(f"Waveform Timespace: {best_config['Total Waveform Analysis Timespace (min)']}")
1285
+ self.logger.info(f"Total Number of Stations Used: {station_count}")
1286
+ self.logger.info(f"Total Number of Timechunks: {best_config['Total Number of Timechunks']}")
1287
+ self.logger.info(f"Length of Timechunks (min): {best_config['Length of Timechunk (min)']}")
1288
+ self.logger.info(f"Concurrent Timechunk Processes: {best_config['Concurrent Timechunks Used']}")
1289
+ self.logger.info(f"Concurrent Station Processes Per Timechunk: {best_config['Number of Concurrent Station Tasks']}")
1290
+ self.logger.info(f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")
1291
+ self.logger.info("")
1292
+
1293
+ # return int(float(cpu)), int(float(best_config["Intra-parallelism Threads"])), int(float(best_config["Inter-parallelism Threads"])), int(float(best_config["Concurrent Timechunks Used"])), int(float(best_config["Number of Concurrent Station Tasks"])), int(float(station_count))
1294
+
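And a matching sketch for find_optimal_for above; the CPU and station counts are assumed values and must match a trial row exactly, otherwise the method raises ValueError:

    from eqcctpro.functionality import OptimalCPUConfigurationFinder

    finder = OptimalCPUConfigurationFinder("/path/to/eval_results", "/path/to/logs/eqcctpro.log")
    finder.find_optimal_for(cpu=8, station_count=20)   # assumed counts present in the trial CSV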
1295
+
1296
+ class OptimalGPUConfigurationFinder:
1297
+ """Finds the optimal GPU configuration based on evaluation system results."""
1298
+
1299
+ def __init__(self,
1300
+ eval_sys_results_dir: str,
1301
+ log_file_path: str):
1302
+
1303
+ self.eval_sys_results_dir = eval_sys_results_dir
1304
+ if not self.eval_sys_results_dir or not os.path.isdir(self.eval_sys_results_dir):
1305
+ raise ValueError(f"Error: The provided directory path '{self.eval_sys_results_dir}' is invalid or does not exist.")
1306
+ self.log_file_path = log_file_path
1307
+
1308
+ # Set up the main 'eqcctpro' logger with file and console handlers
1309
+ self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
1310
+ self.logger.setLevel(logging.INFO)
1311
+ self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
1312
+ if not self.logger.handlers: # avoid duplicating inits
1313
+ fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
1314
+ # ensure parent dir
1315
+ Path(self.log_file_path).parent.mkdir(parents=True, exist_ok=True)
1316
+ file_h = logging.FileHandler(self.log_file_path) # Writes logs to file
1317
+ stream_h = logging.StreamHandler() # Sends logs to console
1318
+ file_h.setFormatter(fmt)
1319
+ stream_h.setFormatter(fmt)
1320
+ self.logger.addHandler(file_h)
1321
+ self.logger.addHandler(stream_h)
1322
+
1323
+ def find_best_overall_usecase(self):
1324
+ """Finds the best overall GPU configuration from evaluation results."""
1325
+ file_path = f"{self.eval_sys_results_dir}/best_overall_usecase_gpu.csv"
1326
+ if not os.path.exists(file_path):
1327
+ raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
1328
+
1329
+ df = pd.read_csv(file_path)
1330
+ if df.empty:
1331
+ raise ValueError(f"[{datetime.now()}] Error: '{file_path}' is empty.")
1332
+
1333
+ row = df.iloc[0] # the single best-usecase row written out by the evaluation step
1334
+
1335
+ # Some codepaths use two different column names for concurrency; support both
1336
+ conc_col = "Number of Concurrent Station Tasks per Timechunk" \
1337
+ if "Number of Concurrent Station Tasks per Timechunk" in df.columns \
1338
+ else "Number of Concurrent Station Tasks"
1339
+
1340
+ # Robust GPU parse: accepts [0], (0,), "0", 0, "", None
1341
+ num_gpus_list = _parse_gpus_field(row.get("GPUs Used"))
1342
+ # Keep as tuple for display/consistency
1343
+ num_gpus = tuple(num_gpus_list)
1344
+
1345
+ # Pull/normalize scalars
1346
+ num_cpus = row.get("Number of CPUs Allocated for Ray to Use")
1347
+ num_concurrent = row.get(conc_col)
1348
+ intra_threads = row.get("Intra-parallelism Threads")
1349
+ inter_threads = row.get("Inter-parallelism Threads")
1350
+ num_stations = row.get("Number of Stations Used")
1351
+ total_runtime = row.get("Total Run time for Picker (s)")
1352
+ vram_used = row.get("Inference Actor Memory Limit (MB)")
1353
+ model_used = row.get("Model Used")
1354
+
1355
+ self.logger.info("")
1356
+ self.logger.info(f"------- Finding the Best Overall GPU Usecase Configuration Based on Available Trial Data in {self.eval_sys_results_dir} -------")
1357
+ self.logger.info("")
1358
+ self.logger.info(f"Model Used: {model_used}")
1359
+ self.logger.info(f"CPU(s): {num_cpus}")
1360
+ self.logger.info(f"GPU ID(s): {num_gpus_list}")
1361
+ self.logger.info(f"Concurrent Predictions: {num_concurrent}")
1362
+ self.logger.info(f"Intra-parallelism Threads: {intra_threads}")
1363
+ self.logger.info(f"Inter-parallelism Threads: {inter_threads}")
1364
+ self.logger.info(f"Stations: {num_stations}")
1365
+ self.logger.info(f"Inference Actor Memory Limit (MB): {vram_used}")
1366
+ self.logger.info(f"Total Runtime (s): {total_runtime}")
1367
+ self.logger.info("")
1368
+ # return int(float(num_cpus)), int(float(num_concurrent_predictions)), int(float(intra_threads)), int(float(inter_threads)), num_gpus, int(float(vram_used)), int(float(num_stations))
1369
+
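The robust-parse step in the method above relies on _parse_gpus_field from .tools, whose implementation is not shown here. A hedged, illustrative stand-in for the kind of normalization the inline comment describes (accepting [0], (0,), "0", 0, "", None); the real helper may differ:

    import ast
    import math

    def parse_gpus_field_sketch(value) -> list:
        """Illustrative stand-in (assumption, not the actual .tools code) for normalizing 'GPUs Used'."""
        if value is None or value == "" or (isinstance(value, float) and math.isnan(value)):
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        if isinstance(value, str):
            parsed = ast.literal_eval(value)       # "[0]" -> [0], "(0,)" -> (0,), "0" -> 0
            return list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]
        return [value]                             # bare scalar such as 0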
1370
+ def find_optimal_for(self, num_cpus: int, gpu_list: list, station_count: int):
1371
+ """Finds the optimal configuration for a given number of CPUs, GPUs, and stations."""
1372
+ if num_cpus is None or station_count is None or gpu_list is None:
1373
+ raise ValueError("Error: num_cpus, station_count, and gpu_list must have valid values.")
1374
+
1375
+ file_path = f"{self.eval_sys_results_dir}/optimal_configurations_gpu.csv"
1376
+ if not os.path.exists(file_path):
1377
+ raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
1378
+
1379
+ df_optimal = pd.read_csv(file_path)
1380
+
1381
+ # Convert relevant columns to numeric, handling NaNs
1382
+ df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
1383
+ df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
1384
+ df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
1385
+ df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
1386
+ df_optimal["Inference Actor Memory Limit (MB)"] = pd.to_numeric(df_optimal["Inference Actor Memory Limit (MB)"], errors="coerce")
1387
+
1388
+ # Convert "GPUs Used" from string representation to list
1389
+ df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
1390
+
1391
+ # Convert GPU lists to tuples for comparison
1392
+ df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: tuple(x) if isinstance(x, list) else (x,))
1393
+
1394
+ # Ensure gpu_list is in tuple format for comparison
1395
+ gpu_list_tuple = tuple(gpu_list) if isinstance(gpu_list, list) else (gpu_list,)
1396
+
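For orientation on the equality filter that follows, the round-trip from the CSV string to a comparable tuple, with an assumed example value:

    import ast

    csv_cell = "[0, 1]"                         # example of how "GPUs Used" may be stored in the CSV
    stored = tuple(ast.literal_eval(csv_cell))  # -> (0, 1)
    requested = tuple([0, 1])                   # caller's gpu_list normalized the same way -> (0, 1)
    assert stored == requested                  # element-wise tuple equality selects the matching rows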
1397
+ filtered_df = df_optimal[
1398
+ (df_optimal["Number of CPUs Allocated for Ray to Use"] == num_cpus) &
1399
+ (df_optimal["GPUs Used"] == gpu_list_tuple) &
1400
+ (df_optimal["Number of Stations Used"] == station_count)
1401
+ ]
1402
+
1403
+ if filtered_df.empty:
1404
+ raise ValueError("No matching configuration found. Please enter a valid entry.")
1405
+
1406
+ best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]
1407
+
1408
+ self.logger.info(f"------- Best GPU-EQCCTPro Configuration for Requested Input Parameters Based on the Available Trial Data in {self.eval_sys_results_dir} -------")
1409
+ self.logger.info(f"CPU(s): {num_cpus}")
1410
+ self.logger.info(f"GPU(s): {gpu_list}")
1411
+ self.logger.info(f"Concurrent Predictions: {best_config['Number of Concurrent Station Tasks']}")
1412
+ self.logger.info(f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}")
1413
+ self.logger.info(f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}")
1414
+ self.logger.info(f"Stations: {station_count}")
1415
+ self.logger.info(f"Inference Actor Memory Limit (MB): {best_config['Inference Actor Memory Limit (MB)']}")
1416
+ self.logger.info(f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")
1417
+
1418
+ # return int(float(best_config["Number of CPUs Allocated for Ray to Use"])), \
1419
+ # int(float(best_config["Number of Concurrent Station Tasks"])), \
1420
+ # int(float(best_config["Intra-parallelism Threads"])), \
1421
+ # int(float(best_config["Inter-parallelism Threads"])), \
1422
+ # gpu_list, \
1423
+ # int(float(best_config["Inference Actor Memory Limit (MB)"])), \
1424
+ # int(float(station_count))
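A closing usage sketch for the GPU finder, mirroring the CPU example earlier; the import path, paths, and counts are assumptions and the chosen CPU/GPU/station combination must exist in the trial data:

    from eqcctpro.functionality import OptimalGPUConfigurationFinder

    finder = OptimalGPUConfigurationFinder(
        eval_sys_results_dir="/path/to/eval_results",   # placeholder path
        log_file_path="/path/to/logs/eqcctpro.log",     # placeholder path
    )
    finder.find_best_overall_usecase()
    finder.find_optimal_for(num_cpus=8, gpu_list=[0], station_count=20)   # assumed values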