eqcctpro 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of eqcctpro might be problematic.

@@ -0,0 +1,1155 @@
1
+ """
2
+ functionality.py controls all the functionality of EQCCTPro, specifically how we access mseed_predictor() and parallel_predict.
3
+ It is a layer of abstraction that keeps the code concise and clean.
4
+ """
5
+ import os
6
+ import gc
7
+ import ray
8
+ import sys
9
+ import ast
10
+ import math
+ import time
11
+ import queue
12
+ import psutil
13
+ import random
14
+ import numbers
15
+ import logging
16
+ try:
+ import resource # POSIX-only (ru_maxrss); set to None on platforms without it
+ except ImportError:
+ resource = None
17
+ import threading
+ import numpy as np
+ import pandas as pd
18
+ from .tools import *
19
+ from pathlib import Path
20
+ from .parallelization import *
21
+ from obspy import UTCDateTime
22
+ from ray.util.queue import Queue
23
+ from datetime import datetime, timedelta
24
+ from .tools import _parse_gpus_field
25
+ from logging.handlers import QueueHandler, QueueListener
26
+
27
+
28
+ class RunEQCCTPro():
29
+ """RunEQCCTPro class for running the RunEQCCTPro functions for multiple instances of the class"""
30
+ def __init__(self, # self is 'this instance' of the class
31
+ use_gpu: bool,
32
+ input_dir: str,
33
+ output_dir: str,
34
+ log_filepath: str,
35
+ p_model_filepath: str,
36
+ s_model_filepath: str,
37
+ number_of_concurrent_station_predictions: int,
38
+ number_of_concurrent_timechunk_predictions: int,
39
+ intra_threads: int = 1,
40
+ inter_threads: int = 1,
41
+ P_threshold: float = 0.001,
42
+ S_threshold: float = 0.02,
43
+ specific_stations: str = None,
44
+ csv_dir: str = None,
45
+ best_usecase_config: bool = None,
46
+ vram_mb: float = None,
47
+ selected_gpus: list = None,
48
+ cpu_id_list: list = [1],
49
+ start_time:str = None,
50
+ end_time:str = None,
51
+ timechunk_dt:int = None,
52
+ waveform_overlap:int = None):
53
+
54
+ self.use_gpu = use_gpu # on 'this instance' of the class, set use_gpu = use_gpu
55
+ self.input_dir = input_dir
56
+ self.output_dir = output_dir
57
+ self.log_filepath = log_filepath
58
+ self.p_model_filepath = p_model_filepath
59
+ self.s_model_filepath = s_model_filepath
60
+ self.number_of_concurrent_station_predictions = number_of_concurrent_station_predictions
61
+ self.number_of_concurrent_timechunk_predictions = number_of_concurrent_timechunk_predictions
62
+ self.intra_threads = intra_threads
63
+ self.inter_threads = inter_threads
64
+ self.P_threshold = P_threshold
65
+ self.S_threshold = S_threshold
66
+ self.specific_stations = specific_stations
67
+ self.csv_dir = csv_dir
68
+ self.best_usecase_config = best_usecase_config
69
+ self.vram_mb = vram_mb
70
+ self.selected_gpus = selected_gpus # a list of the GPU IDs
71
+ self.cpu_id_list = cpu_id_list
72
+ self.cpu_count = len(cpu_id_list)
73
+ self.start_time = start_time
74
+ self.end_time = end_time
75
+ self.timechunk_dt = timechunk_dt
76
+ self.waveform_overlap = waveform_overlap
77
+
78
+ # Ensures that the output_dir exists. If it doesn't, we create it
79
+ os.makedirs(self.output_dir, exist_ok=True)
80
+
81
+ # Set up main logger and logger queue to retrieve queued logs from Raylets to be passed to the main logger
82
+ self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
83
+ self.logger.setLevel(logging.INFO)
84
+ self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
85
+ if not self.logger.handlers: # avoid duplicating inits
86
+ fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
87
+ file_h = logging.FileHandler(self.log_filepath) # Writes logs to file
88
+ stream_h = logging.StreamHandler() # Sends logs to console
89
+ file_h.setFormatter(fmt)
90
+ stream_h.setFormatter(fmt)
91
+ self.logger.addHandler(file_h)
92
+ self.logger.addHandler(stream_h)
93
+
94
+ self.logger.info("")
95
+ self.logger.info(f"------- Welcome to EQCCTPro -------")
96
+ self.logger.info("")
97
+
98
+ # If the user requested GPU use but did not pass a valid VRAM value, we need to exit
99
+ if self.use_gpu and not (isinstance(self.vram_mb, numbers.Real) and math.isfinite(self.vram_mb) and self.vram_mb > 0):
100
+ self.logger.error(f"No numerical VRAM passed. Please provide vram_mb (MB per Raylet per GPU) as a positive real number. Exiting...")
101
+ sys.exit(1)
102
+
103
+ # We need to ensure that the specified VRAM does not exceed the system's capacity; if it does, we exit safely before launching
104
+ if self.use_gpu:
105
+ check_vram_per_gpu_style(
106
+ selected_gpus=self.selected_gpus,
107
+ get_gpu_vram_fn=lambda gid: get_gpu_vram(gpu_index=gid),
108
+ intended_workers=self.number_of_concurrent_station_predictions * self.number_of_concurrent_timechunk_predictions,
109
+ vram_mb=self.vram_mb,
110
+ model_vram_mb=1500.0, # your safety reserve for EQCCT
111
+ safety_cap=0.95,
112
+ eqcct_overhead_gb=0.0,
113
+ logger=self.logger)
114
+
115
+ # To-Do: merge dt_task_generator and chunk_time into one function and combine the objects so we don't have so much state floating around
116
+ # Generates the dt tasks list
117
+ def dt_task_generator(self):
118
+ # Modifies the times_list values (see chunk_time()) so they are in a format mseed_predictor can use
119
+ tasks = [[f"({i+1}/{len(self.times_list)})", f"{self.times_list[i][0].strftime(format='%Y%m%dT%H%M%SZ')}_{self.times_list[i][1].strftime(format='%Y%m%dT%H%M%SZ')}"] for i in range((len(self.times_list)))]
120
+ self.tasks_picker = tasks
121
+
122
+ def chunk_time(self):
123
+ # Creates the timechunks, i.e. from time X to time Y, to generate the dt tasks (timechunk tasks that are run in parallel first at the top level)
124
+ # EX. [[UTCDateTime(2024, 12, 15, 11, 58), UTCDateTime(2024, 12, 15, 13, 0)], [UTCDateTime(2024, 12, 15, 12, 58), UTCDateTime(2024, 12, 15, 14, 0)]]
125
+ starttime = UTCDateTime(self.start_time) - (self.waveform_overlap * 60)
126
+ endtime = UTCDateTime(self.end_time)
127
+
128
+ times_list = []
129
+ start = starttime
130
+ end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
131
+ while start <= endtime:
132
+ if end >= endtime:
133
+ end = endtime
134
+ times_list.append([start, end])
135
+ break
136
+ times_list.append([start, end])
137
+ start = end - (self.waveform_overlap * 60)
138
+ end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
139
+
140
+ self.times_list = times_list
141
+
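To make the chunking arithmetic in chunk_time() above concrete, here is a small standalone sketch of the same start/end/overlap logic using only datetime (the helper name and the input values are illustrative; the class itself uses obspy's UTCDateTime):

    from datetime import datetime, timedelta

    def chunk_time_sketch(start_time, end_time, timechunk_dt, waveform_overlap):
        # Mirror of chunk_time(): pad the start by the overlap, then step forward
        # in (overlap + dt)-minute windows, backing up by the overlap each step.
        starttime = datetime.fromisoformat(start_time) - timedelta(minutes=waveform_overlap)
        endtime = datetime.fromisoformat(end_time)
        times_list = []
        start = starttime
        end = start + timedelta(minutes=waveform_overlap + timechunk_dt)
        while start <= endtime:
            if end >= endtime:
                times_list.append([start, endtime])
                break
            times_list.append([start, end])
            start = end - timedelta(minutes=waveform_overlap)
            end = start + timedelta(minutes=waveform_overlap + timechunk_dt)
        return times_list

    # Example: a 2-hour window, 60-minute chunks, 2-minute overlap produces
    # [[11:58, 13:00], [12:58, 14:00]], matching the EX. comment above.
    print(chunk_time_sketch("2024-12-15 12:00:00", "2024-12-15 14:00:00", 60, 2))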
142
+ def _drain_worker_logs(self):
143
+ while True:
144
+ rec = self.log_queue.get() # blocks until a record arrives
145
+ if rec is None: break # sentinel to stop thread
146
+ try:
147
+ self.logger.handle(rec) # routes to file+console handlers
148
+ except Exception:
149
+ # never crash on logging
150
+ self.logger.exception("Failed to handle worker log record")
151
+
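_drain_worker_logs() is the consumer half of a standard queue-based logging pattern. The producer half lives inside the Ray workers (mseed_predictor) and is not shown in this file; a minimal sketch of that worker-side wiring, assuming workers attach a logging.handlers.QueueHandler to the shared queue (the helper name is hypothetical):

    import logging
    from logging.handlers import QueueHandler

    def attach_queue_logging(log_queue, name="eqcctpro.worker"):
        # Hypothetical worker-side helper: every record emitted by this logger is
        # put onto log_queue as a LogRecord; the driver's _drain_worker_logs()
        # thread pops it and hands it to the main "eqcctpro" logger via logger.handle().
        worker_logger = logging.getLogger(name)
        worker_logger.setLevel(logging.INFO)
        worker_logger.addHandler(QueueHandler(log_queue))
        return worker_logger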
152
+ def configure_cpu(self):
153
+ # We need to configure the tf_environ for the requested CPU configuration
154
+ self.logger.info(f"Running EQCCT over Requested MSeed Files using CPU(s)...")
155
+ if self.best_usecase_config:
156
+ # We use the best usecase configuration that was found using EvaluateSystem
157
+ result = find_optimal_configuration_cpu(best_overall_usecase=True, eval_sys_results_dir=self.csv_dir)
158
+ if result is None:
159
+ self.logger.info("")
160
+ self.logger.info(f"Error: Could not retrieve an optimal CPU configuration. Please check that the CSV file exists and try again. Exiting...")
161
+ exit() # Exit gracefully
162
+ cpus_to_use, num_concurrent_predictions, intra, inter, station_count = result
163
+ self.logger.info("")
164
+ self.logger.info(f"Using {cpus_to_use} CPUs, {num_concurrent_predictions} Conc. Predictions, {intra} Intra Threads, and {inter} Inter Threads...")
165
+ tf_environ(gpu_id=-1, intra_threads=intra, inter_threads=inter, logger=self.logger)
166
+ else:
167
+ # We pass the requested parameters to the tf_environ
168
+ tf_environ(gpu_id=-1, intra_threads=self.intra_threads, inter_threads=self.inter_threads, logger=self.logger)
169
+
170
+ def configure_gpu(self):
171
+ # We need to configure the tf_environ for the requested GPU configuration
172
+ self.logger.info(f"Running EQCCT over Requested MSeed Files using GPU(s)...")
173
+ if self.best_usecase_config:
174
+ result = find_optimal_configuration_gpu(True, self.csv_dir)
175
+ if result is None:
176
+ self.logger.info("")
177
+ self.logger.error(f"Error: Could not retrieve an optimal GPU configuration. Please check that the CSV file exists and try again. Exiting...")
178
+ exit() # Exit gracefully
179
+
180
+ self.logger.info("")
181
+ cpus_to_use, num_concurrent_predictions, intra, inter, gpus, vram_mb, station_count = result # Unpack values only if result is valid
182
+ self.logger.info(f"Using {cpus_to_use} CPUs, {num_concurrent_predictions} Conc. Predictions, {intra} Intra Threads, {inter} Inter Threads, {gpus} GPU IDs, and {vram_mb} MB VRAM per Task...")
183
+ tf_environ(gpu_id=1, vram_limit_mb=vram_mb, gpus_to_use=gpus, intra_threads=intra, inter_threads=inter, logger=self.logger)
184
+
185
+ else:
186
+ self.logger.info("")
187
+ self.logger.info(f"User requested to use GPU(s): {self.selected_gpus} with {self.vram_mb} MB of VRAM per Raylet (intra-op threads = {self.intra_threads}, inter-op threads = {self.inter_threads})") # Use the selected GPUs
188
+ tf_environ(gpu_id=1, vram_limit_mb=self.vram_mb, gpus_to_use=self.selected_gpus, intra_threads=self.intra_threads, inter_threads=self.inter_threads, logger=self.logger)
189
+
190
+ def eqcctpro_parallelization(self):
191
+
192
+ if self.specific_stations is None: # We check if the station dirs are consistent, if not, exit
193
+ statement, specific_stations_list, do_i_exit = check_station_dirs(input_dir=self.input_dir)
194
+ self.logger.info(f"{statement}")
195
+ if do_i_exit: exit()
196
+
197
+ # We want to use a specified number of stations
198
+ else: specific_stations_list = [station.strip() for station in self.specific_stations.split(',')]
199
+ statement = f"Using {len(specific_stations_list)} selected station(s)."
200
+ self.logger.info(f"{statement}")
201
+ self.logger.info("")
202
+
203
+ # Submit timechunk tasks to mseed_predictor
204
+ tasks_queue = []
205
+ log_queue = queue.Queue() # Create a queue for log entries
206
+
207
+ # Compute total analysis timeframe
208
+ total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")
209
+
210
+ max_pending_tasks = self.number_of_concurrent_timechunk_predictions
211
+ self.logger.info(f"------- Starting EQCCTPro... -------")
212
+ self.logger.info(f"Detailed subprocess information can be found in the log file.")
213
+ self.logger.info("")
214
+ for i in range(len(self.tasks_picker)):
215
+ mseed_timechunk_dir_name = self.tasks_picker[i][1]
216
+ timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
217
+
218
+ # Concurrent Timechunks
219
+ while True:
220
+ if len(tasks_queue) < max_pending_tasks:
221
+ tasks_queue.append(mseed_predictor.options(num_gpus=0, num_cpus=1).remote(input_dir=timechunk_dir_path, output_dir=self.output_dir, log_queue=self.log_queue,
222
+ P_threshold=self.P_threshold, S_threshold=self.S_threshold, p_model=self.p_model_filepath, s_model=self.s_model_filepath,
223
+ number_of_concurrent_station_predictions=self.number_of_concurrent_station_predictions, ray_cpus=self.cpu_id_list, use_gpu=self.use_gpu,
224
+ gpu_id=self.selected_gpus, gpu_memory_limit_mb=self.vram_mb, specific_stations=specific_stations_list,
225
+ timechunk_id=mseed_timechunk_dir_name, waveform_overlap=self.waveform_overlap, total_timechunks=len(self.tasks_picker),
226
+ number_of_concurrent_timechunk_predictions=self.number_of_concurrent_timechunk_predictions, total_analysis_time=total_analysis_time,
227
+ intra_threads=self.intra_threads, inter_threads=self.inter_threads))
228
+ break
229
+
230
+ else: # If the queue is full, wait for a pending task to finish before submitting more
231
+ tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
232
+ for finished_task in tasks_finished:
233
+ log_entry = ray.get(finished_task)
234
+ log_queue.put(log_entry) # Add log entry to the queue
235
+
236
+ # After adding all the tasks to queue, process what's left
237
+ while tasks_queue:
238
+ tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
239
+ for finished_task in tasks_finished:
240
+ log_entry = ray.get(finished_task)
241
+ self.logger.info(log_entry)
242
+
243
+ # stop log forwarder
244
+ self.log_queue.put(None) # remember, log_queue is a Ray Queue actor, and will only exist while Ray is still active (cannot be after the .shutdown())
245
+ self._log_thread.join(timeout=2)
246
+
247
+ ray.shutdown()
248
+ self.logger.info(f"Ray Successfully Shutdown.")
249
+ self.logger.info("------- Successfully Picked All Waveform(s) from all Timechunk(s) -------")
250
+ # self.logger.info("------- END OF FILE -------")
251
+
252
+ def run_eqcctpro(self):
253
+ # Set CPU affinity
254
+ process = psutil.Process(os.getpid())
255
+ process.cpu_affinity(self.cpu_id_list) # Limit process to the given CPU IDs
256
+
257
+ self.chunk_time() # Generates the UTC times for each of the timesets in the given time range
258
+ self.dt_task_generator() # Generates the task list so we know how many total tasks there are for our given time range
259
+
260
+ if self.use_gpu: # GPU
261
+ self.configure_gpu()
262
+ ray.init(ignore_reinit_error=True, num_gpus=len(self.selected_gpus), num_cpus=len(self.cpu_id_list), logging_level=logging.ERROR, log_to_driver=False) # Ray initialization using GPUs
263
+ self.log_queue = Queue() # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
264
+ self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True) # Creates background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
265
+ self._log_thread.start() # Starts the thread
266
+ # Log some important info to the user
267
+ statement = f"Ray Successfully Initialized with {self.selected_gpus} GPU(s) and {len(self.cpu_id_list)} CPU(s)."
268
+ self.logger.info(f"{statement}")
269
+ self.logger.info(f"Analyzing {len(self.times_list)} time chunk(s) from {self.start_time} to {self.end_time} (dt={self.timechunk_dt}min, overlap={self.waveform_overlap}min).")
270
+
271
+ # Running parallelization
272
+ self.eqcctpro_parallelization()
273
+
274
+ else: # CPU
275
+ self.configure_cpu()
276
+ ray.init(ignore_reinit_error=True, num_cpus=len(self.cpu_id_list), logging_level=logging.ERROR, log_to_driver=False) # Ray initialization using CPUs
277
+ self.log_queue = Queue() # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
278
+ self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True) # Creates background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
279
+ self._log_thread.start() # Starts the thread
280
+ # Log some important info to the user
281
+ statement = f"Ray Successfully Initialized with {len(self.cpu_id_list)} CPU(s)."
282
+ self.logger.info(f"{statement}")
283
+ self.logger.info(f"Analyzing {len(self.times_list)} time chunk(s) from {self.start_time} to {self.end_time} (dt={self.timechunk_dt}min, overlap={self.waveform_overlap}min).")
284
+
285
+ # Running parallelization
286
+ self.eqcctpro_parallelization()
287
+
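A minimal usage sketch for the class above; all paths, times, and resource numbers are placeholder values chosen for illustration, not defaults shipped with the package:

    runner = RunEQCCTPro(
        use_gpu=False,
        input_dir="/data/mseed",                      # placeholder paths
        output_dir="/data/eqcct_out",
        log_filepath="/data/eqcct_out/run.log",
        p_model_filepath="/models/p_model.h5",
        s_model_filepath="/models/s_model.h5",
        number_of_concurrent_station_predictions=4,
        number_of_concurrent_timechunk_predictions=2,
        cpu_id_list=[0, 1, 2, 3],
        start_time="2024-12-15 12:00:00",             # "%Y-%m-%d %H:%M:%S" format
        end_time="2024-12-15 14:00:00",
        timechunk_dt=60,                              # minutes per timechunk
        waveform_overlap=2)                           # minutes of overlap
    runner.run_eqcctpro()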
288
+ class EvaluateSystem():
289
+ """Evaluate System class for running the evaluation system functions for multiple instances of the class"""
290
+ def __init__(self,
291
+ eval_mode: str,
292
+ input_dir: str,
293
+ output_dir: str,
294
+ log_filepath: str,
295
+ csv_dir: str,
296
+ p_model_filepath: str,
297
+ s_model_filepath: str,
298
+ P_threshold: float = 0.001,
299
+ S_threshold: float = 0.02,
300
+ intra_threads: int = 1,
301
+ inter_threads: int = 1,
302
+ stations2use:int = None,
303
+ cpu_id_list:list = [1],
304
+ cpu_test_step_size:int = 1,
305
+ starting_amount_of_stations: int = 1,
306
+ station_list_step_size: int = 1,
307
+ min_cpu_amount: int = 1,
308
+ min_conc_stations: int = 1,
309
+ conc_station_tasks_step_size: int = 1,
310
+ vram_mb:float = None,
311
+ selected_gpus:list = None,
312
+ start_time:str = None,
313
+ end_time:str = None,
314
+ conc_timechunk_tasks_step_size: int = 1,
315
+ timechunk_dt:int = None,
316
+ waveform_overlap:int = None,
317
+ tmp_dir:str = None):
318
+
319
+ valid_modes = {"cpu", "gpu"}
320
+ if eval_mode.lower() not in valid_modes:
321
+ raise ValueError(f"Invalid mode '{eval_mode}'. Choose either 'cpu' or 'gpu'.")
322
+
323
+ self.eval_mode = eval_mode.lower()
324
+ self.intra_threads = intra_threads
325
+ self.inter_threads = inter_threads
326
+ self.input_dir = input_dir
327
+ self.output_dir = output_dir
328
+ self.log_filepath = log_filepath
329
+ self.csv_dir = csv_dir
330
+ self.P_threshold = P_threshold
331
+ self.S_threshold = S_threshold
332
+ self.p_model_filepath = p_model_filepath
333
+ self.s_model_filepath = s_model_filepath
334
+ self.stations2use = stations2use
335
+ self.cpu_id_list = cpu_id_list
336
+ self.vram_mb = vram_mb
337
+ self.selected_gpus = selected_gpus
338
+ self.cpu_count = len(cpu_id_list)
339
+ self.cpu_test_step_size = cpu_test_step_size
340
+ self.starting_amount_of_stations = starting_amount_of_stations
341
+ self.station_list_step_size = station_list_step_size
342
+ self.min_cpu_amount = min_cpu_amount
343
+ self.min_conc_stations = min_conc_stations # default is = 1
344
+ self.conc_station_tasks_step_size = conc_station_tasks_step_size # default is = 1
345
+ self.stations2use_list = list(range(1, 11)) + list(range(15, 50, 5)) if stations2use is None else generate_station_list(self.starting_amount_of_stations, stations2use, self.station_list_step_size,)
346
+ self.start_time = start_time
347
+ self.end_time = end_time
348
+ self.conc_timechunk_tasks_step_size = conc_timechunk_tasks_step_size
349
+ self.timechunk_dt = timechunk_dt
350
+ self.waveform_overlap = waveform_overlap
351
+ self.home_tmp_dir = tmp_dir
352
+
353
+ # Ensures that the output_dir exists. If it doesn't, we create it
354
+ os.makedirs(self.output_dir, exist_ok=True)
355
+
356
+ # Set up main logger and logger queue to retrieve queued logs from Raylets to be passed to the main logger
357
+ self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
358
+ self.logger.setLevel(logging.INFO)
359
+ self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
360
+ if not self.logger.handlers: # avoid duplicating inits
361
+ fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
362
+ file_h = logging.FileHandler(self.log_filepath) # Writes logs to file
363
+ stream_h = logging.StreamHandler() # Sends logs to console
364
+ file_h.setFormatter(fmt)
365
+ stream_h.setFormatter(fmt)
366
+ self.logger.addHandler(file_h)
367
+ self.logger.addHandler(stream_h)
368
+
369
+ self.logger.info("")
370
+ self.logger.info(f"------- Welcome to EQCCTPro's EvaluateSystem Functionality -------")
371
+ self.logger.info("")
372
+ # Set up temp dir
373
+ import tempfile
374
+ tempfile.tempdir = self.home_tmp_dir
375
+
376
+ os.environ['TMPDIR'] = self.home_tmp_dir
377
+ os.environ['TEMP'] = self.home_tmp_dir
378
+ os.environ['TMP'] = self.home_tmp_dir
379
+ self.logger.info(f"Successfully set up temp files to be stored at {self.home_tmp_dir}")
380
+
381
+ # We need to ensure that the specified VRAM does not exceed the system's capacity; if it does, we exit safely before launching
382
+ self.chunk_time()
383
+ intended_workers = int(len(self.stations2use_list)) * int(len(self.times_list) // 2)
384
+ if self.eval_mode == 'gpu':
385
+ if not self.selected_gpus:
386
+ raise ValueError("selected_gpus must be set in GPU mode.")
387
+ self.chunk_time()
388
+ intended_workers = int(len(self.stations2use_list)) * int(len(self.times_list) // 2)
389
+
390
+ per_gpu_free_mb = [get_gpu_vram(gpu_index=g)[1] * 1024.0 for g in self.selected_gpus] # free_gb -> MB
391
+ plan = evaluate_vram_capacity(
392
+ intended_workers=intended_workers,
393
+ vram_per_worker_mb=float(self.vram_mb),
394
+ per_gpu_free_mb=per_gpu_free_mb,
395
+ model_vram_mb=3000.0,
396
+ safety_cap=0.90,
397
+ eqcct_overhead_gb=1.1,
398
+ )
399
+ if not plan.ok_aggregate:
400
+ unit = plan.per_worker_mb + plan.overhead_mb
401
+ raise RuntimeError(
402
+ f"Insufficient aggregate VRAM. Cap={plan.aggregate_cap_mb:.0f} MB, "
403
+ f"Need={plan.aggregate_need_mb:.0f} MB (= {plan.model_vram_mb:.0f}×{len(self.selected_gpus)} + "
404
+ f"{plan.intended_workers}×{unit:.0f})."
405
+ )
406
+ self.logger.info(
407
+ f"VRAM budget OK. Need {plan.aggregate_need_mb:.0f} MB ≤ Cap {plan.aggregate_cap_mb:.0f} MB "
408
+ f"across {len(self.selected_gpus)} GPU(s)."
409
+ )
410
+
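To illustrate the budget that evaluate_vram_capacity() enforces (the same quantities that appear in the RuntimeError message above), here is a back-of-the-envelope check with made-up numbers; the exact capping rule lives in evaluate_vram_capacity(), so treat this as an approximation rather than the library's implementation:

    # Assumed illustrative inputs: 2 GPUs with 16000 MB free each, 12 intended
    # workers, 500 MB per worker, 1.1 GB (1126.4 MB) EQCCT overhead per worker.
    per_gpu_free_mb = [16000.0, 16000.0]
    intended_workers = 12
    vram_per_worker_mb = 500.0
    model_vram_mb = 3000.0
    safety_cap = 0.90
    eqcct_overhead_mb = 1.1 * 1024.0

    aggregate_cap_mb = safety_cap * sum(per_gpu_free_mb)          # 28800 MB usable (assumed cap rule)
    unit_mb = vram_per_worker_mb + eqcct_overhead_mb              # per-worker footprint
    aggregate_need_mb = model_vram_mb * len(per_gpu_free_mb) + intended_workers * unit_mb
    print(aggregate_need_mb <= aggregate_cap_mb)  # True: 6000 + 12*1626.4 = 25516.8 <= 28800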
411
+ def _generate_stations_list(self):
412
+ """Generates station list"""
413
+ if self.stations2use is None:
414
+ return list(range(1, 11)) + list(range(15, 50, 5))
415
+ return generate_station_list(self.starting_amount_of_stations, self.stations2use, self.station_list_step_size)
416
+
417
+ # def _prepare_environment(self):
418
+ # """Removed 'output_dir' so that there is no conflicts in the save for a clean output return"""
419
+ # remove_directory(self.output_dir)
420
+
421
+ def chunk_time(self):
422
+ starttime = UTCDateTime(self.start_time) - (self.waveform_overlap * 60)
423
+ endtime = UTCDateTime(self.end_time)
424
+
425
+ times_list = []
426
+ start = starttime
427
+ end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
428
+ while start <= endtime:
429
+ if end >= endtime:
430
+ end = endtime
431
+ times_list.append([start, end])
432
+ break
433
+ times_list.append([start, end])
434
+ start = end - (self.waveform_overlap * 60)
435
+ end = start + (self.waveform_overlap * 60) + (self.timechunk_dt * 60)
436
+
437
+ self.times_list = times_list
438
+
439
+ def _drain_worker_logs(self):
440
+ while True:
441
+ rec = self.log_queue.get() # blocks until a record arrives
442
+ if rec is None: break # sentinel to stop thread
443
+ try:
444
+ self.logger.handle(rec) # routes to file+console handlers
445
+ except Exception:
446
+ # never crash on logging
447
+ self.logger.exception("Failed to handle worker log record")
448
+
449
+ def dt_task_generator(self):
450
+ tasks = [[f"({i+1}/{len(self.times_list)})", f"{self.times_list[i][0].strftime(format='%Y%m%dT%H%M%SZ')}_{self.times_list[i][1].strftime(format='%Y%m%dT%H%M%SZ')}"] for i in range((len(self.times_list)))]
451
+ self.tasks_picker = tasks
452
+
453
+ def evaluate_cpu(self):
454
+ """Evaluate system parallelization using CPUs"""
455
+ statement = "Evaluating System Parallelization Capability using CPU"
456
+ self.logger.info(f"{statement}")
457
+
458
+ os.makedirs(self.csv_dir, exist_ok=True)
459
+ os.makedirs(self.output_dir, exist_ok=True)
460
+
461
+ # Create test results csv
462
+ csv_filepath = f"{self.csv_dir}/cpu_test_results.csv"
463
+ prepare_csv(csv_file_path=csv_filepath, logger=self.logger)
464
+
465
+ self.chunk_time()
466
+ self.dt_task_generator()
467
+
468
+ trial_num = 1
469
+ log_queue = queue.Queue() # Create a queue for log entries
470
+ total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")
471
+
472
+ if self.eval_mode == 'gpu':
473
+ use_gpu = True
474
+ else:
475
+ use_gpu = False
476
+
477
+ if self.min_cpu_amount > len(self.cpu_id_list):
478
+ # We cannot proceed because the minimum CPU amount is greater than len(cpu_id_list),
479
+ # and the rest of the code depends on that length to generate cpu_count
480
+ print(f"CPU ID List provided has less CPUs than the minimum requested ({len(self.cpu_id_list)} vs. {self.min_cpu_amount}). Exiting...")
481
+ sys.exit(1)
482
+
483
+ with open(self.log_filepath, mode="a+", buffering=1) as log:
484
+ for i in range(self.min_cpu_amount, self.cpu_count+1, self.cpu_test_step_size):
485
+ # Set CPU affinity and initialize Ray
486
+ cpus_to_use = self.cpu_id_list[:i]
487
+ process = psutil.Process(os.getpid())
488
+ process.cpu_affinity(cpus_to_use) # Limit process to the given CPU IDs
489
+
490
+ ray.init(ignore_reinit_error=True, num_cpus=len(cpus_to_use), logging_level=logging.FATAL, log_to_driver=False)
491
+ self.log_queue = Queue() # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
492
+ self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True) # Creates background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
493
+ self._log_thread.start() # Starts the thread
494
+ self.logger.info(f"Ray Successfully Initialized with {len(cpus_to_use)} CPU(s).")
495
+
496
+ timechunks_list = []
497
+ timechunk = 1
498
+ step = self.conc_timechunk_tasks_step_size # Use the class attribute
499
+ while timechunk <= len(self.tasks_picker):
500
+ timechunks_list.append(timechunk)
501
+ if timechunk == 1:
502
+ timechunk += 1
503
+ else:
504
+ timechunk += step
505
+
506
+ if len(self.tasks_picker) not in timechunks_list:
507
+ timechunks_list.append(len(self.tasks_picker))
508
+ # A set stores unique items, so converting to a set removes duplicate timechunk counts
509
+ # (sets are unordered and cannot contain duplicates); sorting turns it back into an ordered list
510
+ timechunks_list = sorted(list(set(timechunks_list)))
511
+ for timechunks in timechunks_list:
512
+ tested_concurrency = set() # Reset for each cpu / timechunk
513
+ for num_stations in self.stations2use_list:
514
+ concurrent_predictions_list = generate_station_list(self.min_conc_stations, num_stations, self.conc_station_tasks_step_size)
515
+ # We do this so that we don't repeat concurrent prediction tests
516
+ # Because the number of concurrent predictions is effectively capped by the total number of stations to be processed,
517
+ # there is no need to repeat tests that exercise the same concurrency level for a different total station count
518
+ new_concurrent_values = [x for x in concurrent_predictions_list if x not in tested_concurrency and x <= num_stations]
519
+ if not new_concurrent_values:
520
+ continue # All concurrency values already tested
521
+ for num_concurrent_predictions in new_concurrent_values:
522
+ mseed_timechunk_dir_name = self.tasks_picker[timechunks-1][1]
523
+ timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
524
+ max_pending_tasks = timechunks
525
+
526
+ self.logger.info("")
527
+ self.logger.info(f"------- Trial Number: {trial_num} -------")
528
+ self.logger.info(f"CPU(s): {i}")
529
+ self.logger.info(f"Conc. Timechunks Being Analyzed: {timechunks} / Total Timechunks to be Analyzed: {len(self.tasks_picker)}")
530
+ self.logger.info(f"Total Amount of Stations to be Processed in Current Trial: {num_stations} / Number of Stations Being Processed Concurrently: {num_concurrent_predictions} / Total Overall Trial Station Count: {max(self.stations2use_list)}")
531
+
532
+ # Concurrent Timechunks
533
+ tasks_queue = []
534
+ log_queue = queue.Queue() # Create a queue for log entries
535
+
536
+
537
+ # ===== RAM Baseline (before launching worker) =====
538
+ _rss = process.memory_info().rss
539
+ for _ch in process.children(recursive=True):
540
+ try:
541
+ _rss += _ch.memory_info().rss
542
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
543
+ pass
544
+ mem_before_total_mb = _rss / 1e6
545
+
546
+ # peak before (platform-aware)
547
+ if resource is not None: # Linux/macOS
548
+ _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
549
+ if sys.platform.startswith("linux"):
550
+ peak_before_mb = _ru / 1024.0 # ru_maxrss in KB on Linux
551
+ elif sys.platform == "darwin":
552
+ peak_before_mb = _ru / (1024.0 * 1024.0) # ru_maxrss in bytes on macOS
553
+ else:
554
+ peak_before_mb = mem_before_total_mb # safe fallback
555
+ else: # Windows: no 'resource'
556
+ try:
557
+ peak_before_mb = process.memory_full_info().peak_wset / 1e6
558
+ except Exception:
559
+ peak_before_mb = mem_before_total_mb
560
+
561
+ try:
562
+ while True:
563
+ if len(tasks_queue) < max_pending_tasks:
564
+ tasks_queue.append(mseed_predictor.options(num_gpus=0, num_cpus=1).remote(input_dir=timechunk_dir_path, output_dir=self.output_dir, log_queue=self.log_queue,
565
+ P_threshold=self.P_threshold, S_threshold=self.S_threshold, p_model=self.p_model_filepath, s_model=self.s_model_filepath,
566
+ number_of_concurrent_station_predictions=num_concurrent_predictions, ray_cpus=cpus_to_use, use_gpu=use_gpu,
567
+ gpu_id=self.selected_gpus, gpu_memory_limit_mb=self.vram_mb, stations2use=num_stations,
568
+ timechunk_id=mseed_timechunk_dir_name, waveform_overlap=self.waveform_overlap, total_timechunks=len(self.tasks_picker),
569
+ number_of_concurrent_timechunk_predictions=max_pending_tasks, total_analysis_time=total_analysis_time, testing_gpu=False,
570
+ test_csv_filepath=csv_filepath, intra_threads=self.intra_threads, inter_threads=self.inter_threads, timechunk_dt=self.timechunk_dt))
571
+
572
+ break
573
+
574
+ else:
575
+ tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
576
+ for finished_task in tasks_finished:
577
+ log_entry = ray.get(finished_task)
578
+ log_queue.put(log_entry) # Add log entry to the queue
579
+
580
+ # After adding all the tasks to queue, process what's left
581
+ while tasks_queue:
582
+ tasks_finished, tasks_queue = ray.wait(tasks_queue, num_returns=1, timeout=None)
583
+ for finished_task in tasks_finished:
584
+ log_entry = ray.get(finished_task)
585
+ log_queue.put(log_entry) # Add log entry to the queue
586
+
587
+ update_csv(csv_filepath, success=1, error_message="")
588
+ except Exception as e:
589
+ # Failure occurred, need to add to log
590
+ error_msg = f"{type(e).__name__}: {str(e)}"
591
+ update_csv(csv_filepath, success=0, error_message=error_msg)
592
+ self.logger.error(f"Trial {trial_num} FAILED: {error_msg}")
593
+
594
+ # Write log entries from the queue to the file
595
+ while not log_queue.empty():
596
+ log_entry = log_queue.get()
597
+
598
+ remove_output_subdirs(self.output_dir, logger=self.logger)
599
+ trial_num += 1
600
+
601
+ # RAM cleanup
602
+ # ===== AFTER RUN (before cleanup) =====
603
+ _rss = process.memory_info().rss
604
+ for _ch in process.children(recursive=True):
605
+ try:
606
+ _rss += _ch.memory_info().rss
607
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
608
+ pass
609
+ mem_after_run_total_mb = _rss / 1e6
610
+ delta_run_mb = mem_after_run_total_mb - mem_before_total_mb
611
+
612
+ # updated peak (platform-aware)
613
+ if resource is not None:
614
+ _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
615
+ if sys.platform.startswith("linux"):
616
+ peak_after_mb = _ru / 1024.0
617
+ elif sys.platform == "darwin":
618
+ peak_after_mb = _ru / (1024.0 * 1024.0)
619
+ else:
620
+ peak_after_mb = mem_after_run_total_mb
621
+ else:
622
+ try:
623
+ peak_after_mb = process.memory_full_info().peak_wset / 1e6
624
+ except Exception:
625
+ peak_after_mb = mem_after_run_total_mb
626
+
627
+ self.logger.info("")
628
+ self.logger.info(
629
+ f"[MEM] Baseline: {mem_before_total_mb:.2f} MB | After run: {mem_after_run_total_mb:.2f} MB "
630
+ f"| Δrun: {delta_run_mb:.2f} MB | Peak≈{max(peak_before_mb, peak_after_mb):.2f} MB"
631
+ )
632
+
633
+ # ===== CLEANUP =====
634
+ # drop strong refs so GC matters
635
+ try: del ref
636
+ except NameError: pass
637
+ try: del log_entry
638
+ except NameError: pass
639
+
640
+ _rss = process.memory_info().rss
641
+ for _ch in process.children(recursive=True):
642
+ try:
643
+ _rss += _ch.memory_info().rss
644
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
645
+ pass
646
+ mem_before_clean_mb = _rss / 1e6
647
+
648
+ gc.collect()
649
+ time.sleep(0.1)
650
+
651
+ _rss = process.memory_info().rss
652
+ for _ch in process.children(recursive=True):
653
+ try:
654
+ _rss += _ch.memory_info().rss
655
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
656
+ pass
657
+ mem_after_clean_mb = _rss / 1e6
658
+
659
+ freed_mb = mem_before_clean_mb - mem_after_clean_mb
660
+ self.logger.info(f"[MEM] Freed ~{max(freed_mb, 0):.2f} MB; Post-clean total: {mem_after_clean_mb:.2f} MB") # To-Do: Fix the Freed so its beeter (for cpu and gpu)
661
+ self.logger.info("")
662
+
663
+ # tested_concurrency.update([x for x in concurrent_predictions_list if x <= num_stations])
664
+
665
+ # stop log forwarder
666
+ self.log_queue.put(None) # remember, log_queue is a Ray Queue actor, and will only exist while Ray is still active (cannot be after the .shutdown())
667
+ self._log_thread.join(timeout=2)
668
+
669
+ ray.shutdown() # Shutdown Ray after processing all timechunks for this CPU count
670
+ self.logger.info(f"Ray Successfully Shutdown.")
671
+
672
+
673
+ self.logger.info(f"Testing complete.")
674
+ self.logger.info(f"")
675
+ self.logger.info(f"Finding Optimal Configurations...")
676
+ # Compute optimal configurations (CPU)
677
+ df = pd.read_csv(csv_filepath)
678
+ optimal_configuration_df, best_overall_usecase_df = find_optimal_configurations_cpu(df)
679
+ optimal_configuration_df.to_csv(f"{self.csv_dir}/optimal_configurations_cpu.csv", index=False)
680
+ best_overall_usecase_df.to_csv(f"{self.csv_dir}/best_overall_usecase_cpu.csv", index=False)
681
+ self.logger.info(f"Optimal Configurations Found. Findings saved to:")
682
+ self.logger.info(f" 1) Optimal CPU/Station/Concurrent Prediction Configurations: {self.csv_dir}/optimal_configurations_cpu.csv")
683
+ self.logger.info(f" 2) Best Overall Usecase Configuration: {self.csv_dir}/best_overall_usecase_cpu.csv")
684
+
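The platform-aware peak-RSS bookkeeping above is repeated in both evaluate_cpu() and evaluate_gpu(); a sketch of how it could be factored into one hypothetical helper (not part of the package) looks like this:

    import sys
    import psutil
    try:
        import resource          # POSIX-only
    except ImportError:
        resource = None

    def peak_rss_mb(process: psutil.Process, fallback_mb: float) -> float:
        # ru_maxrss units differ by platform: kilobytes on Linux, bytes on macOS.
        if resource is not None:
            ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            if sys.platform.startswith("linux"):
                return ru / 1024.0
            if sys.platform == "darwin":
                return ru / (1024.0 * 1024.0)
            return fallback_mb
        try:
            # Windows: psutil exposes the peak working set instead.
            return process.memory_full_info().peak_wset / 1e6
        except Exception:
            return fallback_mb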
685
+ def evaluate_gpu(self):
686
+ """Evaluate system parallelization using GPUs"""
687
+ statement = "Evaluating System Parallelization Capability using GPUs"
688
+ self.logger.info(f"{statement}")
689
+
690
+ # Set CPU affinity
691
+ process = psutil.Process(os.getpid())
692
+ process.cpu_affinity(self.cpu_id_list) # Limit process to the given CPU IDs
693
+
694
+ os.makedirs(self.csv_dir, exist_ok=True)
695
+ os.makedirs(self.output_dir, exist_ok=True)
696
+
697
+ # Calculate these at the start
698
+ self.chunk_time()
699
+ self.dt_task_generator()
700
+ total_analysis_time = datetime.strptime(self.end_time, "%Y-%m-%d %H:%M:%S") - datetime.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")
701
+
702
+ # Create test results csv
703
+ csv_filepath = f"{self.csv_dir}/gpu_test_results.csv"
704
+ prepare_csv(csv_file_path=csv_filepath, logger=self.logger)
705
+
706
+ free_vram_mb = self.vram_mb if self.vram_mb else self.calculate_vram()
707
+ self.selected_gpus = self.selected_gpus if self.selected_gpus else list_gpu_ids()
708
+ self.logger.info(f"Using GPU(s): {self.selected_gpus}")
709
+
710
+ trial_num = 1
711
+ log_queue = queue.Queue() # Create a queue for log entries
712
+
713
+ # Initialize Ray with GPUs
714
+ ray.init(ignore_reinit_error=True, num_gpus=len(self.selected_gpus), num_cpus=len(self.cpu_id_list),
715
+ logging_level=logging.FATAL, log_to_driver=False)
716
+ self.log_queue = Queue() # Create a Ray-safe queue to receive LogRecord objects from workers so we can write them to file
717
+ self._log_thread = threading.Thread(target=self._drain_worker_logs, daemon=True) # Creates background thread whose only job is to get() records from self.log_queue and hand them over to the actual logger
718
+ self._log_thread.start() # Starts the thread
719
+ self.logger.info(f"Ray Successfully Initialized with {len(self.selected_gpus)} GPU(s) and {len(self.cpu_id_list)} CPU(s).")
720
+
721
+ for stations in self.stations2use_list:
722
+ concurrent_predictions_list = generate_station_list(self.min_conc_stations, stations, self.conc_station_tasks_step_size)
723
+ for predictions in concurrent_predictions_list:
724
+ vram_per_task_mb = free_vram_mb / predictions
725
+ step_size = vram_per_task_mb * 0.05
726
+ vram_steps = np.arange(step_size, vram_per_task_mb + step_size, step_size)
727
+ self.logger.info(f"Testing the following VRAM limitations (MB): {vram_steps}")
728
+
729
+ for gpu_memory_limit_mb in vram_steps:
730
+
731
+ self.logger.info("")
732
+ self.logger.info(f"------- Trial Number: {trial_num} -------")
733
+ self.logger.info(f"VRAM Limited to {gpu_memory_limit_mb:.2f} MB per Task")
734
+
735
+ # Get the first timechunk for testing
736
+ mseed_timechunk_dir_name = self.tasks_picker[0][1]
737
+ timechunk_dir_path = os.path.join(self.input_dir, mseed_timechunk_dir_name)
738
+
739
+ self.logger.info(f"Stations: {stations}")
740
+ self.logger.info(f"Concurrent Station Predictions: {predictions}")
741
+ self.logger.info(f"VRAM per Task: {gpu_memory_limit_mb:.2f} MB")
742
+ self.logger.info("")
743
+
744
+
745
+ # ===== Baseline RAM consumption (before launching worker) =====
746
+ _rss = process.memory_info().rss
747
+ for _ch in process.children(recursive=True):
748
+ try:
749
+ _rss += _ch.memory_info().rss
750
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
751
+ pass
752
+ mem_before_total_mb = _rss / 1e6
753
+
754
+ # peak before (platform-aware)
755
+ if resource is not None: # Linux/macOS
756
+ _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
757
+ if sys.platform.startswith("linux"):
758
+ peak_before_mb = _ru / 1024.0 # ru_maxrss in KB on Linux
759
+ elif sys.platform == "darwin":
760
+ peak_before_mb = _ru / (1024.0 * 1024.0) # ru_maxrss in bytes on macOS
761
+ else:
762
+ peak_before_mb = mem_before_total_mb # safe fallback
763
+ else: # Windows: no 'resource'
764
+ try:
765
+ peak_before_mb = process.memory_full_info().peak_wset / 1e6
766
+ except Exception:
767
+ peak_before_mb = mem_before_total_mb
768
+
769
+ try:
770
+ # Call mseed_predictor directly via Ray (just like evaluate_cpu does)
771
+ ref = mseed_predictor.options(num_gpus=0, num_cpus=1).remote(
772
+ input_dir=timechunk_dir_path,
773
+ output_dir=self.output_dir,
774
+ log_queue=self.log_queue,
775
+ P_threshold=self.P_threshold,
776
+ S_threshold=self.S_threshold,
777
+ p_model=self.p_model_filepath,
778
+ s_model=self.s_model_filepath,
779
+ number_of_concurrent_station_predictions=predictions,
780
+ ray_cpus=self.cpu_id_list,
781
+ use_gpu=True,
782
+ gpu_id=self.selected_gpus,
783
+ gpu_memory_limit_mb=gpu_memory_limit_mb,
784
+ stations2use=stations,
785
+ timechunk_id=mseed_timechunk_dir_name,
786
+ waveform_overlap=self.waveform_overlap,
787
+ total_timechunks=len(self.tasks_picker),
788
+ number_of_concurrent_timechunk_predictions=1, # Testing one timechunk at a time
789
+ total_analysis_time=total_analysis_time,
790
+ testing_gpu=True, # Enable test mode
791
+ test_csv_filepath=csv_filepath,
792
+ intra_threads=self.intra_threads,
793
+ inter_threads=self.inter_threads,
794
+ timechunk_dt=self.timechunk_dt
795
+ )
796
+
797
+ # Wait for result
798
+ log_entry = ray.get(ref)
799
+ log_queue.put(log_entry) # Add log entry to the queue
800
+
801
+ # Success - update CSV
802
+ update_csv(csv_filepath, success=1, error_message="")
803
+
804
+ except Exception as e:
805
+ # Failure occurred, need to add to log
806
+ error_msg = f"{type(e).__name__}: {str(e)}"
807
+ update_csv(csv_filepath, success=0, error_message=error_msg)
808
+ self.logger.info(f"Trial {trial_num} FAILED: {error_msg}")
809
+
810
+ # Write log entries from the queue to the file
811
+ while not log_queue.empty():
812
+ log_entry = log_queue.get()
813
+ self.logger.info(f"{log_entry}") # FIX ME
814
+
815
+ remove_output_subdirs(self.output_dir, logger=self.logger)
816
+ trial_num += 1
817
+
818
+ # RAM cleanup
819
+ # ===== AFTER RUN (before cleanup) =====
820
+ _rss = process.memory_info().rss
821
+ for _ch in process.children(recursive=True):
822
+ try:
823
+ _rss += _ch.memory_info().rss
824
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
825
+ pass
826
+ mem_after_run_total_mb = _rss / 1e6
827
+ delta_run_mb = mem_after_run_total_mb - mem_before_total_mb
828
+
829
+ # updated peak (platform-aware)
830
+ if resource is not None:
831
+ _ru = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
832
+ if sys.platform.startswith("linux"):
833
+ peak_after_mb = _ru / 1024.0
834
+ elif sys.platform == "darwin":
835
+ peak_after_mb = _ru / (1024.0 * 1024.0)
836
+ else:
837
+ peak_after_mb = mem_after_run_total_mb
838
+ else:
839
+ try:
840
+ peak_after_mb = process.memory_full_info().peak_wset / 1e6
841
+ except Exception:
842
+ peak_after_mb = mem_after_run_total_mb
843
+
844
+ self.logger.info(
845
+ f"[MEM] Baseline: {mem_before_total_mb:.2f} MB | After run: {mem_after_run_total_mb:.2f} MB "
846
+ f"| Δrun: {delta_run_mb:.2f} MB | Peak≈{max(peak_before_mb, peak_after_mb):.2f} MB"
847
+ )
848
+
849
+ # ===== CLEANUP =====
850
+ # drop strong refs so GC matters
851
+ try: del ref
852
+ except NameError: pass
853
+ try: del log_entry
854
+ except NameError: pass
855
+
856
+ _rss = process.memory_info().rss
857
+ for _ch in process.children(recursive=True):
858
+ try:
859
+ _rss += _ch.memory_info().rss
860
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
861
+ pass
862
+ mem_before_clean_mb = _rss / 1e6
863
+
864
+ gc.collect()
865
+ time.sleep(0.1)
866
+
867
+ _rss = process.memory_info().rss
868
+ for _ch in process.children(recursive=True):
869
+ try:
870
+ _rss += _ch.memory_info().rss
871
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
872
+ pass
873
+ mem_after_clean_mb = _rss / 1e6
874
+
875
+ freed_mb = mem_before_clean_mb - mem_after_clean_mb
876
+ self.logger.info(f"[MEM] Freed ~{max(freed_mb, 0):.2f} MB; Post-clean total: {mem_after_clean_mb:.2f} MB\n")
877
+ self.logger.info("")
878
+
879
+ # stop log forwarder
880
+ self.log_queue.put(None) # remember, log_queue is a Ray Queue actor, and will only exist while Ray is still active (cannot be after the .shutdown())
881
+ self._log_thread.join(timeout=2)
882
+
883
+ ray.shutdown() # Shutdown Ray after all testing
884
+ self.logger.info(f"Ray Successfully Shutdown.")
885
+
886
+ self.logger.info(f"Testing complete.")
887
+ self.logger.info(f"")
888
+ self.logger.info(f"Finding Optimal Configurations...")
889
+ # Compute optimal configurations (GPU)
890
+ df = pd.read_csv(csv_filepath)
891
+ optimal_configuration_df, best_overall_usecase_df = find_optimal_configurations_gpu(df)
892
+ optimal_configuration_df.to_csv(f"{self.csv_dir}/optimal_configurations_gpu.csv", index=False)
893
+ best_overall_usecase_df.to_csv(f"{self.csv_dir}/best_overall_usecase_gpu.csv", index=False)
894
+ self.logger.info(f"Optimal Configurations Found. Findings saved to:")
895
+ self.logger.info(f" 1) Optimal GPU/Station/Concurrent Prediction Configurations: {self.csv_dir}/optimal_configurations_gpu.csv")
896
+ self.logger.info(f" 2) Best Overall Usecase Configuration: {self.csv_dir}/best_overall_usecase_gpu.csv")
897
+
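For reference, the 5%-step VRAM sweep built with np.arange() in evaluate_gpu() above yields a grid of roughly twenty budgets from 5% up to 100% of the per-task share; with illustrative numbers:

    import numpy as np

    free_vram_mb = 8000.0                           # assumed free VRAM for the example
    predictions = 4
    vram_per_task_mb = free_vram_mb / predictions   # 2000.0 MB per task
    step = vram_per_task_mb * 0.05                  # 100.0 MB increments
    vram_steps = np.arange(step, vram_per_task_mb + step, step)
    print(vram_steps)                               # [100. 200. ... 2000.] -> 20 trial budgets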
898
+ def evaluate(self):
899
+ if self.eval_mode == "cpu":
900
+ self.evaluate_cpu()
901
+ elif self.eval_mode == "gpu":
902
+ self.evaluate_gpu()
903
+ else:
904
+ exit()
905
+
906
+ def calculate_vram(self):
907
+ """Calculate available VRAM for GPU testing."""
908
+ self.logger.info(f"Utilizing available VRAM...")
909
+ total_vram, available_vram = get_gpu_vram()
910
+ self.logger.info(f"Total VRAM: {total_vram:.2f} GB.")
911
+ self.logger.info(f"Available VRAM: {available_vram:.2f} GB.")
912
+
913
+ free_vram = total_vram * 0.9485 if available_vram / total_vram >= 0.9486 else available_vram
914
+ self.logger.info(f"Using up to {round(free_vram, 2)} GB of VRAM.")
915
+ return free_vram * 1024 # Convert to MB
916
+
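A minimal usage sketch for the evaluation workflow above; every path and value is a placeholder:

    evaluator = EvaluateSystem(
        eval_mode="cpu",
        input_dir="/data/mseed",
        output_dir="/data/eval_out",
        log_filepath="/data/eval_out/eval.log",
        csv_dir="/data/eval_out/csv",
        p_model_filepath="/models/p_model.h5",
        s_model_filepath="/models/s_model.h5",
        stations2use=20,
        cpu_id_list=[0, 1, 2, 3],
        start_time="2024-12-15 12:00:00",
        end_time="2024-12-15 14:00:00",
        timechunk_dt=60,
        waveform_overlap=2,
        tmp_dir="/data/tmp")
    evaluator.evaluate()   # dispatches to evaluate_cpu() for eval_mode="cpu"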
917
+ """
918
+ Finds the optimal CPU configuration based on evaluation results
919
+ """
920
+ class OptimalCPUConfigurationFinder:
921
+ def __init__(self,
922
+ eval_sys_results_dir: str,
923
+ log_file_path: str):
924
+
925
+ self.eval_sys_results_dir = eval_sys_results_dir
926
+ if not self.eval_sys_results_dir or not os.path.isdir(self.eval_sys_results_dir):
927
+ raise ValueError(f"Error: The provided directory path '{self.eval_sys_results_dir}' is invalid or does not exist.")
928
+ self.log_file_path = log_file_path
929
+
930
+ # Set up main logger and logger queue to retrieve queued logs from Raylets to be passed to the main logger
931
+ self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
932
+ self.logger.setLevel(logging.INFO)
933
+ self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
934
+ if not self.logger.handlers: # avoid duplicating inits
935
+ fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
936
+ # ensure parent dir
937
+ Path(self.log_file_path).parent.mkdir(parents=True, exist_ok=True)
938
+ file_h = logging.FileHandler(self.log_file_path) # Writes logs to file
939
+ stream_h = logging.StreamHandler() # Sends logs to console
940
+ file_h.setFormatter(fmt)
941
+ stream_h.setFormatter(fmt)
942
+ self.logger.addHandler(file_h)
943
+ self.logger.addHandler(stream_h)
944
+
945
+
946
+ def find_best_overall_usecase(self):
947
+ """Finds the best overall CPU usecase configuation from eval results"""
948
+ file_path = f"{self.eval_sys_results_dir}/best_overall_usecase_cpu.csv"
949
+ if not os.path.exists(file_path):
950
+ raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
951
+
952
+ df_best_overall = pd.read_csv(file_path)
953
+ # best_config_dict = df_best_overall.set_index(df_best_overall.columns[0]).to_dict()[df_best_overall.columns[1]]
954
+ best_config_dict = df_best_overall.to_dict(orient='records')[0]
955
+
956
+ # Extract required values
957
+ num_cpus = best_config_dict.get("Number of CPUs Allocated for Ray to Use")
958
+ waveform_timespace = best_config_dict.get("Total Waveform Analysis Timespace (min)")
959
+ total_num_timechunks = best_config_dict.get("Total Number of Timechunks")
960
+ num_concurrent_timechunks = best_config_dict.get("Concurrent Timechunks Used")
961
+ length_of_timechunks = best_config_dict.get("Length of Timechunk (min)")
962
+ num_concurrent_stations = best_config_dict.get("Number of Concurrent Station Tasks per Timechunk")
963
+ intra_threads = best_config_dict.get("Intra-parallelism Threads")
964
+ inter_threads = best_config_dict.get("Inter-parallelism Threads")
965
+ num_stations = best_config_dict.get("Number of Stations Used")
966
+ total_runtime = best_config_dict.get("Total Run time for Picker (s)")
967
+
968
+ self.logger.info("")
969
+ self.logger.info(f"------- Finding the Best Overall CPU Usecase Configuration Based on Available Trial Data in {self.eval_sys_results_dir} -------")
970
+ self.logger.info(f"CPU(s): {num_cpus}")
971
+ self.logger.info(f"Intra-parallelism Threads: {intra_threads}")
972
+ self.logger.info(f"Inter-parallelism Threads: {inter_threads}")
973
+ self.logger.info(f"Waveform Timespace: {waveform_timespace}")
974
+ self.logger.info(f"Total Number of Stations Used: {num_stations}")
975
+ self.logger.info(f"Total Number of Timechunks: {total_num_timechunks}")
976
+ self.logger.info(f"Length of Timechunks (min): {length_of_timechunks}")
977
+ self.logger.info(f"Concurrent Timechunk Processes: {num_concurrent_timechunks}")
978
+ self.logger.info(f"Concurrent Station Processes Per Timechunk: {num_concurrent_stations}")
979
+ self.logger.info(f"Total Runtime (s): {total_runtime}")
980
+ self.logger.info("")
981
+
982
+ # return int(float(num_cpus)), int(float(intra_threads)), int(float(inter_threads)), int(float(num_concurrent_timechunks)), int(float(num_concurrent_stations)), int(float(num_stations))
983
+
984
+ def find_optimal_for(self, cpu: int, station_count: int):
985
+ """Finds the optimal configuration for a given number of CPUs and stations."""
986
+ if cpu is None or station_count is None:
987
+ raise ValueError("Error: CPU and station_count must have valid values.")
988
+
989
+ file_path = f"{self.eval_sys_results_dir}/optimal_configurations_cpu.csv"
990
+ if not os.path.exists(file_path):
991
+ raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
992
+
993
+ df_optimal = pd.read_csv(file_path)
994
+
995
+ # Convert relevant columns to numeric
996
+ df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
997
+ df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
998
+ df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
999
+ df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
1000
+
1001
+ filtered_df = df_optimal[
1002
+ (df_optimal["Number of CPUs Allocated for Ray to Use"] == cpu) &
1003
+ (df_optimal["Number of Stations Used"] == station_count)]
1004
+
1005
+ if filtered_df.empty:
1006
+ raise ValueError("No matching configuration found. Please enter a valid entry.")
1007
+
1008
+ # nsmallest(1, "Total Run time for Picker (s)") returns the single row with the smallest runtime;
1009
+ # the '1' means we only want that one row
1010
+ # iloc[0] then selects that row by positional index and returns it as a Series
1011
+ best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]
1012
+
1013
+ self.logger.info(f"------- Best CPU-EQCCTPro Configuration for Requested Input Parameters Based on the available Trial Data in {self.eval_sys_results_dir} -------")
1014
+ self.logger.info(f"CPU(s): {cpu}")
1015
+ self.logger.info(f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}")
1016
+ self.logger.info(f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}")
1017
+ self.logger.info(f"Waveform Timespace: {best_config['Total Waveform Analysis Timespace (min)']}")
1018
+ self.logger.info(f"Total Number of Stations Used: {station_count}")
1019
+ self.logger.info(f"Total Number of Timechunks: {best_config['Total Number of Timechunks']}")
1020
+ self.logger.info(f"Length of Timechunks (min): {best_config['Length of Timechunk (min)']}")
1021
+ self.logger.info(f"Concurrent Timechunk Processes: {best_config['Concurrent Timechunks Used']}")
1022
+ self.logger.info(f"Concurrent Station Processes Per Timechunk: {best_config['Number of Concurrent Station Tasks']}")
1023
+ self.logger.info(f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")
1024
+ self.logger.info("")
1025
+
1026
+ # return int(float(cpu)), int(float(best_config["Intra-parallelism Threads"])), int(float(best_config["Inter-parallelism Threads"])), int(float(best_config["Concurrent Timechunks Used"])), int(float(best_config["Number of Concurrent Station Tasks"])), int(float(station_count))
1027
+
1028
+
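A short usage sketch for the CPU finder above (the results directory and log path are placeholders):

    finder = OptimalCPUConfigurationFinder(
        eval_sys_results_dir="/data/eval_out/csv",
        log_file_path="/data/eval_out/finder.log")
    finder.find_best_overall_usecase()                  # logs the best overall CPU trial
    finder.find_optimal_for(cpu=4, station_count=20)    # best row for 4 CPUs / 20 stations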
1029
+ class OptimalGPUConfigurationFinder:
1030
+ """Finds the optimal GPU configuration based on evaluation system results."""
1031
+
1032
+ def __init__(self,
1033
+ eval_sys_results_dir: str,
1034
+ log_file_path: str):
1035
+
1036
+ self.eval_sys_results_dir = eval_sys_results_dir
1037
+ if not self.eval_sys_results_dir or not os.path.isdir(self.eval_sys_results_dir):
1038
+ raise ValueError(f"Error: The provided directory path '{self.eval_sys_results_dir}' is invalid or does not exist.")
1039
+ self.log_file_path = log_file_path
1040
+
1041
+ # Set up main logger and logger queue to retrieve queued logs from Raylets to be passed to the main logger
1042
+ self.logger = logging.getLogger("eqcctpro") # We named the logger eqcctpro (can be any name)
1043
+ self.logger.setLevel(logging.INFO)
1044
+ self.logger.propagate = False # if true, events logged to this logger will be passed to the handlers of higher level (ancestor) loggers, in addition to any handlers attached to this logger
1045
+ if not self.logger.handlers: # avoid duplicating inits
1046
+ fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
1047
+ # ensure parent dir
1048
+ Path(self.log_file_path).parent.mkdir(parents=True, exist_ok=True)
1049
+ file_h = logging.FileHandler(self.log_file_path) # Writes logs to file
1050
+ stream_h = logging.StreamHandler() # Sends logs to console
1051
+ file_h.setFormatter(fmt)
1052
+ stream_h.setFormatter(fmt)
1053
+ self.logger.addHandler(file_h)
1054
+ self.logger.addHandler(stream_h)
1055
+
1056
+ def find_best_overall_usecase(self):
1057
+ """Finds the best overall GPU configuration from evaluation results."""
1058
+ file_path = f"{self.eval_sys_results_dir}/best_overall_usecase_gpu.csv"
1059
+ if not os.path.exists(file_path):
1060
+ raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
1061
+
1062
+ df = pd.read_csv(file_path)
1063
+ if df.empty:
1064
+ raise ValueError(f"[{datetime.now()}] Error: '{file_path}' is empty.")
1065
+
1066
+ row = df.iloc[0] # the best row you wrote out
1067
+
1068
+ # Some codepaths use two different column names for concurrency; support both
1069
+ conc_col = "Number of Concurrent Station Tasks per Timechunk" \
1070
+ if "Number of Concurrent Station Tasks per Timechunk" in df.columns \
1071
+ else "Number of Concurrent Station Tasks"
1072
+
1073
+ # Robust GPU parse: accepts [0], (0,), "0", 0, "", None
1074
+ num_gpus_list = _parse_gpus_field(row.get("GPUs Used"))
1075
+ # Keep as tuple for display/consistency
1076
+ num_gpus = tuple(num_gpus_list)
1077
+
1078
+ # Pull/normalize scalars
1079
+ num_cpus = row.get("Number of CPUs Allocated for Ray to Use")
1080
+ num_concurrent = row.get(conc_col)
1081
+ intra_threads = row.get("Intra-parallelism Threads")
1082
+ inter_threads = row.get("Inter-parallelism Threads")
1083
+ num_stations = row.get("Number of Stations Used")
1084
+ total_runtime = row.get("Total Run time for Picker (s)")
1085
+ vram_used = row.get("VRAM Used Per Task")
1086
+
1087
+ self.logger.info("")
1088
+ self.logger.info(f"------- Finding the Best Overall GPU Usecase Configuration Based on Available Trial Data in {self.eval_sys_results_dir} -------")
1089
+ self.logger.info("")
1090
+ self.logger.info(f"CPU(s): {num_cpus}")
1091
+ self.logger.info(f"GPU ID(s): {num_gpus_list}")
1092
+ self.logger.info(f"Concurrent Predictions: {num_concurrent}")
1093
+ self.logger.info(f"Intra-parallelism Threads: {intra_threads}")
1094
+ self.logger.info(f"Inter-parallelism Threads: {inter_threads}")
1095
+ self.logger.info(f"Stations: {num_stations}")
1096
+ self.logger.info(f"VRAM Used per Task: {vram_used}")
1097
+ self.logger.info(f"Total Runtime (s): {total_runtime}")
1098
+ self.logger.info("")
1099
+ # return int(float(num_cpus)), int(float(num_concurrent_predictions)), int(float(intra_threads)), int(float(inter_threads)), num_gpus, int(float(vram_used)), int(float(num_stations))
1100
+
1101
+ def find_optimal_for(self, num_cpus: int, gpu_list: list, station_count: int):
1102
+ """Finds the optimal configuration for a given number of CPUs, GPUs, and stations."""
1103
+ if num_cpus is None or station_count is None or gpu_list is None:
1104
+ raise ValueError("Error: num_cpus, station_count, and gpu_list must have valid values.")
1105
+
1106
+ file_path = f"{self.eval_sys_results_dir}/optimal_configurations_gpu.csv"
1107
+ if not os.path.exists(file_path):
1108
+ raise FileNotFoundError(f"[{datetime.now()}] Error: The file '{file_path}' does not exist. Ensure it is in the correct directory.")
1109
+
1110
+ df_optimal = pd.read_csv(file_path)
1111
+
1112
+ # Convert relevant columns to numeric, handling NaNs
1113
+ df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
1114
+ df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
1115
+ df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
1116
+ df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
1117
+ df_optimal["VRAM Used Per Task"] = pd.to_numeric(df_optimal["VRAM Used Per Task"], errors="coerce")
1118
+
1119
+ # Convert "GPUs Used" from string representation to list
1120
+ df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
1121
+
1122
+ # Convert GPU lists to tuples for comparison
1123
+ df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: tuple(x) if isinstance(x, list) else (x,))
1124
+
1125
+ # Ensure gpu_list is in tuple format for comparison
1126
+ gpu_list_tuple = tuple(gpu_list) if isinstance(gpu_list, list) else (gpu_list,)
1127
+
1128
+ filtered_df = df_optimal[
1129
+ (df_optimal["Number of CPUs Allocated for Ray to Use"] == num_cpus) &
1130
+ (df_optimal["GPUs Used"] == gpu_list_tuple) &
1131
+ (df_optimal["Number of Stations Used"] == station_count)
1132
+ ]
1133
+
1134
+ if filtered_df.empty:
1135
+ raise ValueError("No matching configuration found. Please enter a valid entry.")
1136
+
1137
+ best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]
1138
+
1139
+ self.logger.info(f"------- Best GPU-EQCCTPro Configuration for Requested Input Parameters Based on the Available Trial Data in {self.eval_sys_results_dir} -------")
1140
+ self.logger.info(f"CPU(s): {num_cpus}")
1141
+ self.logger.info(f"GPU(s): {gpu_list}")
1142
+ self.logger.info(f"Concurrent Predictions: {best_config['Number of Concurrent Station Tasks']}")
1143
+ self.logger.info(f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}")
1144
+ self.logger.info(f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}")
1145
+ self.logger.info(f"Stations: {station_count}")
1146
+ self.logger.info(f"VRAM Used per Task: {best_config['VRAM Used Per Task']}")
1147
+ self.logger.info(f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")
1148
+
1149
+ # return int(float(best_config["Number of CPUs Allocated for Ray to Use"])), \
1150
+ # int(float(best_config["Number of Concurrent Station Tasks"])), \
1151
+ # int(float(best_config["Intra-parallelism Threads"])), \
1152
+ # int(float(best_config["Inter-parallelism Threads"])), \
1153
+ # gpu_list, \
1154
+ # int(float(best_config["VRAM Used Per Task"])), \
1155
+ # int(float(station_count))
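Likewise, a usage sketch for the GPU finder (placeholder paths; gpu_list must match the "GPUs Used" tuples recorded in the evaluation CSV):

    gpu_finder = OptimalGPUConfigurationFinder(
        eval_sys_results_dir="/data/eval_out/csv",
        log_file_path="/data/eval_out/finder.log")
    gpu_finder.find_best_overall_usecase()
    gpu_finder.find_optimal_for(num_cpus=4, gpu_list=[0], station_count=20)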