eqcctpro 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of eqcctpro might be problematic.

eqcctpro/tools.py ADDED
@@ -0,0 +1,871 @@
1
+ """
2
+ tools.py contains the sub-tool functions that EQCCTPro uses, such as getting the current system's VRAM, creating subdirs, etc.
3
+ """
4
+ from __future__ import annotations
5
+ import os
6
+ import re
7
+ import ast
8
+ import math
9
+ import glob
10
+ import shutil
11
+ import pynvml
12
+ import logging
13
+ import pandas as pd
14
+ from math import floor
15
+ from pathlib import Path
16
+ from dataclasses import dataclass, field # dataclass auto-generates __init__/__repr__/__eq__; field customizes per-attribute defaults and behavior
17
+ from datetime import datetime, timedelta
18
+ from typing import Iterable, Callable, Optional, List
19
+
20
+ # Canonical CSV header used consistently across all CSVs generated by EQCCTPro
21
+ CANONICAL_CSV_HEADER = [
22
+ "Trial Number",
23
+ "Stations Used",
24
+ "Number of Stations Used",
25
+ "Number of CPUs Allocated for Ray to Use",
26
+ "Intra-parallelism Threads",
27
+ "Inter-parallelism Threads",
28
+ "GPUs Used",
29
+ "VRAM Used Per Task",
30
+ "Total Waveform Analysis Timespace (min)",
31
+ "Total Number of Timechunks",
32
+ "Concurrent Timechunks Used",
33
+ "Length of Timechunk (min)",
34
+ "Number of Concurrent Station Tasks",
35
+ "Total Run time for Picker (s)",
36
+ "Trial Success",
37
+ "Error Message",
38
+ ]
39
+
40
+ """
41
+ build_station_list_from_dir, looks_like_timechunk_id, and _TIMECHUNK_RE work together to discover the stations inside a timechunk directory.
42
+ build_station_list_from_dir builds the list of stations in a timechunk dir, using looks_like_timechunk_id to verify that a subdirectory of the
43
+ given input_dir is not itself a timechunk_id; if it is, it is filtered out.
44
+ """
45
+ # Regex describing the timechunk directory naming pattern (e.g. 20240101T000000Z_20240101T003000Z)
46
+ _TIMECHUNK_RE = re.compile(r"^\d{8}T\d{6}Z_\d{8}T\d{6}Z$")
47
+
48
+ def looks_like_timechunk_id(name: str) -> bool:
49
+ return bool(_TIMECHUNK_RE.match(name or ""))
50
+
51
+ def build_station_list_from_dir(input_dir: str) -> list[str]:
52
+ """
53
+ Robustly discover stations under a timechunk directory.
54
+ Accepts files like *.mseed/*.sac or one-dir-per-station structures.
55
+ """
56
+ stations = set()
57
+
58
+ # 1) Files directly inside input_dir
59
+ for p in glob.glob(os.path.join(input_dir, "*")):
60
+ base = os.path.basename(p)
61
+ if os.path.isfile(p):
62
+ # file path — take stem without extension
63
+ stations.add(os.path.splitext(base)[0])
64
+
65
+ # 2) One subdir per station (e.g., input_dir/AT01/*.mseed)
66
+ for p in glob.glob(os.path.join(input_dir, "*")):
67
+ if os.path.isdir(p):
68
+ stations.add(os.path.basename(p))
69
+
70
+ # Filter out anything that looks like a timechunk id (safety)
71
+ stations = [s for s in stations if not looks_like_timechunk_id(s)]
72
+
73
+ return sorted(stations)
74
+
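+ # Illustrative usage sketch (hypothetical directory layout, not part of the original module):
+ #
+ #     >>> looks_like_timechunk_id("20240101T000000Z_20240101T003000Z")
+ #     True
+ #     >>> looks_like_timechunk_id("AT01")
+ #     False
+ #     >>> # With input_dir containing an AT01.mseed file and a PB05/ subdirectory:
+ #     >>> build_station_list_from_dir("/data/eqcct/20240101T000000Z_20240101T003000Z")
+ #     ['AT01', 'PB05']
+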
75
+ """
76
+ generate_station_list builds the list of station counts to use in the trial iterations, stepping from a given starting number of stations
77
+ up to a final number of stations with either a user-specified or a pre-determined step size.
78
+ """
79
+ def generate_station_list(starting_amount_of_stations, total_num_stations_to_use, station_list_step_size):
80
+ if total_num_stations_to_use == 1:
81
+ return [1]
82
+ elif total_num_stations_to_use <= 10:
83
+ return list(range(1, total_num_stations_to_use + 1))
84
+ elif starting_amount_of_stations == 1 and station_list_step_size == 1:
85
+ # Numbers 1-10
86
+ station_list = list(range(1, 11))
87
+
88
+ # Multiples of 5 up to total_num_stations_to_use
89
+ multiples_of_5 = list(range(15, total_num_stations_to_use + 1, 5))
90
+
91
+ # Any additional numbers between 21 and total_num_stations_to_use
92
+ additional_numbers = list(range(21, total_num_stations_to_use + 1))
93
+
94
+ # Combine lists while ensuring uniqueness
95
+ return sorted(set(station_list + multiples_of_5 + additional_numbers))
96
+ else:
97
+ return list(range(starting_amount_of_stations, total_num_stations_to_use + 1, station_list_step_size))
98
+
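+ # Worked example (values chosen for illustration only):
+ #
+ #     >>> generate_station_list(1, 30, 1)
+ #     [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
+ #     >>> generate_station_list(5, 30, 5)
+ #     [5, 10, 15, 20, 25, 30]
+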
99
+ """
100
+ list_gpu_ids returns a list of available GPU IDs on the system.
101
+ """
102
+ def list_gpu_ids():
103
+ """List all available GPU IDs on the system."""
104
+ pynvml.nvmlInit() # Initialize NVML
105
+ gpu_count = pynvml.nvmlDeviceGetCount() # Get number of GPUs
106
+ gpu_ids = list(range(gpu_count)) # Create a list of GPU indices
107
+ pynvml.nvmlShutdown() # Shutdown NVML
108
+ return gpu_ids
109
+
110
+ """
111
+ get_gpu_vram retrieves the total and free VRAM (in GB) for a specified GPU index.
112
+ """
113
+ def get_gpu_vram(gpu_index):
114
+ """Retrieve total and free VRAM (in GB) for the specified GPU."""
115
+ pynvml.nvmlInit() # Initialize NVML
116
+ handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index) # Use provided GPU
117
+ total_vram = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**3) # Convert bytes to GB
118
+ free_vram = pynvml.nvmlDeviceGetMemoryInfo(handle).free / (1024**3) # Convert bytes to GB
119
+ pynvml.nvmlShutdown() # Shutdown NVML
120
+ return total_vram, free_vram
121
+
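+ # Usage sketch (requires an NVIDIA GPU and the pynvml package; the numbers below are illustrative):
+ #
+ #     >>> list_gpu_ids()
+ #     [0, 1]
+ #     >>> total_gb, free_gb = get_gpu_vram(0)
+ #     >>> round(total_gb), round(free_gb)
+ #     (24, 22)
+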
122
+ """
123
+ _parse_gpus_field normalizes the 'GPUs Used' field from the CSV into a list[int].
124
+ """
125
+ def _parse_gpus_field(x):
126
+ if x is None:
127
+ return []
128
+ if isinstance(x, (list, tuple)):
129
+ return [int(v) for v in x]
130
+ if isinstance(x, (int, float)) and not (isinstance(x, float) and math.isnan(x)):
131
+ return [int(x)]
132
+ if isinstance(x, str):
133
+ s = x.strip()
134
+ if not s:
135
+ return []
136
+ s = s.replace("(", "[").replace(")", "]").replace("{", "[").replace("}", "]")
137
+ try:
138
+ val = ast.literal_eval(s)
139
+ if isinstance(val, (list, tuple)):
140
+ return [int(v) for v in val]
141
+ if isinstance(val, (int, float)):
142
+ return [int(val)]
143
+ except Exception:
144
+ nums = re.findall(r"-?\d+", s)
145
+ return [int(n) for n in nums]
146
+ return []
147
+
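+ # Normalization examples (assumed CSV cell values, for illustration):
+ #
+ #     >>> _parse_gpus_field("[0, 1]")
+ #     [0, 1]
+ #     >>> _parse_gpus_field("(2,)")
+ #     [2]
+ #     >>> _parse_gpus_field("gpu 0 and 3")
+ #     [0, 3]
+ #     >>> _parse_gpus_field("")
+ #     []
+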
148
+ """
149
+ VramPlan supports the RunEQCCTPro functionality: when the user requests a certain amount of VRAM per Raylet,
150
+ it records whether the requested amount exceeds what the system can actually supply.
151
+ """
152
+ # Value object holding the VRAM budget computed by evaluate_vram_capacity below
153
+ @dataclass
154
+ class VramPlan:
155
+ intended_workers: int
156
+ per_worker_mb: float
157
+ overhead_mb: float
158
+ model_vram_mb: float
159
+ per_gpu_cap_mb: List[float] # post safety-cap
160
+ max_workers_per_gpu: List[int]
161
+ total_max_workers: int
162
+ aggregate_cap_mb: float
163
+ aggregate_need_mb: float
164
+ ok_per_gpu: bool
165
+ ok_aggregate: bool
166
+ # Optional; value-object semantics preserved (not shown in repr/eq)
167
+ logger: Optional[logging.Logger] = field(default=None, repr=False, compare=False)
168
+
169
+ def evaluate_vram_capacity(
170
+ *,
171
+ intended_workers: int,
172
+ vram_per_worker_mb: float,
173
+ per_gpu_free_mb: Iterable[float], # free MB per selected GPU (raw)
174
+ model_vram_mb: float = 3000.0, # reserve per GPU
175
+ safety_cap: float = 0.90, # cap each GPU
176
+ eqcct_overhead_gb: float = 1.1 # per-worker overhead (GB)
177
+ ) -> VramPlan:
178
+ """
179
+ Computes both per-GPU admission and aggregate budget feasibility.
180
+ Pure: no logging/printing/raising.
181
+ """
182
+ overhead_mb = eqcct_overhead_gb * 1024.0
183
+ per_gpu_cap_mb = [safety_cap * float(x) for x in per_gpu_free_mb]
184
+
185
+ denom = (vram_per_worker_mb + overhead_mb)
186
+ if denom <= 0:
187
+ raise ValueError("Non-positive per-worker memory requirement.")
188
+
189
+ max_workers_per_gpu = []
190
+ for cap in per_gpu_cap_mb:
191
+ remaining = max(0.0, cap - model_vram_mb) # reserve the model actor once per GPU
192
+ maxw = max(0, floor(remaining / denom))
193
+ max_workers_per_gpu.append(maxw)
194
+
195
+ total_max_workers = sum(max_workers_per_gpu)
196
+ ok_per_gpu = intended_workers <= total_max_workers
197
+
198
+ aggregate_cap_mb = sum(per_gpu_cap_mb)
199
+ aggregate_need_mb = (model_vram_mb * len(per_gpu_cap_mb)) + intended_workers * denom
200
+ ok_aggregate = aggregate_need_mb <= aggregate_cap_mb
201
+
202
+ return VramPlan(
203
+ intended_workers=intended_workers,
204
+ per_worker_mb=vram_per_worker_mb,
205
+ overhead_mb=overhead_mb,
206
+ model_vram_mb=model_vram_mb,
207
+ per_gpu_cap_mb=per_gpu_cap_mb,
208
+ max_workers_per_gpu=max_workers_per_gpu,
209
+ total_max_workers=total_max_workers,
210
+ aggregate_cap_mb=aggregate_cap_mb,
211
+ aggregate_need_mb=aggregate_need_mb,
212
+ ok_per_gpu=ok_per_gpu,
213
+ ok_aggregate=ok_aggregate,
214
+ )
215
+
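+ # Worked example (illustrative numbers only): two GPUs each reporting 20000 MB free,
+ # 2000 MB requested per worker, and the defaults for model reserve (3000 MB),
+ # safety cap (0.90), and per-worker overhead (1.1 GB = 1126.4 MB):
+ #
+ #     >>> plan = evaluate_vram_capacity(intended_workers=6, vram_per_worker_mb=2000.0,
+ #     ...                               per_gpu_free_mb=[20000.0, 20000.0])
+ #     >>> plan.per_gpu_cap_mb          # 0.90 * 20000
+ #     [18000.0, 18000.0]
+ #     >>> plan.max_workers_per_gpu     # floor((18000 - 3000) / (2000 + 1126.4))
+ #     [4, 4]
+ #     >>> plan.ok_per_gpu, plan.ok_aggregate
+ #     (True, True)
+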
216
+ def _emit(logger: Optional[logging.Logger], level: str, msg: str) -> None:
217
+ if logger is not None:
218
+ getattr(logger, level)(msg)
219
+ else:
220
+ print(msg)
221
+
222
+ # ---------- Thin wrappers preserving the two existing call sites ----------
223
+
224
+ def check_vram_per_gpu_style(
225
+ *,
226
+ selected_gpus: List[int],
227
+ get_gpu_vram_fn: Callable[[int], tuple[float, float]], # returns (total_gb, free_gb) for that GPU
228
+ intended_workers: int,
229
+ vram_mb: float,
230
+ model_vram_mb: float = 3000.0,
231
+ safety_cap: float = 0.95, # matches the original 95% cap
233
+ eqcct_overhead_gb: float = 0.0, # the original per-GPU check ignored runtime overhead
233
+ logger: Optional[logging.Logger] = None
234
+ ) -> None:
235
+ per_gpu_free_mb = [(get_gpu_vram_fn(gid)[1] * 1024.0) for gid in selected_gpus]
236
+
237
+ plan = evaluate_vram_capacity(
238
+ intended_workers=intended_workers,
239
+ vram_per_worker_mb=float(vram_mb),
240
+ per_gpu_free_mb=per_gpu_free_mb,
241
+ model_vram_mb=model_vram_mb,
242
+ safety_cap=safety_cap,
243
+ eqcct_overhead_gb=eqcct_overhead_gb,
244
+ )
245
+ plan.logger = logger
246
+
247
+ if not plan.ok_per_gpu:
248
+ # Compose precise diagnostic
249
+ unit = plan.per_worker_mb + plan.overhead_mb
250
+ msg = (
251
+ f"ERROR: Per-GPU capacity insufficient for {plan.intended_workers} workers.\n"
252
+ f" Reservation per GPU: model={plan.model_vram_mb:.0f} MB, worker_unit={unit:.0f} MB "
253
+ f"({plan.per_worker_mb:.0f} + {plan.overhead_mb:.0f})\n"
254
+ f" Per-GPU caps after safety: {', '.join(f'{c:.0f}' for c in plan.per_gpu_cap_mb)} MB\n"
255
+ f" Max workers per GPU: {plan.max_workers_per_gpu} (total={plan.total_max_workers})\n"
256
+ f"Action: lower vram_mb, reduce concurrency, or add GPUs."
257
+ )
258
+ _emit(logger, "error", msg)
259
+ raise RuntimeError(msg)
260
+
261
+ _emit(logger, "info",
262
+ f"Per-GPU admission OK: {plan.intended_workers} ≤ {plan.total_max_workers} workers.")
263
+
264
+ def check_vram_aggregate_style(
265
+ *,
266
+ eval_mode: str,
267
+ selected_gpus: List[int],
268
+ get_cluster_free_gb_fn: Callable[[], tuple[float, float]], # returns (total_gb, free_gb)
269
+ intended_workers: int,
270
+ vram_mb: float,
271
+ model_vram_mb: float = 3000.0,
272
+ safety_cap: float = 0.90,
273
+ eqcct_overhead_gb: float = 1.1,
274
+ logger: Optional[logging.Logger] = None
275
+ ) -> None:
276
+ if eval_mode.lower() != "gpu":
277
+ raise ValueError(f"vram_mb is only meaningful in GPU mode; got eval_mode='{eval_mode}'.")
278
+
279
+ # Homogeneous-GPU assumption (matches the prior logic): the same free_gb value is replicated for each selected GPU
280
+ _, free_gb = get_cluster_free_gb_fn()
281
+ per_gpu_free_mb = [free_gb * 1024.0] * len(selected_gpus)
282
+
283
+ plan = evaluate_vram_capacity(
284
+ intended_workers=intended_workers,
285
+ vram_per_worker_mb=float(vram_mb),
286
+ per_gpu_free_mb=per_gpu_free_mb,
287
+ model_vram_mb=model_vram_mb,
288
+ safety_cap=safety_cap,
289
+ eqcct_overhead_gb=eqcct_overhead_gb,
290
+ )
291
+ plan.logger = logger
292
+
293
+ if not plan.ok_aggregate:
294
+ unit = plan.per_worker_mb + plan.overhead_mb
295
+ msg = (
296
+ f"ERROR: Aggregate VRAM insufficient.\n"
297
+ f" GPUs: {len(selected_gpus)} | Safety cap: {int(safety_cap*100)}%\n"
298
+ f" Aggregate cap: {plan.aggregate_cap_mb:.0f} MB\n"
299
+ f" Aggregate need: {plan.aggregate_need_mb:.0f} MB "
300
+ f"(= {model_vram_mb:.0f}×{len(selected_gpus)} + {intended_workers}×{unit:.0f})\n"
301
+ f"Action: lower vram_mb, reduce concurrency, or add GPUs."
302
+ )
303
+ _emit(logger, "error", msg)
304
+ raise RuntimeError(msg)
305
+
306
+ _emit(logger, "info",
307
+ f"Aggregate budget OK. Need {plan.aggregate_need_mb:.0f} MB ≤ "
308
+ f"Cap {plan.aggregate_cap_mb:.0f} MB across {len(selected_gpus)} GPU(s).")
309
+
310
+
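+ # Usage sketch for the per-GPU wrapper (hypothetical values; get_gpu_vram above supplies the (total_gb, free_gb) tuple):
+ #
+ #     >>> check_vram_per_gpu_style(selected_gpus=[0], get_gpu_vram_fn=get_gpu_vram,
+ #     ...                          intended_workers=4, vram_mb=2000.0)
+ #     Per-GPU admission OK: 4 ≤ ... workers.
+ #
+ # If the request cannot be admitted, a RuntimeError with the diagnostic message is raised instead.
+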
311
+ """
312
+ prepare_csv either loads or initializes the CSV file for storing test results.
313
+ """
314
+ def prepare_csv(csv_file_path, logger):
315
+ """
316
+ Loads or initializes the CSV file for storing test results.
317
+ """
318
+ if os.path.exists(csv_file_path):
319
+ logger.info(f"Loading existing CSV file from '{csv_file_path}'...")
320
+ return pd.read_csv(csv_file_path)
321
+ logger.info(f"CSV file not found. Creating a new CSV file at '{csv_file_path}'...")
322
+
323
+ columns = CANONICAL_CSV_HEADER
324
+ df = pd.DataFrame(columns=columns)
325
+ df.to_csv(csv_file_path, index=False)
326
+ return df
+
327
+ """
328
+ append_trial_row appends a completed trial dictionary as the last row of the CSV. This happens after a trial completes successfully during the
329
+ EvaluateSystem process for either CPU or GPU. It does not fill in the trial's success flag or error message (update_csv does that).
330
+ """
331
+ def append_trial_row(csv_path: str, trial_data: dict):
332
+ """
333
+ Append a complete trial row to the CSV with all fields populated.
334
+ """
335
+ csvp = Path(csv_path)
336
+
337
+ # Ensure header exists with canonical order
338
+ if not csvp.exists():
339
+ pd.DataFrame(columns=CANONICAL_CSV_HEADER).to_csv(csvp, index=False)
340
+
341
+ df_existing = pd.read_csv(csvp, keep_default_na=False)
342
+
343
+ # Align row to the canonical header (use empty string for missing keys)
344
+ row = {col: trial_data.get(col, "") for col in CANONICAL_CSV_HEADER}
345
+
346
+ # Auto-number trials if not provided
347
+ if pd.isna(row["Trial Number"]) or row["Trial Number"] == "" or row["Trial Number"] is None:
348
+ row["Trial Number"] = len(df_existing) + 1
349
+
350
+ df_new = pd.DataFrame([row], columns=CANONICAL_CSV_HEADER)
351
+ df_out = pd.concat([df_existing, df_new], ignore_index=True)
352
+ df_out.to_csv(csvp, index=False)
353
+
354
+ print(f"Appended trial {row['Trial Number']} to {csv_path}")
355
+
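+ # Usage sketch (hypothetical values; any missing canonical columns are written as empty strings):
+ #
+ #     >>> append_trial_row("results.csv", {
+ #     ...     "Stations Used": "['AT01', 'PB05']",
+ #     ...     "Number of Stations Used": 2,
+ #     ...     "Total Run time for Picker (s)": 41.7,
+ #     ... })
+ #     Appended trial 1 to results.csv
+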
356
+ """
357
+ update_csv updates a completed trial after the code has exited the mseed_predictor loop and into the last steps of completing the trial.
358
+ If the trial either was a success or had errors, we update the last row with the success/error information of the trial.
359
+ """
360
+ def update_csv(csv_filepath, success, error_message):
361
+ df = pd.read_csv(csv_filepath)
362
+ if "Error Message" not in df.columns:
363
+ df["Error Message"] = ""
364
+
365
+ # Ensure string dtype
366
+ df["Error Message"] = df["Error Message"].astype("string")
367
+
368
+ last_idx = df.index[-1] # Get last row id number
369
+ df.loc[last_idx, 'Trial Success'] = success # Access value at row last_idx, column 'Trial Success'
370
+ df.loc[last_idx, 'Error Message'] = error_message # Access value at row last_idx, column 'Error Message'
371
+
372
+ df.to_csv(csv_filepath, index=False)
373
+
374
+ # """
375
+ # remove_directory removes a specified directory if it exists. (NOT CURRENTLY USED)
376
+ # """
377
+ # def remove_directory(path):
378
+ # """
379
+ # Removes the specified directory if it exists.
380
+ # """
381
+ # if os.path.exists(path):
382
+ # shutil.rmtree(path)
383
+ # print(f"Removed directory: {path}")
384
+ # else:
385
+ # print(f"Directory '{path}' does not exist anymore.")
386
+
387
+ """
388
+ remove_output_subdirs removes the *_outputs subdirectories of a specified directory. We use it to clear the contents of the 'output' dir
389
+ once a trial has completed; otherwise the code would recognize the directory name and believe the waveform was already analyzed,
390
+ when in reality it was analyzed in the previous configuration iteration. It does not remove trial contents or logs, just the station subdirs created by mseed_predictor
391
+ and parallel_predict.
392
+ """
393
+ def remove_output_subdirs(output_dir: str, logger: logging.Logger | None = None) -> None:
394
+ """
395
+ Delete any *_outputs subdirectories in `output_dir`.
396
+ Logs via `logger` if provided; otherwise falls back to print.
397
+ """
398
+ try:
399
+ for name in os.listdir(output_dir):
400
+ path = os.path.join(output_dir, name)
401
+ if os.path.isdir(path) and name.endswith("_outputs"):
402
+ shutil.rmtree(path, ignore_errors=True)
403
+ msg = f"Removed subdirectory: {path}"
404
+ (logger.info if logger else print)(msg)
405
+ except Exception as e:
406
+ msg = f"Failed to remove output subdirs in {output_dir}: {e}"
407
+ (logger.error if logger else print)(msg)
408
+
409
+ """
410
+ check_station_dirs verifies that the input dir's contents (i.e. the station subdirs in each timechunk dir)
412
+ have the same length, since every timechunk must contain the same set of stations.
412
+ """
413
+ def check_station_dirs(input_dir):
414
+ subdir_lens, station_list_f = [], []
415
+ sorted_input_dir = sorted(os.listdir(input_dir))
416
+ subdirs = [item for item in sorted_input_dir if os.path.isdir(os.path.join(input_dir, item))] # Inherits the sorted order
417
+ for timechunk_dir in subdirs:
418
+ subdir_path = os.path.join(input_dir, timechunk_dir)
419
+ try:
420
+ station_list = os.listdir(subdir_path)
421
+ subdir_lens.append(len(station_list))
422
+ station_list_f = station_list
423
+ except OSError as e:
424
+ print(f"Warning: Could not read directory {subdir_path}. Error: {e}")
425
+
426
+ check_if_lens_are_same = len(set(subdir_lens))
427
+ if check_if_lens_are_same != 1:
428
+ # Bad case
429
+ statement = f"The contents across your timechunk directories are not the same. They must match for EQCCTPro. Fix station subdirs so each timechunk has the same stations. Exiting..."
430
+ return statement, station_list_f, True
431
+ else:
432
+ statement = f"Stations subdirs in timechunk directories are consistent. Continuing EQCCTPro..."
433
+ return statement, station_list_f, False
434
+
435
+
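+ # Usage sketch (hypothetical input_dir containing timechunk subdirectories):
+ #
+ #     >>> statement, stations, should_exit = check_station_dirs("/data/eqcct/input")
+ #     >>> should_exit       # True means the timechunk dirs do not all contain the same stations
+ #     False
+ #     >>> print(statement)
+ #     Stations subdirs in timechunk directories are consistent. Continuing EQCCTPro...
+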
436
+ """
437
+ tf_environ sets up the TensorFlow environment for either an allocated CPU or GPU configuration.
438
+ For a CPU-only run, CUDA_VISIBLE_DEVICES is set to -1 to hide the GPUs; intra/inter parallelism threads are also set.
439
+ For a GPU run, CUDA_DEVICE_ORDER and CUDA_VISIBLE_DEVICES are set for the selected GPUs, intra/inter threads are configured, and each visible GPU is limited
440
+ to a fixed amount of VRAM (vram_limit_mb), which we need for the trials.
441
+ """
442
+ def tf_environ(gpu_id, vram_limit_mb=None, gpus_to_use=None, intra_threads=None, inter_threads=None, log_device=True, logger=None):
443
+ """
444
+ Configure TensorFlow to use fixed VRAM slices per visible GPU.
445
+ Call this ONCE per Ray actor, BEFORE building/loading any TF model.
446
+ """
447
+
448
+ # Normalize logger: if None, use a silent logger that discards records
449
+ if logger is None:
450
+ logger = logging.getLogger("eqcctpro.null")
451
+ logger.propagate = False
452
+ if not logger.handlers:
453
+ logger.addHandler(logging.NullHandler())
454
+
455
+ # C++ backend verbosity (must be set before importing TF)
456
+ os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3") # 0=all,1=INFO-,2=WARNING-,3=ERROR-
457
+ os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0") # avoid oneDNN “custom ops” info line
458
+
459
+ # 0) Visibility must be set BEFORE importing tensorflow
460
+ if gpu_id == -1 or not gpus_to_use:
461
+ os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
462
+ logger.info(f"GPU disabled (CPU-only).")
463
+ else:
464
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
465
+ os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpus_to_use))
466
+ logger.info(f"GPU enabled. Visible GPU IDs: {gpus_to_use}")
467
+
468
+ # 1) Now import TF (it will honor visibility)
469
+ import tensorflow as tf
470
+ tf.get_logger().setLevel(logging.ERROR)
471
+ try:
472
+ from absl import logging as absl_logging
473
+ absl_logging.set_verbosity(absl_logging.ERROR)
474
+ except Exception:
475
+ pass
476
+ if log_device:
477
+ tf.debugging.set_log_device_placement(True)
478
+
479
+ # 2) Threading (optional)
480
+ if intra_threads is not None:
481
+ tf.config.threading.set_intra_op_parallelism_threads(int(intra_threads))
482
+ logger.info(f"Configured Intra-op threads = {intra_threads}")
483
+ if inter_threads is not None:
484
+ tf.config.threading.set_inter_op_parallelism_threads(int(inter_threads))
485
+ logger.info(f"Configured Inter-op threads = {inter_threads}")
486
+
487
+ # 3) Configure fixed VRAM slices on all visible GPUs
488
+ vis_gpus = tf.config.list_physical_devices("GPU")
489
+ if not vis_gpus:
490
+ logger.info(f"No GPUs visible; proceeding on CPU.")
491
+ logger.info("")
492
+ return {"logical_gpus": [], "physical_gpus": []}
493
+
494
+ if vram_limit_mb is None or vram_limit_mb <= 0:
495
+ raise ValueError("vram_limit_mb must be a positive integer when using fixed VRAM slicing.")
496
+
497
+ try:
498
+ for gpu in vis_gpus:
499
+ # One logical device per physical GPU, each with a hard VRAM cap
500
+ tf.config.set_logical_device_configuration(gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=int(vram_limit_mb))])
501
+ # Force logical devices to materialize
502
+ # Logical devices are a virtual rep. of a physical hardware component (CPU/GPU) that TF creates to manage workload distribution
503
+ # Can have more logical devices than what you have physically, however you are constrained by the physical limitations of your hardware
504
+ logical = tf.config.list_logical_devices("GPU")
505
+ logger.info(
506
+ f"Set VRAM slicing: "
507
+ f"{vram_limit_mb} MB per logical GPU "
508
+ f"({len(logical)} logical over {len(vis_gpus)} physical)."
509
+ )
510
+ except RuntimeError as e:
511
+ # Happens if any TF GPU context was already initialized
512
+ raise RuntimeError(
513
+ "Failed to set logical device configuration. "
514
+ "Ensure tf_environ() is called before any TensorFlow GPU ops or model creation.\n"
515
+ f"Original error: {e}")
516
+
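+ # Usage sketch (hypothetical values): configure one visible GPU with a 2048 MB slice, once per Ray actor,
+ # before any TensorFlow model is built or loaded.
+ #
+ #     >>> tf_environ(gpu_id=0, vram_limit_mb=2048, gpus_to_use=[0],
+ #     ...            intra_threads=1, inter_threads=1, log_device=False)
+ #
+ # For a CPU-only run, gpu_id=-1 hides all GPUs:
+ #
+ #     >>> tf_environ(gpu_id=-1)
+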
517
+ """
518
+ find_optimal_configurations_cpu/gpu, at the end of the trial run-through, find:
519
+ 1. The best number of concurrent predictions for each (stations, CPUs/GPU VRAM) pair that results in the fastest runtime.
520
+ 2. The overall best configuration balancing stations, CPUs/GPU VRAM, and runtime.
521
+ """
522
+ def find_optimal_configurations_cpu(df):
523
+ """
524
+ Find:
525
+ 1. The best number of concurrent predictions for each (stations, CPUs) pair that results in the fastest runtime.
526
+ 2. The overall best configuration balancing stations, CPUs, and runtime.
527
+ """
528
+
529
+ # Convert relevant columns to numeric, handling NaNs gracefully
530
+ df["Number of Stations Used"] = pd.to_numeric(df["Number of Stations Used"], errors="coerce")
531
+ df["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df["Number of CPUs Allocated for Ray to Use"], errors="coerce")
532
+ df["Total Number of Timechunks"] = pd.to_numeric(df["Total Number of Timechunks"], errors="coerce")
533
+ df["Concurrent Timechunks Used"] = pd.to_numeric(df["Concurrent Timechunks Used"], errors="coerce")
534
+ df["Number of Concurrent Station Tasks"] = pd.to_numeric(df["Number of Concurrent Station Tasks"], errors="coerce")
535
+ df["Total Run time for Picker (s)"] = pd.to_numeric(df["Total Run time for Picker (s)"], errors="coerce")
536
+
537
+
538
+ # Drop rows with missing values in these essential columns
539
+ df_cleaned = df.dropna(subset=["Number of Stations Used", "Number of CPUs Allocated for Ray to Use",
540
+ "Concurrent Timechunks Used", "Number of Concurrent Station Tasks", "Total Run time for Picker (s)"])
541
+
542
+ # Find the best concurrent prediction configuration for each combination of (Stations, Timechunks, CPUs)
543
+ optimal_concurrent_preds = df_cleaned.loc[
544
+ df_cleaned.groupby(["Number of Stations Used", "Concurrent Timechunks Used", "Number of CPUs Allocated for Ray to Use"])
545
+ ["Total Run time for Picker (s)"].idxmin()
546
+ ]
547
+
548
+ # Define what "moderate" means in terms of CPU usage (e.g., middle 50% of available CPUs)
549
+ cpu_min = df_cleaned["Number of CPUs Allocated for Ray to Use"].quantile(0.25)
550
+ cpu_max = df_cleaned["Number of CPUs Allocated for Ray to Use"].quantile(0.75)
551
+
552
+ # Filter for rows within the moderate CPU range
553
+ df_moderate_cpus = df_cleaned[(df_cleaned["Number of CPUs Allocated for Ray to Use"] >= cpu_min) &
554
+ (df_cleaned["Number of CPUs Allocated for Ray to Use"] <= cpu_max)]
555
+
556
+ # Sort by the highest number of stations first, then by the fastest runtime
557
+ best_overall_config = df_moderate_cpus.sort_values(
558
+ by=["Number of Stations Used", "Total Run time for Picker (s)"],
559
+ ascending=[False, True] # Maximize stations, minimize runtime
560
+ ).iloc[0]
561
+
562
+ # Format the output for human readability
563
+ formatted_output = {
564
+ "Trial Number": best_overall_config["Trial Number"],
565
+ "Number of Stations Used": best_overall_config["Number of Stations Used"],
566
+ "Total Number of Timechunks": best_overall_config["Total Number of Timechunks"],
567
+ "Concurrent Timechunks Used": best_overall_config["Concurrent Timechunks Used"],
568
+ "Length of Timechunk (min)": str(best_overall_config["Length of Timechunk (min)"]),
569
+ "Total Waveform Analysis Timespace (min)": str(best_overall_config["Total Waveform Analysis Timespace (min)"]),
570
+ "Number of Concurrent Station Tasks per Timechunk": best_overall_config["Number of Concurrent Station Tasks"],
571
+ "Number of CPUs Allocated for Ray to Use": best_overall_config["Number of CPUs Allocated for Ray to Use"],
572
+ "Intra-parallelism Threads": best_overall_config["Intra-parallelism Threads"],
573
+ "Inter-parallelism Threads": best_overall_config["Inter-parallelism Threads"],
574
+ "Total Run time for Picker (s)": best_overall_config["Total Run time for Picker (s)"],
575
+ "Trial Success": best_overall_config["Trial Success"],
576
+ "Error Message": best_overall_config["Error Message"],
577
+ }
578
+
579
+ best_overall_df = pd.DataFrame([formatted_output])
580
+
581
+
582
+ return optimal_concurrent_preds, best_overall_df
583
+
584
+
585
+ def find_optimal_configurations_gpu(df):
586
+ """
587
+ Find:
588
+ 1) Best concurrency for each (stations, CPUs, GPUs, VRAM) combo (fastest runtime).
589
+ 2) Best overall balanced configuration.
590
+ """
591
+ # 1) Numeric normalization
592
+ numeric_cols = [
593
+ "Number of Stations Used",
594
+ "Number of CPUs Allocated for Ray to Use",
595
+ "Number of Concurrent Station Tasks",
596
+ "Total Run time for Picker (s)",
597
+ "VRAM Used Per Task",
598
+ ]
599
+ for col in numeric_cols:
600
+ df[col] = pd.to_numeric(df[col], errors="coerce")
601
+
602
+ # 2) Normalize GPUs Used -> list[int], then create a *hashable* key
603
+ df["GPUs Used"] = df["GPUs Used"].apply(_parse_gpus_field)
604
+ df["GPUs Used (key)"] = df["GPUs Used"].apply(lambda x: tuple(x) if isinstance(x, list) else tuple())
605
+
606
+ # 3) Drop rows missing essentials
607
+ essentials = numeric_cols + ["GPUs Used (key)"]
608
+ df_cleaned = df.dropna(subset=essentials).copy()
609
+
610
+ if df_cleaned.empty:
611
+ # Nothing to optimize; return empty frames shaped like callers expect
612
+ return df_cleaned, df_cleaned
613
+
614
+ # 4) Fastest runtime per (Stations, CPUs, GPUs, VRAM) bucket
615
+ grp_cols = [
616
+ "Number of Stations Used",
617
+ "Number of CPUs Allocated for Ray to Use",
618
+ "GPUs Used (key)",
619
+ "VRAM Used Per Task",
620
+ ]
621
+ idx = (
622
+ df_cleaned
623
+ .groupby(grp_cols)["Total Run time for Picker (s)"]
624
+ .idxmin()
625
+ )
626
+ optimal_concurrent_preds = df_cleaned.loc[idx].copy()
627
+
628
+ # For readability in outputs, show GPUs as list again
629
+ optimal_concurrent_preds["GPUs Used"] = optimal_concurrent_preds["GPUs Used (key)"].apply(list)
630
+ # (Optional) drop helper key in the returned table
631
+ optimal_concurrent_preds.drop(columns=["GPUs Used (key)"], inplace=True, errors="ignore")
632
+
633
+ # 5) “Moderate VRAM” window; if empty, fall back safely
634
+ vram_min = df_cleaned["VRAM Used Per Task"].quantile(0.25)
635
+ vram_max = df_cleaned["VRAM Used Per Task"].quantile(0.75)
636
+ df_moderate_vram = df_cleaned[
637
+ (df_cleaned["VRAM Used Per Task"] >= vram_min)
638
+ & (df_cleaned["VRAM Used Per Task"] <= vram_max)
639
+ ].copy()
640
+ if df_moderate_vram.empty:
641
+ df_moderate_vram = df_cleaned.copy()
642
+
643
+ # Highest stations first, then fastest runtime
644
+ best_overall_config = df_moderate_vram.sort_values(
645
+ by=["Number of Stations Used", "Total Run time for Picker (s)"],
646
+ ascending=[False, True],
647
+ ).iloc[0]
648
+
649
+ formatted_output = {
650
+ "Trial Number": best_overall_config["Trial Number"],
651
+ "Number of Stations Used": best_overall_config["Number of Stations Used"],
652
+ "Total Number of Timechunks": best_overall_config["Total Number of Timechunks"],
653
+ "Concurrent Timechunks Used": best_overall_config["Concurrent Timechunks Used"],
654
+ "Length of Timechunk (min)": str(best_overall_config["Length of Timechunk (min)"]),
655
+ "Total Waveform Analysis Timespace (min)": str(best_overall_config["Total Waveform Analysis Timespace (min)"]),
656
+ "Number of Concurrent Station Tasks per Timechunk": best_overall_config["Number of Concurrent Station Tasks"],
657
+ "Number of CPUs Allocated for Ray to Use": best_overall_config["Number of CPUs Allocated for Ray to Use"],
658
+ "GPUs Used": list(best_overall_config.get("GPUs Used (key)", ())) or best_overall_config.get("GPUs Used", []),
659
+ "VRAM Used Per Task": best_overall_config["VRAM Used Per Task"],
660
+ "Intra-parallelism Threads": best_overall_config["Intra-parallelism Threads"],
661
+ "Inter-parallelism Threads": best_overall_config["Inter-parallelism Threads"],
662
+ "Total Run time for Picker (s)": best_overall_config["Total Run time for Picker (s)"],
663
+ "Trial Success": best_overall_config["Trial Success"],
664
+ "Error Message": best_overall_config["Error Message"],
665
+ }
666
+ best_overall_df = pd.DataFrame([formatted_output])
667
+
668
+ return optimal_concurrent_preds, best_overall_df
669
+
670
+ """
671
+ find_optimal_configuration_cpu/gpu return the best overall usecase configuration from the evaluation results, whose values are then used as the
672
+ current operation's runtime configuration."""
673
+ def find_optimal_configuration_cpu(best_overall_usecase:bool, eval_sys_results_dir:str, cpu:int=None, station_count:int=None):
674
+ # Check if eval_sys_results_dir is valid
675
+ if not eval_sys_results_dir or not os.path.isdir(eval_sys_results_dir):
676
+ print(f"Error: The provided directory path '{eval_sys_results_dir}' is invalid or does not exist.")
677
+ print("Please provide a valid directory path for the input parameter 'csv_dir'.")
678
+ return exit() # Exit early if the directory is invalid
679
+
680
+ if best_overall_usecase is True:
681
+ file_path = f"{eval_sys_results_dir}/best_overall_usecase_cpu.csv"
682
+
683
+ # Check if the CSV file exists before reading
684
+ if not os.path.exists(file_path):
685
+ print(f"Error: The file '{file_path}' does not exist. Ensure the file is in the correct directory.")
686
+ return exit()
687
+
688
+ # Load the CSV
689
+ df_best_overall = pd.read_csv(file_path)
690
+ # Convert into a dictionary for easy access
691
+ best_config_dict = df_best_overall.set_index(df_best_overall.columns[0]).to_dict()[df_best_overall.columns[1]]
692
+
693
+ # Extract required values
694
+ num_cpus = best_config_dict.get("Number of CPUs Allocated for Ray to Use")
695
+ waveform_timespace = best_config_dict.get("Total Waveform Analysis Timespace (min)")
696
+ total_num_timechunks = best_config_dict.get("Total Number of Timechunks")
697
+ num_concurrent_timechunks = best_config_dict.get("Concurrent Timechunks Used")
698
+ length_of_timechunks = best_config_dict.get("Length of Timechunk (min)")
699
+ num_concurrent_stations = best_config_dict.get("Number of Concurrent Station Tasks")
700
+ intra_threads = best_config_dict.get("Intra-parallelism Threads")
701
+ inter_threads = best_config_dict.get("Inter-parallelism Threads")
702
+ num_stations = best_config_dict.get("Number of Stations Used")
703
+ total_runtime = best_config_dict.get("Total Run time for Picker (s)")
704
+
705
+ print("\nBest Overall Usecase Configuration Based on Trial Data:")
706
+ print(f"CPU: {num_cpus}\n"
707
+ f"Intra-parallelism Threads: {intra_threads}\n"
708
+ f"Inter-parallelism Threads: {inter_threads}\n"
709
+ f"Waveform Timespace: {waveform_timespace}"
710
+ f"Total Number of Timechunks: {total_num_timechunks}"
711
+ f"Length of Timechunks (min): {length_of_timechunks}"
712
+ f"Concurrent Timechunks: {num_concurrent_stations}\n"
713
+ f"Concurrent Stations: {num_concurrent_stations}\n"
714
+ f"Stations: {num_stations}\n"
715
+ f"Total Runtime (s): {total_runtime}")
716
+
717
+ # Return the extracted values
718
+ return int(float(num_cpus)), int(float(num_concurrent_stations)), int(float(intra_threads)), int(float(inter_threads)), int(float(num_stations))
719
+
720
+ else: # Optimal Configuration for User-Specified CPUs and Number of Stations to use
721
+ # Ensure valid CPU and station count values
722
+ if cpu is None or station_count is None:
723
+ print("Error: CPU and station_count must have valid values.")
724
+ return exit()
725
+
726
+ file_path = f"{eval_sys_results_dir}/optimal_configurations_cpu.csv"
727
+
728
+ # Check if the CSV file exists before reading
729
+ if not os.path.exists(file_path):
730
+ print(f"Error: The file '{file_path}' does not exist. Ensure the file is in the correct directory.")
731
+ return exit()
732
+
733
+
734
+ df_optimal = pd.read_csv(file_path)
735
+
736
+ # Convert relevant columns to numeric, handling NaNs gracefully
737
+ df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
738
+ df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
739
+ df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
740
+ df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
741
+ filtered_df = df_optimal[
742
+ (df_optimal["Number of CPUs Allocated for Ray to Use"] == cpu) &
743
+ (df_optimal["Number of Stations Used"] == station_count)]
744
+ if filtered_df.empty:
745
+ print("No matching configuration found. Please enter a valid entry.")
746
+ exit()
747
+
748
+ # Find the best configuration (fastest runtime)
749
+ best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]
750
+
751
+ print("\nBest Configuration for Requested Input Parameters Based on Trial Data:")
752
+ print(f"CPU: {cpu}\nConcurrent Predictions: {best_config['Number of Concurrent Station Tasks']}\n"
753
+ f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}\n"
754
+ f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}\n"
755
+ f"Stations: {station_count}\nTotal Runtime (s): {best_config['Total Run time for Picker (s)']}")
756
+
757
+ return int(float(cpu)), int(float(best_config["Number of Concurrent Station Tasks"])), int(float(best_config["Intra-parallelism Threads"])), int(float(best_config["Inter-parallelism Threads"])), int(float(station_count))
758
+
759
+
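+ # Usage sketch (hypothetical results directory produced by a previous EvaluateSystem run):
+ #
+ #     >>> num_cpus, n_conc, intra, inter, n_stations = find_optimal_configuration_cpu(
+ #     ...     best_overall_usecase=True,
+ #     ...     eval_sys_results_dir="/data/eqcct/eval_results")
+ #
+ # The five returned integers are then fed back in as the runtime configuration for the
+ # current run (CPUs, concurrent station tasks, intra/inter threads, station count).
+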
760
+ def find_optimal_configuration_gpu(best_overall_usecase: bool, eval_sys_results_dir: str, num_cpus: int = None, num_gpus: list = None, station_count: int = None):
761
+ """
762
+ Find the optimal GPU configuration for a given number of CPUs, GPUs, and stations.
763
+ Returns the best configuration including CPUs, concurrent predictions, intra/inter parallelism threads,
764
+ GPUs, VRAM, and stations.
765
+ """
766
+
767
+ # Check if eval_sys_results_dir is valid
768
+ if not eval_sys_results_dir or not os.path.isdir(eval_sys_results_dir):
769
+ print(f"Error: The provided directory path '{eval_sys_results_dir}' is invalid or does not exist.")
770
+ print("Please provide a valid directory path for the input parameter 'csv_dir'.")
771
+ return None # Exit early if the directory is invalid
772
+
773
+ if best_overall_usecase:
774
+ file_path = f"{eval_sys_results_dir}/best_overall_usecase_gpu.csv"
775
+
776
+ # Check if the CSV file exists before reading
777
+ if not os.path.exists(file_path):
778
+ print(f"Error: The file '{file_path}' does not exist. Ensure the file is in the correct directory.")
779
+ return None
780
+
781
+ # Load the CSV
782
+ df_best_overall = pd.read_csv(file_path, header=None, index_col=0)
783
+
784
+ # Convert into a dictionary for easy access
785
+ best_config_dict = df_best_overall.to_dict()[1] # Extract key-value pairs
786
+
787
+ # Extract required values
788
+ num_cpus = best_config_dict.get("Number of CPUs Allocated for Ray to Use")
789
+ num_concurrent_stations = best_config_dict.get("Number of Concurrent Station Tasks")
790
+ intra_threads = best_config_dict.get("Intra-parallelism Threads")
791
+ inter_threads = best_config_dict.get("Inter-parallelism Threads")
792
+ num_stations = best_config_dict.get("Number of Stations Used")
793
+ total_runtime = best_config_dict.get("Total Run time for Picker (s)")
794
+ vram_used = best_config_dict.get("VRAM Used Per Task")
795
+ num_gpus_st = best_config_dict.get("GPUs Used")
796
+ num_gpus = ast.literal_eval(num_gpus_st)
797
+
798
+ print("\nBest Overall Usecase Configuration Based on Trial Data:")
799
+ print(f"CPU: {num_cpus}\n"
800
+ f"GPU ID(s): {num_gpus}\n"
801
+ f"Concurrent Predictions: {num_concurrent_stations}\n"
802
+ f"Intra-parallelism Threads: {intra_threads}\n"
803
+ f"Inter-parallelism Threads: {inter_threads}\n"
804
+ f"Stations: {num_stations}\n"
805
+ f"VRAM Used per Task: {vram_used}\n"
806
+ f"Total Runtime (s): {total_runtime}")
807
+
808
+ return int(float(num_cpus)), int(float(num_concurrent_stations)), int(float(intra_threads)), int(float(inter_threads)), num_gpus, int(float(vram_used)), int(float(num_stations))
809
+
810
+ else: # Optimal Configuration for User-Specified CPUs, GPUs, and Number of Stations to use
811
+ # Ensure valid CPU, GPU, and station count values
812
+ if num_cpus is None or station_count is None or num_gpus is None:
813
+ print("Error: num_cpus, station_count, and num_gpus must have valid values.")
814
+ return None
815
+
816
+ file_path = f"{eval_sys_results_dir}/optimal_configurations_gpu.csv"
817
+
818
+ # Check if the CSV file exists before reading
819
+ if not os.path.exists(file_path):
820
+ print(f"Error: The file '{file_path}' does not exist. Ensure the file is in the correct directory.")
821
+ return None
822
+
823
+ df_optimal = pd.read_csv(file_path)
824
+
825
+ # Convert relevant columns to numeric, handling NaNs gracefully
826
+ df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
827
+ df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
828
+ df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
829
+ df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
830
+ df_optimal["VRAM Used Per Task"] = pd.to_numeric(df_optimal["VRAM Used Per Task"], errors="coerce")
831
+
832
+ # Convert "GPUs Used" from string representation to list
833
+ df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
834
+
835
+ # Convert GPU lists to tuples for comparison
836
+ df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: tuple(x) if isinstance(x, list) else (x,))
837
+
838
+ # Ensure num_gpus is in tuple format for comparison
839
+ num_gpus_tuple = tuple(num_gpus) if isinstance(num_gpus, list) else (num_gpus,)
840
+
841
+ filtered_df = df_optimal[
842
+ (df_optimal["Number of CPUs Allocated for Ray to Use"] == num_cpus) &
843
+ (df_optimal["GPUs Used"] == num_gpus_tuple) &
844
+ (df_optimal["Number of Stations Used"] == station_count)
845
+ ]
846
+
847
+ if filtered_df.empty:
848
+ print("No matching configuration found. Please enter a valid entry.")
849
+ exit()
850
+
851
+ # Find the best configuration (fastest runtime)
852
+ best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]
853
+
854
+ print("\nBest Configuration for Requested Application Usecase Based on Trial Data:")
855
+ print(f"CPU: {num_cpus}\n"
856
+ f"GPU: {num_gpus}\n"
857
+ f"Concurrent Predictions: {best_config['Number of Concurrent Station Tasks']}\n"
858
+ f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}\n"
859
+ f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}\n"
860
+ f"Stations: {station_count}\n"
861
+ f"VRAM Used per Task: {best_config['VRAM Used Per Task']}\n"
862
+ f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")
863
+
864
+ return int(float(best_config["Number of CPUs Allocated for Ray to Use"])), \
865
+ int(float(best_config["Number of Concurrent Station Tasks"])), \
866
+ int(float(best_config["Intra-parallelism Threads"])), \
867
+ int(float(best_config["Inter-parallelism Threads"])), \
868
+ num_gpus, \
869
+ int(float(best_config["VRAM Used Per Task"])), \
870
+ int(float(station_count))
871
+