eqcctpro 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eqcctpro/__init__.py +14 -2
- eqcctpro/eqcct_tf_models.py +407 -0
- eqcctpro/functionality.py +1424 -0
- eqcctpro/parallelization.py +1312 -0
- eqcctpro/seisbench_models.py +279 -0
- eqcctpro/tools.py +968 -0
- eqcctpro-0.7.0.dist-info/METADATA +312 -0
- eqcctpro-0.7.0.dist-info/RECORD +10 -0
- eqcctpro-0.6.2.dist-info/METADATA +0 -541
- eqcctpro-0.6.2.dist-info/RECORD +0 -5
- {eqcctpro-0.6.2.dist-info → eqcctpro-0.7.0.dist-info}/WHEEL +0 -0
- {eqcctpro-0.6.2.dist-info → eqcctpro-0.7.0.dist-info}/top_level.txt +0 -0
eqcctpro/tools.py
ADDED
@@ -0,0 +1,968 @@
"""
tools.py contains the sub-tool functions that EQCCTPro uses, such as getting the current system's VRAM, creating subdirs, etc.
"""
from __future__ import annotations
import os
import re
import ast
import math
import glob
import csv
import shutil
import pynvml
import logging
import pandas as pd
from math import floor
from pathlib import Path
from dataclasses import dataclass, field  # To-Do: Figure out what dataclass and field mean/do
from datetime import datetime, timedelta
from typing import Iterable, Callable, Optional, List

# We define the CSV header as a globally consistent CSV header for all CSVs generated by EQCCTPro
CANONICAL_CSV_HEADER = [
    "Trial Number",
    "Stations Used",
    "Number of Stations Used",
    "Number of CPUs Allocated for Ray to Use",
    "Intra-parallelism Threads",
    "Inter-parallelism Threads",
    "GPUs Used",
    "Inference Actor Memory Limit (MB)",
    "Total Waveform Analysis Timespace (min)",
    "Total Number of Timechunks",
    "Concurrent Timechunks Used",
    "Length of Timechunk (min)",
    "Number of Concurrent Station Tasks",
    "Total Run time for Picker (s)",
    "Model Used",
    "Trial Success",
    "Error Message",
]

"""
build_station_list_from_dir, looks_like_timechunk_id, and _TIMECHUNK_RE work together to discover the stations that are in a timechunk directory.
build_station_list_from_dir builds the list of stations in a timechunk dir, using looks_like_timechunk_id to make sure that a subdirectory of the
given input_dir is not itself a timechunk_id; if it is, it is filtered out.
"""
# Pattern describing the timechunk directory naming scheme (e.g. 20240101T000000Z_20240101T001000Z)
_TIMECHUNK_RE = re.compile(r"^\d{8}T\d{6}Z_\d{8}T\d{6}Z$")

def looks_like_timechunk_id(name: str) -> bool:
    return bool(_TIMECHUNK_RE.match(name or ""))

def build_station_list_from_dir(input_dir: str) -> list[str]:
    """
    Robustly discover stations under a timechunk directory.
    Accepts files like *.mseed/*.sac or one-dir-per-station structures.
    """
    stations = set()

    # 1) Files directly inside input_dir
    for p in glob.glob(os.path.join(input_dir, "*")):
        base = os.path.basename(p)
        if os.path.isfile(p):
            # file path — take stem without extension
            stations.add(os.path.splitext(base)[0])

    # 2) One subdir per station (e.g., input_dir/AT01/*.mseed)
    for p in glob.glob(os.path.join(input_dir, "*")):
        if os.path.isdir(p):
            stations.add(os.path.basename(p))

    # Filter out anything that looks like a timechunk id (safety)
    stations = [s for s in stations if not looks_like_timechunk_id(s)]

    return sorted(stations)

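# Illustrative sketch (hypothetical layout, not shipped with the package): given a timechunk
# directory such as
#   /data/mseed/20240101T000000Z_20240101T001000Z/
#       AT01/AT01.mseed
#       AT02/AT02.mseed
# build_station_list_from_dir("/data/mseed/20240101T000000Z_20240101T001000Z") would return
# ['AT01', 'AT02'], while any nested name matching _TIMECHUNK_RE would be filtered out.
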
"""
generate_station_list builds a list that contains the number of stations to use in the trial iterations, marching from a given amount of starting stations
to a final amount of stations with either a specified or a pre-determined step size.
"""
def generate_station_list(starting_amount_of_stations, total_num_stations_to_use, station_list_step_size):
    if total_num_stations_to_use == 1:
        return [1]
    elif total_num_stations_to_use < 10:
        return list(range(1, total_num_stations_to_use + 1))
    elif total_num_stations_to_use >= 10 and starting_amount_of_stations == 1 and station_list_step_size == 1:
        # We want to reduce unnecessary testing by marching with smarter step sizes

        # We check if we can apply multiples of 5 up to total_num_stations_to_use
        target = total_num_stations_to_use
        start = 10 # We start from 10 because we already covered 1-9 in the previous condition
        step = 5 # Step size of 5 for multiples of 5 (User can change this if desired - To Do add var for this in the future)

        stp_of_one = list(range(1, start + 1)) # We generate a list of 1-10 with step size of 1 and afterwards we will add multiples of 5

        remainder = target % step # We calculate the remainder to see if target is a multiple of 5 or not
        max_multiple = target - remainder # We calculate what is the maximum multiple of 5 that is less than or equal to the target
        if remainder != 0: # The target is not a multiple of 5, so we add the remaining numbers after the last multiple of 5 up to the target with step size of 1
            marching_scheme = stp_of_one + list(range(start + step, max_multiple + 1, step)) + list(range(max_multiple + 1, target + 1, 1))

        elif remainder == 0: # The target is a multiple of 5, so we march from 15, +5, ..., up to the target
            marching_scheme = stp_of_one + list(range(start + step, target + 1, step))

        return sorted(set(marching_scheme))
    else:
        return list(range(starting_amount_of_stations, total_num_stations_to_use + 1, station_list_step_size))

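# Illustrative outputs of the marching scheme above (values derived from the logic, shown only for clarity):
#   generate_station_list(1, 8, 1)   -> [1, 2, 3, 4, 5, 6, 7, 8]
#   generate_station_list(1, 20, 1)  -> [1, 2, ..., 10, 15, 20]
#   generate_station_list(1, 23, 1)  -> [1, 2, ..., 10, 15, 20, 21, 22, 23]
#   generate_station_list(5, 50, 5)  -> [5, 10, 15, ..., 50]
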
"""
list_gpu_ids returns a list of available GPU IDs on the system.
"""
def list_gpu_ids():
    """List all available GPU IDs on the system."""
    pynvml.nvmlInit() # Initialize NVML
    gpu_count = pynvml.nvmlDeviceGetCount() # Get number of GPUs
    gpu_ids = list(range(gpu_count)) # Create a list of GPU indices
    pynvml.nvmlShutdown() # Shutdown NVML
    return gpu_ids

"""
get_gpu_vram retrieves the total and free VRAM (in GB) for the specified GPU.
"""
def get_gpu_vram(gpu_index):
    """Retrieve total and free VRAM (in GB) for the specified GPU."""
    pynvml.nvmlInit() # Initialize NVML
    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index) # Use provided GPU
    total_vram = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**3) # Convert bytes to GB
    free_vram = pynvml.nvmlDeviceGetMemoryInfo(handle).free / (1024**3) # Convert bytes to GB
    pynvml.nvmlShutdown() # Shutdown NVML
    return total_vram, free_vram

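# Illustrative usage (requires an NVIDIA driver; the numbers below are hypothetical):
#   >>> list_gpu_ids()
#   [0, 1]
#   >>> get_gpu_vram(0)
#   (47.99, 45.12)   # (total_gb, free_gb)
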
"""
_parse_gpus_field normalizes the 'GPUs Used' field from a CSV into a list[int].
"""
def _parse_gpus_field(x):
    if x is None:
        return []
    if isinstance(x, (list, tuple)):
        return [int(v) for v in x]
    if isinstance(x, (int, float)) and not (isinstance(x, float) and math.isnan(x)):
        return [int(x)]
    if isinstance(x, str):
        s = x.strip()
        if not s:
            return []
        s = s.replace("(", "[").replace(")", "]").replace("{", "[").replace("}", "]")
        try:
            val = ast.literal_eval(s)
            if isinstance(val, (list, tuple)):
                return [int(v) for v in val]
            if isinstance(val, (int, float)):
                return [int(val)]
        except Exception:
            nums = re.findall(r"-?\d+", s)
            return [int(n) for n in nums]
    return []

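# Examples of the normalization performed above (derived from the logic, shown for clarity):
#   _parse_gpus_field("[0, 1]")  -> [0, 1]
#   _parse_gpus_field("(0,)")    -> [0]
#   _parse_gpus_field("0 and 1") -> [0, 1]   # falls back to the regex digit scan
#   _parse_gpus_field(2.0)       -> [2]
#   _parse_gpus_field("")        -> []
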
"""
VramPlan captures, for the RunEQCCTPro functionality, whether the amount of VRAM the user requests per Raylet worker
can actually be satisfied by the system, i.e. that the request does not exceed what the hardware is capable of.
"""
# To-Do: Understand how this works
@dataclass
class VramPlan:
    intended_workers: int
    per_worker_mb: float
    overhead_mb: float
    model_vram_mb: float
    per_gpu_cap_mb: List[float] # post safety-cap
    max_workers_per_gpu: List[int]
    total_max_workers: int
    aggregate_cap_mb: float
    aggregate_need_mb: float
    ok_per_gpu: bool
    ok_aggregate: bool
    # Optional; value-object semantics preserved (not shown in repr/eq)
    logger: Optional[logging.Logger] = field(default=None, repr=False, compare=False)

def evaluate_vram_capacity(
    *,
    intended_workers: int,
    vram_per_worker_mb: float,
    per_gpu_free_mb: Iterable[float], # free MB per selected GPU (raw)
    model_vram_mb: float = 3000.0, # reserve per GPU
    safety_cap: float = 0.90, # cap each GPU
    eqcct_overhead_gb: float = 1.1 # per-worker overhead (GB)
) -> VramPlan:
    """
    Computes both per-GPU admission and aggregate budget feasibility.
    Pure: no logging or printing; raises only on invalid input.
    """
    overhead_mb = eqcct_overhead_gb * 1024.0
    per_gpu_cap_mb = [safety_cap * float(x) for x in per_gpu_free_mb]

    denom = (vram_per_worker_mb + overhead_mb)
    if denom <= 0:
        raise ValueError("Non-positive per-worker memory requirement.")

    max_workers_per_gpu = []
    for cap in per_gpu_cap_mb:
        remaining = max(0.0, cap - model_vram_mb) # reserve the model actor once per GPU
        maxw = max(0, floor(remaining / denom))
        max_workers_per_gpu.append(maxw)

    total_max_workers = sum(max_workers_per_gpu)
    ok_per_gpu = intended_workers <= total_max_workers

    aggregate_cap_mb = sum(per_gpu_cap_mb)
    aggregate_need_mb = (model_vram_mb * len(per_gpu_cap_mb)) + intended_workers * denom
    ok_aggregate = aggregate_need_mb <= aggregate_cap_mb

    return VramPlan(
        intended_workers=intended_workers,
        per_worker_mb=vram_per_worker_mb,
        overhead_mb=overhead_mb,
        model_vram_mb=model_vram_mb,
        per_gpu_cap_mb=per_gpu_cap_mb,
        max_workers_per_gpu=max_workers_per_gpu,
        total_max_workers=total_max_workers,
        aggregate_cap_mb=aggregate_cap_mb,
        aggregate_need_mb=aggregate_need_mb,
        ok_per_gpu=ok_per_gpu,
        ok_aggregate=ok_aggregate,
    )

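# Worked example of the capacity math above (hypothetical free-memory figures):
#   two GPUs with 20,000 MB free each, vram_per_worker_mb=2000, defaults otherwise
#   overhead_mb          = 1.1 * 1024   = 1126.4 MB
#   per_gpu_cap_mb       = 0.90 * 20000 = 18000 MB per GPU
#   workers per GPU      = floor((18000 - 3000) / (2000 + 1126.4)) = 4  -> total_max_workers = 8
#   aggregate need (8 w) = 3000*2 + 8*3126.4 = 31011.2 MB <= 36000 MB aggregate cap -> ok_aggregate
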
def _emit(logger: Optional[logging.Logger], level: str, msg: str) -> None:
    if logger is not None:
        getattr(logger, level)(msg)
    else:
        print(msg)

# ---------- Thin wrappers to preserve your two call sites ----------

def check_vram_per_gpu_style(
    *,
    selected_gpus: List[int],
    get_gpu_vram_fn: Callable[[int], tuple[float, float]], # returns (total_gb, free_gb) for that GPU
    intended_workers: int,
    vram_mb: float,
    model_vram_mb: float = 3000.0,
    safety_cap: float = 0.95, # matches your original 95% cap
    eqcct_overhead_gb: float = 0.0, # original per-GPU check ignored runtime overhead
    logger: Optional[logging.Logger] = None
) -> None:

    safety_cap = float(safety_cap)
    if not (0.0 < safety_cap <= 0.99):
        raise ValueError(f"safety_cap must be in (0, 0.99], got {safety_cap}.")

    per_gpu_free_mb = [(get_gpu_vram_fn(gid)[1] * 1024.0) for gid in selected_gpus]

    plan = evaluate_vram_capacity(
        intended_workers=intended_workers,
        vram_per_worker_mb=float(vram_mb),
        per_gpu_free_mb=per_gpu_free_mb,
        model_vram_mb=model_vram_mb,
        safety_cap=safety_cap,
        eqcct_overhead_gb=eqcct_overhead_gb,
    )
    plan.logger = logger

    if not plan.ok_per_gpu:
        # Compose precise diagnostic
        unit = plan.per_worker_mb + plan.overhead_mb
        msg = (
            f"ERROR: Per-GPU capacity insufficient for {plan.intended_workers} workers.\n"
            f"  Reservation per GPU: model={plan.model_vram_mb:.0f} MB, worker_unit={unit:.0f} MB "
            f"({plan.per_worker_mb:.0f} + {plan.overhead_mb:.0f})\n"
            f"  Per-GPU caps after safety: {', '.join(f'{c:.0f}' for c in plan.per_gpu_cap_mb)} MB\n"
            f"  Max workers per GPU: {plan.max_workers_per_gpu} (total={plan.total_max_workers})\n"
            f"Action: lower vram_mb, reduce concurrency, or add GPUs."
        )
        _emit(logger, "error", msg)
        raise RuntimeError(msg)

    _emit(logger, "info",
          f"Per-GPU admission OK: {plan.intended_workers} ≤ {plan.total_max_workers} workers.")

def check_vram_aggregate_style(
    *,
    eval_mode: str,
    selected_gpus: List[int],
    get_cluster_free_gb_fn: Callable[[], tuple[float, float]], # returns (total_gb, free_gb)
    intended_workers: int,
    vram_mb: float,
    model_vram_mb: float = 3000.0,
    safety_cap: float = 0.90,
    eqcct_overhead_gb: float = 1.1,
    logger: Optional[logging.Logger] = None
) -> None:
    if eval_mode.lower() != "gpu":
        raise ValueError(f"vram_mb is only meaningful in GPU mode; got eval_mode='{eval_mode}'.")

    # Homogeneous assumption (your prior logic): same free_gb replicated
    _, free_gb = get_cluster_free_gb_fn()
    per_gpu_free_mb = [free_gb * 1024.0] * len(selected_gpus)

    plan = evaluate_vram_capacity(
        intended_workers=intended_workers,
        vram_per_worker_mb=float(vram_mb),
        per_gpu_free_mb=per_gpu_free_mb,
        model_vram_mb=model_vram_mb,
        safety_cap=safety_cap,
        eqcct_overhead_gb=eqcct_overhead_gb,
    )
    plan.logger = logger

    if not plan.ok_aggregate:
        unit = plan.per_worker_mb + plan.overhead_mb
        msg = (
            f"ERROR: Aggregate VRAM insufficient.\n"
            f"  GPUs: {len(selected_gpus)} | Safety cap: {int(safety_cap*100)}%\n"
            f"  Aggregate cap: {plan.aggregate_cap_mb:.0f} MB\n"
            f"  Aggregate need: {plan.aggregate_need_mb:.0f} MB "
            f"(= {model_vram_mb:.0f}×{len(selected_gpus)} + {intended_workers}×{unit:.0f})\n"
            f"Action: lower vram_mb, reduce concurrency, or add GPUs."
        )
        _emit(logger, "error", msg)
        raise RuntimeError(msg)

    _emit(logger, "info",
          f"Aggregate budget OK. Need {plan.aggregate_need_mb:.0f} MB ≤ "
          f"Cap {plan.aggregate_cap_mb:.0f} MB across {len(selected_gpus)} GPU(s).")


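# Illustrative call of the per-GPU admission check (the helper passed in is the one defined in this
# module; the worker count and vram_mb are hypothetical):
#   check_vram_per_gpu_style(
#       selected_gpus=[0, 1],
#       get_gpu_vram_fn=get_gpu_vram,
#       intended_workers=8,
#       vram_mb=2000,
#   )
#   # Logs a diagnostic and raises RuntimeError when the request cannot be admitted.
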
"""
prepare_csv either loads or initializes the CSV file for storing test results.
"""
def prepare_csv(csv_file_path, logger):
    """
    Loads or initializes the CSV file for storing test results.
    """
    if os.path.exists(csv_file_path):
        logger.info(f"Loading existing CSV file from '{csv_file_path}'...")
        return pd.read_csv(csv_file_path)
    logger.info(f"CSV file not found. Creating a new CSV file at '{csv_file_path}'...")

    columns = CANONICAL_CSV_HEADER
    df = pd.DataFrame(columns=columns)
    df.to_csv(csv_file_path, index=False)
    return df # Return the freshly initialized (empty) DataFrame as well

"""
append_trial_row appends a completed trial dictionary to the CSV as a new last row. It does this after successfully completing a trial during the
EvaluateSystem process for either CPU or GPU. It does not fill in the success flag or the error message for the trial (update_csv does).
"""
def append_trial_row(csv_path: str, trial_data: dict):
    """
    Append a complete trial row to the CSV with all fields populated.
    Ensures "GPUs Used" column is consistently formatted and quoted.
    """
    csvp = Path(csv_path)

    # Ensure header exists with canonical order
    if not csvp.exists():
        pd.DataFrame(columns=CANONICAL_CSV_HEADER).to_csv(csvp, index=False)

    df_existing = pd.read_csv(csvp, keep_default_na=False)

    # Align row to the canonical header (use empty string for missing keys)
    row = {col: trial_data.get(col, "") for col in CANONICAL_CSV_HEADER}

    # Normalize "GPUs Used" format to ensure consistent quoting
    if "GPUs Used" in row and row["GPUs Used"]:
        gpus_value = row["GPUs Used"]
        # If it's already a JSON string, parse and reformat consistently
        if isinstance(gpus_value, str):
            try:
                # Try to parse as JSON/list
                gpus_list = ast.literal_eval(gpus_value.replace("(", "[").replace(")", "]"))
                if isinstance(gpus_list, (list, tuple)):
                    # Format consistently: [0] or [0, 1] with space after comma
                    row["GPUs Used"] = "[" + ", ".join(map(str, gpus_list)) + "]"
                else:
                    row["GPUs Used"] = str(gpus_value)
            except Exception:
                row["GPUs Used"] = str(gpus_value)
        elif isinstance(gpus_value, (list, tuple)):
            row["GPUs Used"] = "[" + ", ".join(map(str, gpus_value)) + "]"
        else:
            row["GPUs Used"] = str(gpus_value) if gpus_value else "[]"

    # Auto-number trials if not provided
    if pd.isna(row["Trial Number"]) or row["Trial Number"] == "" or row["Trial Number"] is None:
        row["Trial Number"] = len(df_existing) + 1

    df_new = pd.DataFrame([row], columns=CANONICAL_CSV_HEADER)
    df_out = pd.concat([df_existing, df_new], ignore_index=True)

    # Write CSV with quoting to ensure "GPUs Used" is always quoted
    # Use QUOTE_NONNUMERIC to quote all non-numeric fields, ensuring consistency
    df_out.to_csv(csvp, index=False, quoting=csv.QUOTE_NONNUMERIC)

    print(f"Appended trial {row['Trial Number']} to {csv_path}")

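# Illustrative workflow for recording one trial (column values are hypothetical):
#   trial_data = {
#       "Stations Used": "['AT01', 'AT02']",
#       "Number of Stations Used": 2,
#       "Number of CPUs Allocated for Ray to Use": 8,
#       "GPUs Used": [0],
#       "Total Run time for Picker (s)": 42.7,
#       "Model Used": "eqcct",
#   }
#   append_trial_row("trial_results.csv", trial_data)
#   # ... finish the trial ...
#   update_csv("trial_results.csv", success=True, error_message="")
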
"""
normalize_gpu_csv_quoting ensures all "GPUs Used" entries in a CSV are consistently formatted and quoted.
This function can be used to update existing CSV files to have consistent formatting.
"""
def normalize_gpu_csv_quoting(csv_filepath: str):
    """
    Normalize the "GPUs Used" column in an existing CSV file to ensure consistent formatting and quoting.
    """
    if not os.path.exists(csv_filepath):
        return

    df = pd.read_csv(csv_filepath, keep_default_na=False)

    if "GPUs Used" not in df.columns:
        return

    # Normalize each "GPUs Used" entry
    for idx in df.index:
        gpus_value = df.at[idx, "GPUs Used"]
        if pd.isna(gpus_value) or gpus_value == "":
            df.at[idx, "GPUs Used"] = "[]"
        else:
            try:
                # Parse the GPU value (handles various formats)
                gpus_str = str(gpus_value).strip()
                # Remove existing quotes if present
                if gpus_str.startswith('"') and gpus_str.endswith('"'):
                    gpus_str = gpus_str[1:-1]

                # Parse as list
                gpus_list = ast.literal_eval(gpus_str.replace("(", "[").replace(")", "]"))
                if isinstance(gpus_list, (list, tuple)):
                    # Format consistently: [0] or [0, 1] with space after comma
                    df.at[idx, "GPUs Used"] = "[" + ", ".join(map(str, gpus_list)) + "]"
                else:
                    df.at[idx, "GPUs Used"] = str(gpus_value)
            except Exception:
                # If parsing fails, keep original but ensure it's a string
                df.at[idx, "GPUs Used"] = str(gpus_value)

    # Write back with consistent quoting
    df.to_csv(csv_filepath, index=False, quoting=csv.QUOTE_NONNUMERIC)

"""
update_csv updates the most recent trial row after the code has exited the mseed_predictor loop and reached the final steps of the trial.
Whether the trial succeeded or errored, we update the last row with the success flag and error message for that trial.
"""
def update_csv(csv_filepath, success, error_message):
    df = pd.read_csv(csv_filepath)
    if "Error Message" not in df.columns:
        df["Error Message"] = ""

    # Ensure string dtype
    df["Error Message"] = df["Error Message"].astype("string")

    last_idx = df.index[-1] # Get last row id number
    df.loc[last_idx, 'Trial Success'] = success # Access value at row last_idx, column 'Trial Success'
    df.loc[last_idx, 'Error Message'] = error_message # Access value at row last_idx, column 'Error Message'

    df.to_csv(csv_filepath, index=False)

# """
# remove_directory removes a specified directory if it exists. (NOT CURRENTLY USED)
# """
# def remove_directory(path):
#     """
#     Removes the specified directory if it exists.
#     """
#     if os.path.exists(path):
#         shutil.rmtree(path)
#         print(f"Removed directory: {path}")
#     else:
#         print(f"Directory '{path}' does not exist anymore.")

"""
remove_output_subdirs removes all the station subdirectories of a specified output directory. We use it to clear the contents of the 'output' dir
once a trial has completed; if we did not, the code would recognize the directory name and believe that the waveform was already analyzed,
when in reality it was analyzed in the previous configuration iteration. It does not remove trial contents or logs, just the station subdirs created
by mseed_predictor and parallel_predict.
"""
def remove_output_subdirs(output_dir: str, logger: logging.Logger | None = None) -> None:
    """
    Delete any *_outputs subdirectories in `output_dir`.
    Logs via `logger` if provided; otherwise falls back to print.
    """
    try:
        for name in os.listdir(output_dir):
            path = os.path.join(output_dir, name)
            if os.path.isdir(path) and name.endswith("_outputs"):
                shutil.rmtree(path, ignore_errors=True)
                msg = f"Removed subdirectory: {path}"
                (logger.info if logger else print)(msg)
    except Exception as e:
        msg = f"Failed to remove output subdirs in {output_dir}: {e}"
        (logger.error if logger else print)(msg)

"""
check_station_dirs verifies that the input dir's contents (i.e. the station subdirs in each timechunk dir)
all have the same length. EQCCTPro requires the station set to be consistent across every timechunk directory.
"""
def check_station_dirs(input_dir):
    subdir_lens, station_list_f = [], []
    sorted_input_dir = sorted(os.listdir(input_dir))
    subdirs = [item for item in sorted_input_dir if os.path.isdir(os.path.join(input_dir, item))] # Inherits the sorted order
    for timechunk_dir in subdirs:
        subdir_path = os.path.join(input_dir, timechunk_dir)
        try:
            station_list = os.listdir(subdir_path)
            subdir_lens.append(len(station_list))
            station_list_f = station_list
        except OSError as e:
            print(f"Warning: Could not read directory {subdir_path}. Error: {e}")

    check_if_lens_are_same = len(set(subdir_lens))
    if check_if_lens_are_same != 1:
        # Bad case
        statement = "The contents across your timechunk directories are not the same. They must match for EQCCTPro. Fix station subdirs so each timechunk has the same stations. Exiting..."
        return statement, station_list_f, True
    else:
        statement = "Stations subdirs in timechunk directories are consistent. Continuing EQCCTPro..."
        return statement, station_list_f, False


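# Illustrative layouts for the consistency check above (paths are hypothetical):
#   input_dir/
#     20240101T000000Z_20240101T001000Z/  AT01/ AT02/   -> 2 stations
#     20240101T001000Z_20240101T002000Z/  AT01/ AT02/   -> 2 stations  => consistent, flag False
#   If one timechunk held AT01/ AT02/ AT03/ while another held only AT01/, the lengths would differ
#   and check_station_dirs would return the error statement with the flag True.
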
"""
tf_environ sets up the TensorFlow environment for either an allocated CPU or GPU configuration.
For a CPU run, CUDA visibility is disabled and the intra/inter parallelism threads are set.
For a GPU run, the CUDA device order and visible GPUs are set, intra/inter threads are configured, and each visible GPU is
limited to a fixed amount of VRAM (vram_limit_mb), which we need for the trials.
"""
def tf_environ(gpu_id, vram_limit_mb=None, gpus_to_use=None, intra_threads=None, inter_threads=None, log_device=True, logger=None, skip_tf=False):
    """
    Configure TensorFlow to use fixed VRAM slices per visible GPU.
    Call this ONCE per Ray actor, BEFORE building/loading any TF model.
    """

    # Normalize logger: if None, use a silent logger that discards records
    if logger is None:
        logger = logging.getLogger("eqcctpro.null")
        logger.propagate = False
        if not logger.handlers:
            # logger.addHandler(logging.StreamHandler())
            logger.addHandler(logging.NullHandler())

    # C++ backend verbosity (must be set before importing TF)
    os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3") # 0=all,1=INFO-,2=WARNING-,3=ERROR-
    os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0") # avoid oneDNN “custom ops” info line

    # 0) Visibility must be set BEFORE importing tensorflow
    if gpu_id == -1:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        logger.info("GPU disabled (CPU-only).")
    elif gpus_to_use is not None:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpus_to_use))
        logger.info(f"GPU enabled. Visible GPU IDs: {gpus_to_use}")
    else:
        logger.info(f"GPU visibility left to environment (Ray). CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES')}")

    if skip_tf:
        logger.info("Skipping framework-specific initialization in the driver process.")
        return

    # 1) Now import TF (it will honor visibility)
    import tensorflow as tf
    tf.get_logger().setLevel(logging.ERROR)
    try:
        from absl import logging as absl_logging
        absl_logging.set_verbosity(absl_logging.ERROR)
    except Exception:
        pass
    if log_device:
        tf.debugging.set_log_device_placement(True)

    # 2) Threading (optional)
    if intra_threads is not None:
        tf.config.threading.set_intra_op_parallelism_threads(int(intra_threads))
        logger.info(f"Configured Intra-op threads = {intra_threads}")
    if inter_threads is not None:
        tf.config.threading.set_inter_op_parallelism_threads(int(inter_threads))
        logger.info(f"Configured Inter-op threads = {inter_threads}")

    # 3) Configure fixed VRAM slices on all visible GPUs
    vis_gpus = tf.config.list_physical_devices("GPU")
    if not vis_gpus:
        logger.info("TensorFlow: No GPUs visible; TF will proceed on CPU.")
        logger.info("")
        return {"logical_gpus": [], "physical_gpus": []}

    if vram_limit_mb is None or vram_limit_mb <= 0:
        raise ValueError("vram_limit_mb must be a positive integer when using fixed VRAM slicing.")

    try:
        for gpu in vis_gpus:
            # One logical device per physical GPU, each with a hard VRAM cap
            tf.config.set_logical_device_configuration(gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=int(vram_limit_mb))])
        # Force logical devices to materialize
        # Logical devices are a virtual rep. of a physical hardware component (CPU/GPU) that TF creates to manage workload distribution
        # Can have more logical devices than what you have physically, however you are constrained by the physical limitations of your hardware
        logical = tf.config.list_logical_devices("GPU")
        logger.info(
            f"Set VRAM slicing: "
            f"{vram_limit_mb} MB per logical GPU "
            f"({len(logical)} logical over {len(vis_gpus)} physical)."
        )
    except RuntimeError as e:
        # Happens if any TF GPU context was already initialized
        raise RuntimeError(
            "Failed to set logical device configuration. "
            "Ensure tf_environ() is called before any TensorFlow GPU ops or model creation.\n"
            f"Original error: {e}")

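# Illustrative calls (argument values are hypothetical):
#   tf_environ(gpu_id=-1, intra_threads=2, inter_threads=2)            # CPU-only actor
#   tf_environ(gpu_id=0, gpus_to_use=[0], vram_limit_mb=4096,
#              intra_threads=1, inter_threads=1, log_device=False)     # one GPU, 4 GB slice
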
"""
find_optimal_configurations_cpu/gpu run at the end of the trial sweep and find:
1. The best number of concurrent predictions for each (stations, CPUs/GPU VRAM) pair that results in the fastest runtime.
2. The overall best configuration balancing stations, CPUs/GPU VRAM, and runtime.
"""
def find_optimal_configurations_cpu(df):
    """
    Find:
    1. The best number of concurrent predictions for each (stations, CPUs) pair that results in the fastest runtime.
    2. The overall best configuration balancing stations, CPUs, and runtime.
    """

    # Convert relevant columns to numeric, handling NaNs gracefully
    df["Number of Stations Used"] = pd.to_numeric(df["Number of Stations Used"], errors="coerce")
    df["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df["Number of CPUs Allocated for Ray to Use"], errors="coerce")
    df["Total Number of Timechunks"] = pd.to_numeric(df["Total Number of Timechunks"], errors="coerce")
    df["Concurrent Timechunks Used"] = pd.to_numeric(df["Concurrent Timechunks Used"], errors="coerce")
    df["Number of Concurrent Station Tasks"] = pd.to_numeric(df["Number of Concurrent Station Tasks"], errors="coerce")
    df["Total Run time for Picker (s)"] = pd.to_numeric(df["Total Run time for Picker (s)"], errors="coerce")

    # Drop rows with missing values in these essential columns
    df_cleaned = df.dropna(subset=["Number of Stations Used", "Number of CPUs Allocated for Ray to Use",
                                   "Concurrent Timechunks Used", "Number of Concurrent Station Tasks", "Total Run time for Picker (s)"])

    # Find the best concurrent prediction configuration for each combination of (Stations, Timechunks, CPUs, Model)
    optimal_concurrent_preds = df_cleaned.loc[
        df_cleaned.groupby(["Number of Stations Used", "Concurrent Timechunks Used", "Number of CPUs Allocated for Ray to Use", "Model Used"])
        ["Total Run time for Picker (s)"].idxmin()
    ]

    # Define what "moderate" means in terms of CPU usage (e.g., middle 50% of available CPUs)
    cpu_min = df_cleaned["Number of CPUs Allocated for Ray to Use"].quantile(0.25)
    cpu_max = df_cleaned["Number of CPUs Allocated for Ray to Use"].quantile(0.75)

    # Filter for rows within the moderate CPU range
    df_moderate_cpus = df_cleaned[(df_cleaned["Number of CPUs Allocated for Ray to Use"] >= cpu_min) &
                                  (df_cleaned["Number of CPUs Allocated for Ray to Use"] <= cpu_max)]

    # Sort by the highest number of stations first, then by the fastest runtime
    best_overall_config = df_moderate_cpus.sort_values(
        by=["Number of Stations Used", "Total Run time for Picker (s)"],
        ascending=[False, True] # Maximize stations, minimize runtime
    ).iloc[0]

    # Format the output for human readability
    formatted_output = {
        "Trial Number": best_overall_config["Trial Number"],
        "Number of Stations Used": best_overall_config["Number of Stations Used"],
        "Total Number of Timechunks": best_overall_config["Total Number of Timechunks"],
        "Concurrent Timechunks Used": best_overall_config["Concurrent Timechunks Used"],
        "Length of Timechunk (min)": str(best_overall_config["Length of Timechunk (min)"]),
        "Total Waveform Analysis Timespace (min)": str(best_overall_config["Total Waveform Analysis Timespace (min)"]),
        "Number of Concurrent Station Tasks per Timechunk": best_overall_config["Number of Concurrent Station Tasks"],
        "Number of CPUs Allocated for Ray to Use": best_overall_config["Number of CPUs Allocated for Ray to Use"],
        "Intra-parallelism Threads": best_overall_config["Intra-parallelism Threads"],
        "Inter-parallelism Threads": best_overall_config["Inter-parallelism Threads"],
        "Total Run time for Picker (s)": best_overall_config["Total Run time for Picker (s)"],
        "Model Used": best_overall_config["Model Used"],
        "Trial Success": best_overall_config["Trial Success"],
        "Error Message": best_overall_config["Error Message"],
    }

    best_overall_df = pd.DataFrame([formatted_output])

    return optimal_concurrent_preds, best_overall_df


def find_optimal_configurations_gpu(df):
    """
    Find:
    1) Best concurrency for each (stations, CPUs, GPUs, VRAM) combo (fastest runtime).
    2) Best overall balanced configuration.
    """
    # 1) Numeric normalization
    numeric_cols = [
        "Number of Stations Used",
        "Number of CPUs Allocated for Ray to Use",
        "Number of Concurrent Station Tasks",
        "Total Run time for Picker (s)",
        "Inference Actor Memory Limit (MB)",
    ]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # 2) Normalize GPUs Used -> list[int], then create a *hashable* key
    df["GPUs Used"] = df["GPUs Used"].apply(_parse_gpus_field)
    df["GPUs Used (key)"] = df["GPUs Used"].apply(lambda x: tuple(x) if isinstance(x, list) else tuple())

    # 3) Drop rows missing essentials
    essentials = numeric_cols + ["GPUs Used (key)"]
    df_cleaned = df.dropna(subset=essentials).copy()

    if df_cleaned.empty:
        # Nothing to optimize; return empty frames shaped like callers expect
        return df_cleaned, df_cleaned

    # 4) Fastest runtime per (Stations, CPUs, GPUs, VRAM, Model) bucket
    grp_cols = [
        "Number of Stations Used",
        "Number of CPUs Allocated for Ray to Use",
        "GPUs Used (key)",
        "Inference Actor Memory Limit (MB)",
        "Model Used",
    ]
    idx = (
        df_cleaned
        .groupby(grp_cols)["Total Run time for Picker (s)"]
        .idxmin()
    )
    optimal_concurrent_preds = df_cleaned.loc[idx].copy()

    # For readability in outputs, show GPUs as list again
    optimal_concurrent_preds["GPUs Used"] = optimal_concurrent_preds["GPUs Used (key)"].apply(list)
    # (Optional) drop helper key in the returned table
    optimal_concurrent_preds.drop(columns=["GPUs Used (key)"], inplace=True, errors="ignore")

    # 5) “Moderate VRAM” window; if empty, fall back safely
    vram_min = df_cleaned["Inference Actor Memory Limit (MB)"].quantile(0.25)
    vram_max = df_cleaned["Inference Actor Memory Limit (MB)"].quantile(0.75)
    df_moderate_vram = df_cleaned[
        (df_cleaned["Inference Actor Memory Limit (MB)"] >= vram_min)
        & (df_cleaned["Inference Actor Memory Limit (MB)"] <= vram_max)
    ].copy()
    if df_moderate_vram.empty:
        df_moderate_vram = df_cleaned.copy()

    # Highest stations first, then fastest runtime
    best_overall_config = df_moderate_vram.sort_values(
        by=["Number of Stations Used", "Total Run time for Picker (s)"],
        ascending=[False, True],
    ).iloc[0]

    formatted_output = {
        "Trial Number": best_overall_config["Trial Number"],
        "Number of Stations Used": best_overall_config["Number of Stations Used"],
        "Total Number of Timechunks": best_overall_config["Total Number of Timechunks"],
        "Concurrent Timechunks Used": best_overall_config["Concurrent Timechunks Used"],
        "Length of Timechunk (min)": str(best_overall_config["Length of Timechunk (min)"]),
        "Total Waveform Analysis Timespace (min)": str(best_overall_config["Total Waveform Analysis Timespace (min)"]),
        "Number of Concurrent Station Tasks per Timechunk": best_overall_config["Number of Concurrent Station Tasks"],
        "Number of CPUs Allocated for Ray to Use": best_overall_config["Number of CPUs Allocated for Ray to Use"],
        "GPUs Used": list(best_overall_config.get("GPUs Used (key)", ())) or best_overall_config.get("GPUs Used", []),
        "Inference Actor Memory Limit (MB)": best_overall_config["Inference Actor Memory Limit (MB)"],
        "Intra-parallelism Threads": best_overall_config["Intra-parallelism Threads"],
        "Inter-parallelism Threads": best_overall_config["Inter-parallelism Threads"],
        "Total Run time for Picker (s)": best_overall_config["Total Run time for Picker (s)"],
        "Model Used": best_overall_config["Model Used"],
        "Trial Success": best_overall_config["Trial Success"],
        "Error Message": best_overall_config["Error Message"],
    }
    best_overall_df = pd.DataFrame([formatted_output])

    return optimal_concurrent_preds, best_overall_df

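# Illustrative post-sweep use. The output file names mirror the ones read back by
# find_optimal_configuration_gpu below; the trial CSV name is hypothetical and the actual
# writer lives elsewhere in the package:
#   trials_df = pd.read_csv("trial_results_gpu.csv")
#   per_bucket_best, overall_best = find_optimal_configurations_gpu(trials_df)
#   per_bucket_best.to_csv("optimal_configurations_gpu.csv", index=False)
#   overall_best.to_csv("best_overall_usecase_gpu.csv", index=False)
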
"""
find_optimal_configuration_cpu/gpu return the best overall usecase configuration from the saved results, so those values can be used as the
current operation's runtime configuration."""
def find_optimal_configuration_cpu(best_overall_usecase:bool, eval_sys_results_dir:str, cpu:int=None, station_count:int=None):
    # Check if eval_sys_results_dir is valid
    if not eval_sys_results_dir or not os.path.isdir(eval_sys_results_dir):
        print(f"Error: The provided directory path '{eval_sys_results_dir}' is invalid or does not exist.")
        print("Please provide a valid directory path for the input parameter 'csv_dir'.")
        return exit() # Exit early if the directory is invalid

    if best_overall_usecase is True:
        file_path = f"{eval_sys_results_dir}/best_overall_usecase_cpu.csv"

        # Check if the CSV file exists before reading
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' does not exist. Ensure the file is in the correct directory.")
            return exit()

        # Load the CSV
        df_best_overall = pd.read_csv(file_path)
        # Convert into a dictionary for easy access
        best_config_dict = df_best_overall.set_index(df_best_overall.columns[0]).to_dict()[df_best_overall.columns[1]]

        # Extract required values
        num_cpus = best_config_dict.get("Number of CPUs Allocated for Ray to Use")
        waveform_timespace = best_config_dict.get("Total Waveform Analysis Timespace (min)")
        total_num_timechunks = best_config_dict.get("Total Number of Timechunks")
        num_concurrent_timechunks = best_config_dict.get("Concurrent Timechunks Used")
        length_of_timechunks = best_config_dict.get("Length of Timechunk (min)")
        num_concurrent_stations = best_config_dict.get("Number of Concurrent Station Tasks")
        intra_threads = best_config_dict.get("Intra-parallelism Threads")
        inter_threads = best_config_dict.get("Inter-parallelism Threads")
        num_stations = best_config_dict.get("Number of Stations Used")
        total_runtime = best_config_dict.get("Total Run time for Picker (s)")
        model_used = best_config_dict.get("Model Used")

        print("\nBest Overall Usecase Configuration Based on Trial Data:")
        print(f"Model Used: {model_used}\n"
              f"CPU: {num_cpus}\n"
              f"Intra-parallelism Threads: {intra_threads}\n"
              f"Inter-parallelism Threads: {inter_threads}\n"
              f"Waveform Timespace: {waveform_timespace}\n"
              f"Total Number of Timechunks: {total_num_timechunks}\n"
              f"Length of Timechunks (min): {length_of_timechunks}\n"
              f"Concurrent Timechunks: {num_concurrent_timechunks}\n"
              f"Concurrent Stations: {num_concurrent_stations}\n"
              f"Stations: {num_stations}\n"
              f"Total Runtime (s): {total_runtime}")

        # Return the extracted values
        return int(float(num_cpus)), int(float(num_concurrent_stations)), int(float(intra_threads)), int(float(inter_threads)), int(float(num_stations))

    else: # Optimal Configuration for User-Specified CPUs and Number of Stations to use
        # Ensure valid CPU and station count values
        if cpu is None or station_count is None:
            print("Error: CPU and station_count must have valid values.")
            return exit()

        file_path = f"{eval_sys_results_dir}/optimal_configurations_cpu.csv"

        # Check if the CSV file exists before reading
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' does not exist. Ensure the file is in the correct directory.")
            return exit()

        df_optimal = pd.read_csv(file_path)

        # Convert relevant columns to numeric, handling NaNs gracefully
        df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
        df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
        df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
        df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
        filtered_df = df_optimal[
            (df_optimal["Number of CPUs Allocated for Ray to Use"] == cpu) &
            (df_optimal["Number of Stations Used"] == station_count)]
        if filtered_df.empty:
            print("No matching configuration found. Please enter a valid entry.")
            exit()

        # Find the best configuration (fastest runtime)
        best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]

        print("\nBest Configuration for Requested Input Parameters Based on Trial Data:")
        print(f"Model Used: {best_config.get('Model Used')}\n"
              f"CPU: {cpu}\nConcurrent Predictions: {best_config['Number of Concurrent Station Tasks']}\n"
              f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}\n"
              f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}\n"
              f"Stations: {station_count}\nTotal Runtime (s): {best_config['Total Run time for Picker (s)']}")

        return int(float(cpu)), int(float(best_config["Number of Concurrent Station Tasks"])), int(float(best_config["Intra-parallelism Threads"])), int(float(best_config["Inter-parallelism Threads"])), int(float(station_count))


def find_optimal_configuration_gpu(best_overall_usecase: bool, eval_sys_results_dir: str, num_cpus: int = None, num_gpus: list = None, station_count: int = None):
    """
    Find the optimal GPU configuration for a given number of CPUs, GPUs, and stations.
    Returns the best configuration including CPUs, concurrent predictions, intra/inter parallelism threads,
    GPUs, VRAM, and stations.
    """

    # Check if eval_sys_results_dir is valid
    if not eval_sys_results_dir or not os.path.isdir(eval_sys_results_dir):
        print(f"Error: The provided directory path '{eval_sys_results_dir}' is invalid or does not exist.")
        print("Please provide a valid directory path for the input parameter 'csv_dir'.")
        return None # Exit early if the directory is invalid

    if best_overall_usecase:
        file_path = f"{eval_sys_results_dir}/best_overall_usecase_gpu.csv"

        # Check if the CSV file exists before reading
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' does not exist. Ensure the file is in the correct directory.")
            return None

        # Load the CSV
        df_best_overall = pd.read_csv(file_path, header=None, index_col=0)

        # Convert into a dictionary for easy access
        best_config_dict = df_best_overall.to_dict()[1] # Extract key-value pairs

        # Extract required values
        num_cpus = best_config_dict.get("Number of CPUs Allocated for Ray to Use")
        num_concurrent_stations = best_config_dict.get("Number of Concurrent Station Tasks")
        intra_threads = best_config_dict.get("Intra-parallelism Threads")
        inter_threads = best_config_dict.get("Inter-parallelism Threads")
        num_stations = best_config_dict.get("Number of Stations Used")
        total_runtime = best_config_dict.get("Total Run time for Picker (s)")
        vram_used = best_config_dict.get("Inference Actor Memory Limit (MB)")
        num_gpus_st = best_config_dict.get("GPUs Used")
        num_gpus = ast.literal_eval(num_gpus_st)
        model_used = best_config_dict.get("Model Used")

        print("\nBest Overall Usecase Configuration Based on Trial Data:")
        print(f"Model Used: {model_used}\n"
              f"CPU: {num_cpus}\n"
              f"GPU ID(s): {num_gpus}\n"
              f"Concurrent Predictions: {num_concurrent_stations}\n"
              f"Intra-parallelism Threads: {intra_threads}\n"
              f"Inter-parallelism Threads: {inter_threads}\n"
              f"Stations: {num_stations}\n"
              f"Inference Actor Memory Limit (MB): {vram_used}\n"
              f"Total Runtime (s): {total_runtime}")

        return int(float(num_cpus)), int(float(num_concurrent_stations)), int(float(intra_threads)), int(float(inter_threads)), num_gpus, int(float(vram_used)), int(float(num_stations))

    else: # Optimal Configuration for User-Specified CPUs, GPUs, and Number of Stations to use
        # Ensure valid CPU, GPU, and station count values
        if num_cpus is None or station_count is None or num_gpus is None:
            print("Error: num_cpus, station_count, and num_gpus must have valid values.")
            return None

        file_path = f"{eval_sys_results_dir}/optimal_configurations_gpu.csv"

        # Check if the CSV file exists before reading
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' does not exist. Ensure the file is in the correct directory.")
            return None

        df_optimal = pd.read_csv(file_path)

        # Convert relevant columns to numeric, handling NaNs gracefully
        df_optimal["Number of Stations Used"] = pd.to_numeric(df_optimal["Number of Stations Used"], errors="coerce")
        df_optimal["Number of CPUs Allocated for Ray to Use"] = pd.to_numeric(df_optimal["Number of CPUs Allocated for Ray to Use"], errors="coerce")
        df_optimal["Number of Concurrent Station Tasks"] = pd.to_numeric(df_optimal["Number of Concurrent Station Tasks"], errors="coerce")
        df_optimal["Total Run time for Picker (s)"] = pd.to_numeric(df_optimal["Total Run time for Picker (s)"], errors="coerce")
        df_optimal["Inference Actor Memory Limit (MB)"] = pd.to_numeric(df_optimal["Inference Actor Memory Limit (MB)"], errors="coerce")

        # Convert "GPUs Used" from string representation to list
        df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

        # Convert GPU lists to tuples for comparison
        df_optimal["GPUs Used"] = df_optimal["GPUs Used"].apply(lambda x: tuple(x) if isinstance(x, list) else (x,))

        # Ensure num_gpus is in tuple format for comparison
        num_gpus_tuple = tuple(num_gpus) if isinstance(num_gpus, list) else (num_gpus,)

        filtered_df = df_optimal[
            (df_optimal["Number of CPUs Allocated for Ray to Use"] == num_cpus) &
            (df_optimal["GPUs Used"] == num_gpus_tuple) &
            (df_optimal["Number of Stations Used"] == station_count)
        ]

        if filtered_df.empty:
            print("No matching configuration found. Please enter a valid entry.")
            exit()

        # Find the best configuration (fastest runtime)
        best_config = filtered_df.nsmallest(1, "Total Run time for Picker (s)").iloc[0]

        print("\nBest Configuration for Requested Application Usecase Based on Trial Data:")
        print(f"Model Used: {best_config.get('Model Used')}\n"
              f"CPU: {num_cpus}\n"
              f"GPU: {num_gpus}\n"
              f"Concurrent Predictions: {best_config['Number of Concurrent Station Tasks']}\n"
              f"Intra-parallelism Threads: {best_config['Intra-parallelism Threads']}\n"
              f"Inter-parallelism Threads: {best_config['Inter-parallelism Threads']}\n"
              f"Stations: {station_count}\n"
              f"Inference Actor Memory Limit (MB): {best_config['Inference Actor Memory Limit (MB)']}\n"
              f"Total Runtime (s): {best_config['Total Run time for Picker (s)']}")

        return int(float(best_config["Number of CPUs Allocated for Ray to Use"])), \
               int(float(best_config["Number of Concurrent Station Tasks"])), \
               int(float(best_config["Intra-parallelism Threads"])), \
               int(float(best_config["Inter-parallelism Threads"])), \
               num_gpus, \
               int(float(best_config["Inference Actor Memory Limit (MB)"])), \
               int(float(station_count))
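
# Illustrative reuse of a saved evaluation (directory path is hypothetical):
#   cpus, conc, intra, inter, gpus, vram_mb, stations = find_optimal_configuration_gpu(
#       best_overall_usecase=True,
#       eval_sys_results_dir="/path/to/eval_sys_results",
#   )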