omniopt2 8423__py3-none-any.whl → 8454__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of omniopt2 might be problematic. Click here for more details.
- .omniopt.py +142 -77
- .tpe.py +4 -3
- omniopt +37 -15
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt.py +142 -77
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.tpe.py +4 -3
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/omniopt +37 -15
- {omniopt2-8423.dist-info → omniopt2-8454.dist-info}/METADATA +1 -1
- {omniopt2-8423.dist-info → omniopt2-8454.dist-info}/RECORD +38 -38
- omniopt2.egg-info/PKG-INFO +1 -1
- pyproject.toml +1 -1
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.colorfunctions.sh +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.general.sh +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.helpers.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_cpu_ram_usage.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_general.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_gpu_usage.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_kde.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_scatter.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_scatter_generation_method.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_scatter_hex.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_time_and_exit_code.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_trial_index_result.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.omniopt_plot_worker.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.random_generator.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/.shellscript_functions +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/LICENSE +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/apt-dependencies.txt +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/omniopt_docker +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/omniopt_evaluate +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/omniopt_plot +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/omniopt_share +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/pylint.rc +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/requirements.txt +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/setup.py +0 -0
- {omniopt2-8423.data → omniopt2-8454.data}/data/bin/test_requirements.txt +0 -0
- {omniopt2-8423.dist-info → omniopt2-8454.dist-info}/WHEEL +0 -0
- {omniopt2-8423.dist-info → omniopt2-8454.dist-info}/licenses/LICENSE +0 -0
- {omniopt2-8423.dist-info → omniopt2-8454.dist-info}/top_level.txt +0 -0
.omniopt.py
CHANGED
|
@@ -478,7 +478,7 @@ def get_current_run_folder(name: Optional[str] = None) -> str:
|
|
|
478
478
|
|
|
479
479
|
return CURRENT_RUN_FOLDER
|
|
480
480
|
|
|
481
|
-
def get_state_file_name(name) -> str:
|
|
481
|
+
def get_state_file_name(name: str) -> str:
|
|
482
482
|
state_files_folder = f"{get_current_run_folder()}/state_files/"
|
|
483
483
|
makedirs(state_files_folder)
|
|
484
484
|
|
|
@@ -577,20 +577,22 @@ def _debug(msg: str, _lvl: int = 0, eee: Union[None, str, Exception] = None) ->
|
|
|
577
577
|
def _get_debug_json(time_str: str, msg: str) -> str:
|
|
578
578
|
function_stack = []
|
|
579
579
|
try:
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
func_name = frame.f_code
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
580
|
+
cf = inspect.currentframe()
|
|
581
|
+
if cf:
|
|
582
|
+
frame = cf.f_back # skip _get_debug_json
|
|
583
|
+
while frame:
|
|
584
|
+
func_name = _function_name_cache.get(frame.f_code)
|
|
585
|
+
if func_name is None:
|
|
586
|
+
func_name = frame.f_code.co_name
|
|
587
|
+
_function_name_cache[frame.f_code] = func_name
|
|
588
|
+
|
|
589
|
+
if func_name not in ("<module>", "print_debug", "wrapper"):
|
|
590
|
+
function_stack.append({
|
|
591
|
+
"function": func_name,
|
|
592
|
+
"line_number": frame.f_lineno
|
|
593
|
+
})
|
|
594
|
+
|
|
595
|
+
frame = frame.f_back
|
|
594
596
|
except (SignalUSR, SignalINT, SignalCONT):
|
|
595
597
|
print_red("\n⚠ You pressed CTRL-C. This is ignored in _get_debug_json.")
|
|
596
598
|
|
|
@@ -692,11 +694,14 @@ def my_exit(_code: int = 0) -> None:
|
|
|
692
694
|
if is_skip_search() and os.getenv("SKIP_SEARCH_EXIT_CODE"):
|
|
693
695
|
skip_search_exit_code = os.getenv("SKIP_SEARCH_EXIT_CODE")
|
|
694
696
|
|
|
697
|
+
skip_search_exit_code_found = None
|
|
698
|
+
|
|
695
699
|
try:
|
|
696
|
-
|
|
700
|
+
skip_search_exit_code_found = int(skip_search_exit_code)
|
|
701
|
+
|
|
702
|
+
sys.exit(skip_search_exit_code_found)
|
|
697
703
|
except ValueError:
|
|
698
|
-
|
|
699
|
-
sys.exit(_code)
|
|
704
|
+
print_debug(f"Trying to look for SKIP_SEARCH_EXIT_CODE failed. Exiting with original exit code {_code}")
|
|
700
705
|
|
|
701
706
|
sys.exit(_code)
|
|
702
707
|
|
|
@@ -2087,12 +2092,6 @@ def init_live_share() -> bool:
|
|
|
2087
2092
|
|
|
2088
2093
|
return ret
|
|
2089
2094
|
|
|
2090
|
-
async def start_periodic_live_share() -> None:
|
|
2091
|
-
if args.live_share and not os.environ.get("CI"):
|
|
2092
|
-
while True:
|
|
2093
|
-
live_share(force=False)
|
|
2094
|
-
time.sleep(30)
|
|
2095
|
-
|
|
2096
2095
|
def init_storage(db_url: str) -> None:
|
|
2097
2096
|
init_engine_and_session_factory(url=db_url, force_init=True)
|
|
2098
2097
|
engine = get_engine()
|
|
@@ -2162,6 +2161,9 @@ def save_results_csv() -> Optional[str]:
|
|
|
2162
2161
|
|
|
2163
2162
|
try:
|
|
2164
2163
|
df = fetch_and_prepare_trials()
|
|
2164
|
+
if df is None:
|
|
2165
|
+
print_red(f"save_results_csv: fetch_and_prepare_trials returned an empty element: {df}")
|
|
2166
|
+
return None
|
|
2165
2167
|
write_csv(df, pd_csv)
|
|
2166
2168
|
write_json_snapshot(pd_json)
|
|
2167
2169
|
save_experiment_to_file()
|
|
@@ -2174,14 +2176,17 @@ def save_results_csv() -> Optional[str]:
|
|
|
2174
2176
|
except (SignalUSR, SignalCONT, SignalINT) as e:
|
|
2175
2177
|
raise type(e)(str(e)) from e
|
|
2176
2178
|
except Exception as e:
|
|
2177
|
-
print_red(f"
|
|
2179
|
+
print_red(f"\nWhile saving all trials as a pandas-dataframe-csv, an error occurred: {e}")
|
|
2178
2180
|
|
|
2179
2181
|
return pd_csv
|
|
2180
2182
|
|
|
2181
2183
|
def get_results_paths() -> tuple[str, str]:
|
|
2182
2184
|
return (get_current_run_folder(RESULTS_CSV_FILENAME), get_state_file_name('pd.json'))
|
|
2183
2185
|
|
|
2184
|
-
def fetch_and_prepare_trials() -> pd.DataFrame:
|
|
2186
|
+
def fetch_and_prepare_trials() -> Optional[pd.DataFrame]:
|
|
2187
|
+
if not ax_client:
|
|
2188
|
+
return None
|
|
2189
|
+
|
|
2185
2190
|
ax_client.experiment.fetch_data()
|
|
2186
2191
|
df = ax_client.get_trials_data_frame()
|
|
2187
2192
|
|
|
@@ -2202,15 +2207,21 @@ def write_csv(df: pd.DataFrame, path: str) -> None:
|
|
|
2202
2207
|
df.to_csv(path, index=False, float_format="%.30f")
|
|
2203
2208
|
|
|
2204
2209
|
def write_json_snapshot(path: str) -> None:
|
|
2205
|
-
|
|
2206
|
-
|
|
2207
|
-
|
|
2210
|
+
if ax_client is not None:
|
|
2211
|
+
json_snapshot = ax_client.to_json_snapshot()
|
|
2212
|
+
with open(path, "w", encoding="utf-8") as f:
|
|
2213
|
+
json.dump(json_snapshot, f, indent=4)
|
|
2214
|
+
else:
|
|
2215
|
+
print_red("write_json_snapshot: ax_client was None")
|
|
2208
2216
|
|
|
2209
2217
|
def save_experiment_to_file() -> None:
|
|
2210
|
-
|
|
2211
|
-
|
|
2212
|
-
|
|
2213
|
-
|
|
2218
|
+
if ax_client is not None:
|
|
2219
|
+
save_experiment(
|
|
2220
|
+
ax_client.experiment,
|
|
2221
|
+
get_state_file_name("ax_client.experiment.json")
|
|
2222
|
+
)
|
|
2223
|
+
else:
|
|
2224
|
+
print_red("save_experiment: ax_client is None")
|
|
2214
2225
|
|
|
2215
2226
|
def should_save_to_database() -> bool:
|
|
2216
2227
|
return args.model not in uncontinuable_models and args.save_to_database
|
|
@@ -5431,9 +5442,14 @@ def set_objectives() -> dict:
|
|
|
5431
5442
|
|
|
5432
5443
|
return objectives
|
|
5433
5444
|
|
|
5434
|
-
def set_experiment_constraints(experiment_constraints: Optional[list], experiment_args: dict, _experiment_parameters: Union[dict, list]) -> dict:
|
|
5435
|
-
if
|
|
5445
|
+
def set_experiment_constraints(experiment_constraints: Optional[list], experiment_args: dict, _experiment_parameters: Optional[Union[dict, list]]) -> dict:
|
|
5446
|
+
if _experiment_parameters is None:
|
|
5447
|
+
print_red("set_experiment_constraints: _experiment_parameters was None")
|
|
5448
|
+
my_exit(95)
|
|
5436
5449
|
|
|
5450
|
+
return {}
|
|
5451
|
+
|
|
5452
|
+
if experiment_constraints and len(experiment_constraints):
|
|
5437
5453
|
experiment_args["parameter_constraints"] = []
|
|
5438
5454
|
|
|
5439
5455
|
if experiment_constraints:
|
|
@@ -5463,6 +5479,10 @@ def set_experiment_constraints(experiment_constraints: Optional[list], experimen
|
|
|
5463
5479
|
return experiment_args
|
|
5464
5480
|
|
|
5465
5481
|
def replace_parameters_for_continued_jobs(parameter: Optional[list], cli_params_experiment_parameters: Optional[list]) -> None:
|
|
5482
|
+
if not experiment_parameters:
|
|
5483
|
+
print_red("replace_parameters_for_continued_jobs: experiment_parameters was False")
|
|
5484
|
+
return None
|
|
5485
|
+
|
|
5466
5486
|
if args.worker_generator_path:
|
|
5467
5487
|
return None
|
|
5468
5488
|
|
|
@@ -5548,13 +5568,13 @@ def copy_continue_uuid() -> None:
|
|
|
5548
5568
|
print_debug(f"copy_continue_uuid: Source file does not exist: {source_file}")
|
|
5549
5569
|
|
|
5550
5570
|
def load_ax_client_from_experiment_parameters() -> None:
|
|
5551
|
-
|
|
5552
|
-
|
|
5571
|
+
if experiment_parameters:
|
|
5572
|
+
global ax_client
|
|
5553
5573
|
|
|
5554
|
-
|
|
5555
|
-
|
|
5556
|
-
|
|
5557
|
-
|
|
5574
|
+
tmp_file_path = get_tmp_file_from_json(experiment_parameters)
|
|
5575
|
+
ax_client = AxClient.load_from_json_file(tmp_file_path)
|
|
5576
|
+
ax_client = cast(AxClient, ax_client)
|
|
5577
|
+
os.unlink(tmp_file_path)
|
|
5558
5578
|
|
|
5559
5579
|
def save_checkpoint_for_continued() -> None:
|
|
5560
5580
|
checkpoint_filepath = get_state_file_name('checkpoint.json')
|
|
@@ -5566,12 +5586,15 @@ def save_checkpoint_for_continued() -> None:
|
|
|
5566
5586
|
_fatal_error(f"{checkpoint_filepath} not found. Cannot continue_previous_job without.", 47)
|
|
5567
5587
|
|
|
5568
5588
|
def load_original_generation_strategy(original_ax_client_file: str) -> None:
|
|
5569
|
-
|
|
5570
|
-
|
|
5571
|
-
|
|
5589
|
+
if experiment_parameters:
|
|
5590
|
+
with open(original_ax_client_file, encoding="utf-8") as f:
|
|
5591
|
+
loaded_original_ax_client_json = json.load(f)
|
|
5592
|
+
original_generation_strategy = loaded_original_ax_client_json["generation_strategy"]
|
|
5572
5593
|
|
|
5573
|
-
|
|
5574
|
-
|
|
5594
|
+
if original_generation_strategy:
|
|
5595
|
+
experiment_parameters["generation_strategy"] = original_generation_strategy
|
|
5596
|
+
else:
|
|
5597
|
+
print_red("load_original_generation_strategy: experiment_parameters was empty!")
|
|
5575
5598
|
|
|
5576
5599
|
def wait_for_checkpoint_file(checkpoint_file: str) -> None:
|
|
5577
5600
|
start_time = time.time()
|
|
@@ -5611,6 +5634,11 @@ def validate_experiment_parameters() -> None:
|
|
|
5611
5634
|
my_exit(95)
|
|
5612
5635
|
|
|
5613
5636
|
def __get_experiment_parameters__load_from_checkpoint(continue_previous_job: str, cli_params_experiment_parameters: Optional[list]) -> Tuple[Any, str, str]:
|
|
5637
|
+
if not ax_client:
|
|
5638
|
+
print_red("__get_experiment_parameters__load_from_checkpoint: ax_client was None")
|
|
5639
|
+
my_exit(101)
|
|
5640
|
+
return {}, "", ""
|
|
5641
|
+
|
|
5614
5642
|
print_debug(f"Load from checkpoint: {continue_previous_job}")
|
|
5615
5643
|
|
|
5616
5644
|
checkpoint_file = f"{continue_previous_job}/state_files/checkpoint.json"
|
|
@@ -5652,6 +5680,12 @@ def __get_experiment_parameters__load_from_checkpoint(continue_previous_job: str
|
|
|
5652
5680
|
|
|
5653
5681
|
experiment_constraints = get_constraints()
|
|
5654
5682
|
if experiment_constraints:
|
|
5683
|
+
|
|
5684
|
+
if not experiment_parameters:
|
|
5685
|
+
print_red("__get_experiment_parameters__load_from_checkpoint: experiment_parameters was None")
|
|
5686
|
+
|
|
5687
|
+
return {}, "", ""
|
|
5688
|
+
|
|
5655
5689
|
experiment_args = set_experiment_constraints(
|
|
5656
5690
|
experiment_constraints,
|
|
5657
5691
|
experiment_args,
|
|
@@ -5661,6 +5695,12 @@ def __get_experiment_parameters__load_from_checkpoint(continue_previous_job: str
|
|
|
5661
5695
|
return experiment_args, gpu_string, gpu_color
|
|
5662
5696
|
|
|
5663
5697
|
def __get_experiment_parameters__create_new_experiment() -> Tuple[dict, str, str]:
|
|
5698
|
+
if ax_client is None:
|
|
5699
|
+
print_red("__get_experiment_parameters__create_new_experiment: ax_client is None")
|
|
5700
|
+
my_exit(101)
|
|
5701
|
+
|
|
5702
|
+
return {}, "", ""
|
|
5703
|
+
|
|
5664
5704
|
objectives = set_objectives()
|
|
5665
5705
|
|
|
5666
5706
|
experiment_args = {
|
|
@@ -5950,10 +5990,13 @@ def print_overview_tables(classic_params: Optional[Union[list, dict]], experimen
|
|
|
5950
5990
|
print_result_names_overview_table()
|
|
5951
5991
|
|
|
5952
5992
|
def update_progress_bar(nr: int) -> None:
|
|
5953
|
-
|
|
5954
|
-
|
|
5955
|
-
|
|
5956
|
-
|
|
5993
|
+
if progress_bar is not None:
|
|
5994
|
+
try:
|
|
5995
|
+
progress_bar.update(nr)
|
|
5996
|
+
except Exception as e:
|
|
5997
|
+
print(f"Error updating progress bar: {e}")
|
|
5998
|
+
else:
|
|
5999
|
+
print_red("update_progress_bar: progress_bar was None")
|
|
5957
6000
|
|
|
5958
6001
|
def get_current_model_name() -> str:
|
|
5959
6002
|
if overwritten_to_random:
|
|
@@ -6077,7 +6120,7 @@ def submitted_jobs(nr: int = 0) -> int:
|
|
|
6077
6120
|
def count_jobs_in_squeue() -> tuple[int, str]:
|
|
6078
6121
|
global _last_count_time, _last_count_result
|
|
6079
6122
|
|
|
6080
|
-
now = time.time()
|
|
6123
|
+
now = int(time.time())
|
|
6081
6124
|
if _last_count_result != (0, "") and now - _last_count_time < 15:
|
|
6082
6125
|
return _last_count_result
|
|
6083
6126
|
|
|
@@ -6299,7 +6342,7 @@ def load_existing_job_data_into_ax_client() -> None:
|
|
|
6299
6342
|
nr_of_imported_jobs = get_nr_of_imported_jobs()
|
|
6300
6343
|
set_nr_inserted_jobs(NR_INSERTED_JOBS + nr_of_imported_jobs)
|
|
6301
6344
|
|
|
6302
|
-
def parse_parameter_type_error(_error_message: Union[str, None]) -> Optional[dict]:
|
|
6345
|
+
def parse_parameter_type_error(_error_message: Union[Exception, str, None]) -> Optional[dict]:
|
|
6303
6346
|
if not _error_message:
|
|
6304
6347
|
return None
|
|
6305
6348
|
|
|
@@ -6491,7 +6534,7 @@ def normalize_path(file_path: str) -> str:
|
|
|
6491
6534
|
|
|
6492
6535
|
def insert_jobs_from_lists(csv_path: str, arm_params_list: Any, results_list: Any, __status: Any) -> None:
|
|
6493
6536
|
cnt = 0
|
|
6494
|
-
err_msgs = []
|
|
6537
|
+
err_msgs: list = []
|
|
6495
6538
|
|
|
6496
6539
|
for i, (arm_params, result) in enumerate(zip(arm_params_list, results_list)):
|
|
6497
6540
|
base_str = f"[bold green]Loading job {i}/{len(results_list)} from {csv_path} into ax_client, result: {result}"
|
|
@@ -6525,9 +6568,13 @@ def try_insert_job(csv_path: str, arm_params: Dict, result: Any, i: int, arm_par
|
|
|
6525
6568
|
f"This can happen when the csv file has different parameters or results as the main job one's "
|
|
6526
6569
|
f"or other imported jobs. Error: {e}"
|
|
6527
6570
|
)
|
|
6528
|
-
|
|
6529
|
-
|
|
6530
|
-
err_msgs
|
|
6571
|
+
|
|
6572
|
+
if err_msgs is None:
|
|
6573
|
+
print_red("try_insert_job: err_msgs was None")
|
|
6574
|
+
else:
|
|
6575
|
+
if err_msg not in err_msgs:
|
|
6576
|
+
print_red(err_msg)
|
|
6577
|
+
err_msgs.append(err_msg)
|
|
6531
6578
|
|
|
6532
6579
|
return cnt
|
|
6533
6580
|
|
|
@@ -6553,12 +6600,18 @@ def __insert_job_into_ax_client__check_ax_client() -> None:
|
|
|
6553
6600
|
_fatal_error("insert_job_into_ax_client: ax_client was not defined where it should have been", 101)
|
|
6554
6601
|
|
|
6555
6602
|
def __insert_job_into_ax_client__attach_trial(arm_params: dict) -> Tuple[Any, int]:
|
|
6603
|
+
if ax_client is None:
|
|
6604
|
+
raise RuntimeError("__insert_job_into_ax_client__attach_trial: ax_client was empty")
|
|
6605
|
+
|
|
6556
6606
|
new_trial = ax_client.attach_trial(arm_params)
|
|
6557
6607
|
if not isinstance(new_trial, tuple) or len(new_trial) < 2:
|
|
6558
6608
|
raise RuntimeError("attach_trial didn't return the expected tuple")
|
|
6559
6609
|
return new_trial
|
|
6560
6610
|
|
|
6561
6611
|
def __insert_job_into_ax_client__get_trial(trial_idx: int) -> Any:
|
|
6612
|
+
if ax_client is None:
|
|
6613
|
+
raise RuntimeError("__insert_job_into_ax_client__get_trial: ax_client was empty")
|
|
6614
|
+
|
|
6562
6615
|
trial = ax_client.experiment.trials.get(trial_idx)
|
|
6563
6616
|
if trial is None:
|
|
6564
6617
|
raise RuntimeError(f"Trial with index {trial_idx} not found")
|
|
@@ -6569,6 +6622,9 @@ def __insert_job_into_ax_client__create_generator_run(arm_params: dict, trial_id
|
|
|
6569
6622
|
return GeneratorRun(arms=[arm], generation_node_name=new_job_type)
|
|
6570
6623
|
|
|
6571
6624
|
def __insert_job_into_ax_client__complete_trial_if_result(trial_idx: int, result: dict, __status: Optional[Any], base_str: Optional[str]) -> None:
|
|
6625
|
+
if ax_client is None:
|
|
6626
|
+
raise RuntimeError("__insert_job_into_ax_client__complete_trial_if_result: ax_client was empty")
|
|
6627
|
+
|
|
6572
6628
|
if f"{result}" != "":
|
|
6573
6629
|
__insert_job_into_ax_client__update_status(__status, base_str, "Completing trial")
|
|
6574
6630
|
is_ok = True
|
|
@@ -7376,11 +7432,15 @@ def is_already_in_defective_nodes(hostname: str) -> bool:
|
|
|
7376
7432
|
return True
|
|
7377
7433
|
except Exception as e:
|
|
7378
7434
|
print_red(f"is_already_in_defective_nodes: Error reading the file {file_path}: {e}")
|
|
7379
|
-
return False
|
|
7380
7435
|
|
|
7381
7436
|
return False
|
|
7382
7437
|
|
|
7383
7438
|
def submit_new_job(parameters: Union[dict, str], trial_index: int) -> Any:
|
|
7439
|
+
if submitit_executor is None:
|
|
7440
|
+
print_red("submit_new_job: submitit_executor was None")
|
|
7441
|
+
|
|
7442
|
+
return None
|
|
7443
|
+
|
|
7384
7444
|
print_debug(f"Submitting new job for trial_index {trial_index}, parameters {parameters}")
|
|
7385
7445
|
|
|
7386
7446
|
start = time.time()
|
|
@@ -7396,18 +7456,21 @@ def submit_new_job(parameters: Union[dict, str], trial_index: int) -> Any:
|
|
|
7396
7456
|
def orchestrator_start_trial(parameters: Union[dict, str], trial_index: int) -> None:
|
|
7397
7457
|
if submitit_executor and ax_client:
|
|
7398
7458
|
new_job = submit_new_job(parameters, trial_index)
|
|
7399
|
-
|
|
7459
|
+
if new_job:
|
|
7460
|
+
submitted_jobs(1)
|
|
7400
7461
|
|
|
7401
|
-
|
|
7462
|
+
_trial = ax_client.get_trial(trial_index)
|
|
7402
7463
|
|
|
7403
|
-
|
|
7404
|
-
|
|
7405
|
-
|
|
7406
|
-
|
|
7407
|
-
|
|
7464
|
+
try:
|
|
7465
|
+
_trial.mark_staged(unsafe=True)
|
|
7466
|
+
except Exception as e:
|
|
7467
|
+
print_debug(f"orchestrator_start_trial: error {e}")
|
|
7468
|
+
_trial.mark_running(unsafe=True, no_runner_required=True)
|
|
7408
7469
|
|
|
7409
|
-
|
|
7410
|
-
|
|
7470
|
+
print_debug(f"orchestrator_start_trial: appending job {new_job} to global_vars['jobs'], trial_index: {trial_index}")
|
|
7471
|
+
global_vars["jobs"].append((new_job, trial_index))
|
|
7472
|
+
else:
|
|
7473
|
+
print_red("orchestrator_start_trial: Failed to start new job")
|
|
7411
7474
|
else:
|
|
7412
7475
|
_fatal_error("submitit_executor or ax_client could not be found properly", 9)
|
|
7413
7476
|
|
|
@@ -7539,15 +7602,18 @@ def execute_evaluation(_params: list) -> Optional[int]:
|
|
|
7539
7602
|
try:
|
|
7540
7603
|
initialize_job_environment()
|
|
7541
7604
|
new_job = submit_new_job(parameters, trial_index)
|
|
7542
|
-
|
|
7605
|
+
if new_job:
|
|
7606
|
+
submitted_jobs(1)
|
|
7543
7607
|
|
|
7544
|
-
|
|
7545
|
-
|
|
7608
|
+
print_debug(f"execute_evaluation: appending job {new_job} to global_vars['jobs'], trial_index: {trial_index}")
|
|
7609
|
+
global_vars["jobs"].append((new_job, trial_index))
|
|
7546
7610
|
|
|
7547
|
-
|
|
7548
|
-
|
|
7611
|
+
mark_trial_stage("mark_running", "Marking the trial as running failed")
|
|
7612
|
+
trial_counter += 1
|
|
7549
7613
|
|
|
7550
|
-
|
|
7614
|
+
progressbar_description("started new job")
|
|
7615
|
+
else:
|
|
7616
|
+
progressbar_description("Failed to start new job")
|
|
7551
7617
|
except submitit.core.utils.FailedJobError as error:
|
|
7552
7618
|
handle_failed_job(error, trial_index, new_job)
|
|
7553
7619
|
trial_counter += 1
|
|
@@ -7645,10 +7711,12 @@ def show_debug_table_for_break_run_search(_name: str, _max_eval: Optional[int])
|
|
|
7645
7711
|
("failed_jobs()", failed_jobs()),
|
|
7646
7712
|
("count_done_jobs()", count_done_jobs()),
|
|
7647
7713
|
("_max_eval", _max_eval),
|
|
7648
|
-
("progress_bar.total", progress_bar.total),
|
|
7649
7714
|
("NR_INSERTED_JOBS", NR_INSERTED_JOBS)
|
|
7650
7715
|
]
|
|
7651
7716
|
|
|
7717
|
+
if progress_bar is not None:
|
|
7718
|
+
rows.append(("progress_bar.total", progress_bar.total))
|
|
7719
|
+
|
|
7652
7720
|
for row in rows:
|
|
7653
7721
|
table.add_row(str(row[0]), str(row[1]))
|
|
7654
7722
|
|
|
@@ -8400,7 +8468,7 @@ def get_model_from_name(name: str) -> Any:
|
|
|
8400
8468
|
return gen
|
|
8401
8469
|
raise ValueError(f"Unknown or unsupported model: {name}")
|
|
8402
8470
|
|
|
8403
|
-
def get_name_from_model(model) -> Optional[str]:
|
|
8471
|
+
def get_name_from_model(model: Any) -> Optional[str]:
|
|
8404
8472
|
if not isinstance(SUPPORTED_MODELS, (list, set, tuple)):
|
|
8405
8473
|
return None
|
|
8406
8474
|
|
|
@@ -10403,8 +10471,6 @@ def main() -> None:
|
|
|
10403
10471
|
|
|
10404
10472
|
init_live_share()
|
|
10405
10473
|
|
|
10406
|
-
start_periodic_live_share()
|
|
10407
|
-
|
|
10408
10474
|
show_available_hardware_and_generation_strategy_string(gpu_string, gpu_color)
|
|
10409
10475
|
|
|
10410
10476
|
original_print(f"Run-Program: {global_vars['joined_run_program']}")
|
|
@@ -11169,7 +11235,6 @@ def auto_wrap_namespace(namespace: Any) -> Any:
|
|
|
11169
11235
|
"_record_stats",
|
|
11170
11236
|
"_open",
|
|
11171
11237
|
"_check_memory_leak",
|
|
11172
|
-
"start_periodic_live_share",
|
|
11173
11238
|
"start_logging_daemon",
|
|
11174
11239
|
"get_current_run_folder",
|
|
11175
11240
|
"show_func_name_wrapper"
|
.tpe.py
CHANGED
|
@@ -2,7 +2,8 @@ import sys
|
|
|
2
2
|
import os
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
|
-
from typing import Optional
|
|
5
|
+
from typing import Optional, Any
|
|
6
|
+
|
|
6
7
|
try:
|
|
7
8
|
import optuna
|
|
8
9
|
from optuna.trial import create_trial
|
|
@@ -52,7 +53,7 @@ def tpe_suggest_point(trial: optuna.Trial, parameters: dict) -> dict:
|
|
|
52
53
|
if pvaltype == 'INT':
|
|
53
54
|
point[param_name] = trial.suggest_int(param_name, rmin, rmax)
|
|
54
55
|
elif pvaltype == 'FLOAT':
|
|
55
|
-
point[param_name] = trial.suggest_float(param_name, rmin, rmax)
|
|
56
|
+
point[param_name] = trial.suggest_float(param_name, rmin, rmax) # type: ignore[assignment]
|
|
56
57
|
else:
|
|
57
58
|
raise ValueError(f"Unsupported type {pvaltype} for RANGE")
|
|
58
59
|
|
|
@@ -162,7 +163,7 @@ def add_existing_trial_to_study(study: optuna.study.study.Study, trial_entry: li
|
|
|
162
163
|
)
|
|
163
164
|
|
|
164
165
|
@beartype
|
|
165
|
-
def get_best_or_new_point(study:
|
|
166
|
+
def get_best_or_new_point(study: Any, parameters: dict, direction: str) -> dict:
|
|
166
167
|
best_trial_value = study.best_trial.value
|
|
167
168
|
if best_trial_value is not None:
|
|
168
169
|
if (direction == "minimize" and best_trial_value < 1e6) or \
|
omniopt
CHANGED
|
@@ -346,6 +346,31 @@
|
|
|
346
346
|
send_anonymized_usage_stats=0
|
|
347
347
|
already_shown_oo_base_url_msg=0
|
|
348
348
|
|
|
349
|
+
function run_live_share {
|
|
350
|
+
if [[ $RUN_UUID != "" ]]; then
|
|
351
|
+
full_log_file="$ORIGINAL_PWD/logs/$RUN_UUID"
|
|
352
|
+
if [[ -e "$full_log_file" ]]; then
|
|
353
|
+
set +e
|
|
354
|
+
run_folder=$(cat "$full_log_file" | grep "Run-folder:" | sed -e 's#Run-folder: ##')
|
|
355
|
+
if [[ -z $run_folder ]]; then
|
|
356
|
+
true
|
|
357
|
+
else
|
|
358
|
+
bash "$SCRIPT_DIR/omniopt_share" --username="$USER" "$run_folder" 2>/dev/null >/dev/null
|
|
359
|
+
fi
|
|
360
|
+
set -e
|
|
361
|
+
else
|
|
362
|
+
red_text "--live_share enabled, but $full_log_file could not be found. Cannot share once again in finalization.\n"
|
|
363
|
+
fi
|
|
364
|
+
fi
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
function start_periodidic_live_share {
|
|
368
|
+
while true; do
|
|
369
|
+
run_live_share 2>/dev/null >/dev/null
|
|
370
|
+
sleep 30
|
|
371
|
+
done
|
|
372
|
+
}
|
|
373
|
+
|
|
349
374
|
function myexit {
|
|
350
375
|
CODE=$1
|
|
351
376
|
|
|
@@ -382,21 +407,7 @@
|
|
|
382
407
|
|
|
383
408
|
if [[ $follow -eq 1 ]] || ! command -v sbatch 2>/dev/null >/dev/null || [[ $force_local_execution -eq 1 ]]; then
|
|
384
409
|
if [[ $live_share -eq 1 ]]; then
|
|
385
|
-
|
|
386
|
-
full_log_file="$ORIGINAL_PWD/logs/$RUN_UUID"
|
|
387
|
-
if [[ -e "$full_log_file" ]]; then
|
|
388
|
-
set +e
|
|
389
|
-
run_folder=$(cat "$full_log_file" | grep "Run-folder:" | sed -e 's#Run-folder: ##')
|
|
390
|
-
if [[ -z $run_folder ]]; then
|
|
391
|
-
true
|
|
392
|
-
else
|
|
393
|
-
bash "$SCRIPT_DIR/omniopt_share" --username="$USER" "$run_folder" 2>/dev/null >/dev/null
|
|
394
|
-
fi
|
|
395
|
-
set -e
|
|
396
|
-
else
|
|
397
|
-
red_text "--live_share enabled, but $full_log_file could not be found. Cannot share once again in finalization.\n"
|
|
398
|
-
fi
|
|
399
|
-
fi
|
|
410
|
+
run_live_share
|
|
400
411
|
fi
|
|
401
412
|
fi
|
|
402
413
|
|
|
@@ -1620,6 +1631,13 @@ EOF
|
|
|
1620
1631
|
set +e
|
|
1621
1632
|
trap - ERR
|
|
1622
1633
|
|
|
1634
|
+
live_share_pid=""
|
|
1635
|
+
|
|
1636
|
+
if [[ $live_share -eq 1 ]]; then
|
|
1637
|
+
start_periodidic_live_share &
|
|
1638
|
+
live_share_pid=$!
|
|
1639
|
+
fi
|
|
1640
|
+
|
|
1623
1641
|
if [[ -z $RUN_WITH_COVERAGE ]]; then
|
|
1624
1642
|
if [[ -z $RUN_WITH_PYSPY ]]; then
|
|
1625
1643
|
stdbuf -e 0 -o 0 python3 "$SCRIPT_DIR/.omniopt.py" $args_string
|
|
@@ -1636,6 +1654,10 @@ EOF
|
|
|
1636
1654
|
EXIT_CODE=$?
|
|
1637
1655
|
fi
|
|
1638
1656
|
|
|
1657
|
+
if [[ $live_share -eq 1 ]] && [[ -n $live_share_pid ]]; then
|
|
1658
|
+
kill -9 $live_share_pid
|
|
1659
|
+
fi
|
|
1660
|
+
|
|
1639
1661
|
set -e
|
|
1640
1662
|
trap 'calltracer' ERR
|
|
1641
1663
|
|