ert 16.0.9__py3-none-any.whl → 19.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _ert/events.py +19 -2
- _ert/forward_model_runner/client.py +6 -2
- _ert/forward_model_runner/fm_dispatch.py +9 -6
- _ert/forward_model_runner/reporting/event.py +1 -0
- _ert/forward_model_runner/runner.py +1 -2
- _ert/utils.py +12 -0
- ert/__main__.py +58 -38
- ert/analysis/_enif_update.py +8 -4
- ert/analysis/_es_update.py +19 -6
- ert/analysis/_update_commons.py +16 -6
- ert/base_model_context.py +1 -1
- ert/cli/main.py +17 -12
- ert/cli/monitor.py +7 -0
- ert/config/__init__.py +17 -6
- ert/config/_create_observation_dataframes.py +118 -21
- ert/config/_get_num_cpu.py +1 -1
- ert/config/_observations.py +91 -2
- ert/config/_read_summary.py +74 -328
- ert/config/design_matrix.py +62 -23
- ert/config/distribution.py +1 -1
- ert/config/ensemble_config.py +9 -17
- ert/config/ert_config.py +155 -58
- ert/config/everest_control.py +234 -0
- ert/config/{everest_constraints_config.py → everest_response.py} +27 -15
- ert/config/field.py +99 -90
- ert/config/forward_model_step.py +122 -17
- ert/config/gen_data_config.py +5 -10
- ert/config/gen_kw_config.py +11 -41
- ert/config/known_response_types.py +14 -0
- ert/config/parameter_config.py +1 -33
- ert/config/parsing/_option_dict.py +10 -2
- ert/config/parsing/config_errors.py +1 -1
- ert/config/parsing/config_keywords.py +2 -1
- ert/config/parsing/config_schema.py +23 -11
- ert/config/parsing/config_schema_deprecations.py +3 -3
- ert/config/parsing/config_schema_item.py +26 -11
- ert/config/parsing/context_values.py +3 -3
- ert/config/parsing/file_context_token.py +1 -1
- ert/config/parsing/observations_parser.py +6 -2
- ert/config/parsing/queue_system.py +9 -0
- ert/config/parsing/schema_item_type.py +1 -0
- ert/config/queue_config.py +42 -50
- ert/config/response_config.py +0 -8
- ert/config/rft_config.py +275 -0
- ert/config/summary_config.py +3 -8
- ert/config/surface_config.py +73 -26
- ert/config/workflow_fixtures.py +2 -1
- ert/config/workflow_job.py +135 -54
- ert/dark_storage/client/__init__.py +2 -2
- ert/dark_storage/client/_session.py +4 -4
- ert/dark_storage/client/client.py +2 -2
- ert/dark_storage/common.py +12 -3
- ert/dark_storage/compute/misfits.py +11 -7
- ert/dark_storage/endpoints/compute/misfits.py +6 -4
- ert/dark_storage/endpoints/ensembles.py +4 -0
- ert/dark_storage/endpoints/experiment_server.py +30 -24
- ert/dark_storage/endpoints/experiments.py +2 -2
- ert/dark_storage/endpoints/observations.py +8 -6
- ert/dark_storage/endpoints/parameters.py +4 -12
- ert/dark_storage/endpoints/responses.py +24 -5
- ert/dark_storage/json_schema/ensemble.py +3 -0
- ert/dark_storage/json_schema/experiment.py +1 -1
- ert/data/_measured_data.py +6 -5
- ert/ensemble_evaluator/__init__.py +8 -1
- ert/ensemble_evaluator/config.py +2 -1
- ert/ensemble_evaluator/evaluator.py +81 -29
- ert/ensemble_evaluator/event.py +6 -0
- ert/ensemble_evaluator/snapshot.py +3 -1
- ert/ensemble_evaluator/state.py +1 -0
- ert/field_utils/__init__.py +8 -0
- ert/field_utils/field_utils.py +228 -15
- ert/field_utils/grdecl_io.py +1 -1
- ert/field_utils/roff_io.py +1 -1
- ert/gui/__init__.py +5 -2
- ert/gui/ertnotifier.py +1 -1
- ert/gui/ertwidgets/__init__.py +23 -16
- ert/gui/ertwidgets/analysismoduleedit.py +2 -2
- ert/gui/ertwidgets/checklist.py +1 -1
- ert/gui/ertwidgets/closabledialog.py +2 -0
- ert/gui/ertwidgets/copyablelabel.py +2 -0
- ert/gui/ertwidgets/create_experiment_dialog.py +3 -1
- ert/gui/ertwidgets/ensembleselector.py +2 -2
- ert/gui/ertwidgets/listeditbox.py +2 -0
- ert/gui/ertwidgets/models/__init__.py +2 -0
- ert/gui/ertwidgets/models/activerealizationsmodel.py +5 -1
- ert/gui/ertwidgets/models/path_model.py +1 -1
- ert/gui/ertwidgets/models/targetensemblemodel.py +5 -1
- ert/gui/ertwidgets/models/text_model.py +4 -1
- ert/gui/ertwidgets/pathchooser.py +0 -3
- ert/gui/ertwidgets/searchbox.py +17 -4
- ert/gui/ertwidgets/stringbox.py +2 -0
- ert/gui/{suggestor → ertwidgets/suggestor}/_suggestor_message.py +13 -4
- ert/gui/{suggestor → ertwidgets/suggestor}/suggestor.py +63 -30
- ert/gui/main.py +41 -13
- ert/gui/main_window.py +3 -7
- ert/gui/model/fm_step_list.py +3 -0
- ert/gui/model/real_list.py +1 -0
- ert/gui/model/snapshot.py +1 -0
- ert/gui/simulation/combobox_with_description.py +3 -0
- ert/gui/simulation/ensemble_experiment_panel.py +8 -2
- ert/gui/simulation/ensemble_information_filter_panel.py +7 -2
- ert/gui/simulation/ensemble_smoother_panel.py +8 -2
- ert/gui/simulation/evaluate_ensemble_panel.py +17 -7
- ert/gui/simulation/experiment_panel.py +18 -6
- ert/gui/simulation/manual_update_panel.py +35 -10
- ert/gui/simulation/multiple_data_assimilation_panel.py +13 -9
- ert/gui/simulation/run_dialog.py +47 -20
- ert/gui/simulation/single_test_run_panel.py +6 -3
- ert/gui/simulation/view/progress_widget.py +2 -0
- ert/gui/simulation/view/realization.py +5 -1
- ert/gui/simulation/view/update.py +2 -0
- ert/gui/summarypanel.py +20 -1
- ert/gui/tools/event_viewer/panel.py +3 -4
- ert/gui/tools/event_viewer/tool.py +2 -0
- ert/gui/tools/load_results/load_results_panel.py +1 -1
- ert/gui/tools/load_results/load_results_tool.py +2 -0
- ert/gui/tools/manage_experiments/export_dialog.py +136 -0
- ert/gui/tools/manage_experiments/manage_experiments_panel.py +2 -0
- ert/gui/tools/manage_experiments/storage_info_widget.py +121 -16
- ert/gui/tools/manage_experiments/storage_widget.py +4 -3
- ert/gui/tools/plot/customize/color_chooser.py +5 -2
- ert/gui/tools/plot/customize/customize_plot_dialog.py +2 -0
- ert/gui/tools/plot/customize/default_customization_view.py +4 -0
- ert/gui/tools/plot/customize/limits_customization_view.py +3 -0
- ert/gui/tools/plot/customize/statistics_customization_view.py +3 -0
- ert/gui/tools/plot/customize/style_chooser.py +2 -0
- ert/gui/tools/plot/customize/style_customization_view.py +3 -0
- ert/gui/tools/plot/data_type_keys_widget.py +2 -0
- ert/gui/tools/plot/data_type_proxy_model.py +3 -0
- ert/gui/tools/plot/plot_api.py +50 -28
- ert/gui/tools/plot/plot_ensemble_selection_widget.py +17 -10
- ert/gui/tools/plot/plot_widget.py +15 -2
- ert/gui/tools/plot/plot_window.py +41 -19
- ert/gui/tools/plot/plottery/plot_config.py +2 -0
- ert/gui/tools/plot/plottery/plot_context.py +14 -0
- ert/gui/tools/plot/plottery/plots/__init__.py +2 -0
- ert/gui/tools/plot/plottery/plots/cesp.py +3 -1
- ert/gui/tools/plot/plottery/plots/distribution.py +6 -1
- ert/gui/tools/plot/plottery/plots/ensemble.py +13 -5
- ert/gui/tools/plot/plottery/plots/gaussian_kde.py +12 -2
- ert/gui/tools/plot/plottery/plots/histogram.py +3 -1
- ert/gui/tools/plot/plottery/plots/misfits.py +436 -0
- ert/gui/tools/plot/plottery/plots/observations.py +18 -4
- ert/gui/tools/plot/plottery/plots/statistics.py +62 -20
- ert/gui/tools/plot/plottery/plots/std_dev.py +3 -1
- ert/gui/tools/plot/widgets/clearable_line_edit.py +9 -0
- ert/gui/tools/plot/widgets/filter_popup.py +2 -0
- ert/gui/tools/plot/widgets/filterable_kw_list_model.py +3 -0
- ert/gui/tools/plugins/plugin.py +1 -1
- ert/gui/tools/plugins/plugins_tool.py +2 -0
- ert/gui/tools/plugins/process_job_dialog.py +3 -0
- ert/gui/tools/workflows/workflow_dialog.py +2 -0
- ert/gui/tools/workflows/workflows_tool.py +2 -0
- ert/libres_facade.py +5 -7
- ert/logging/__init__.py +4 -1
- ert/mode_definitions.py +2 -0
- ert/plugins/__init__.py +4 -6
- ert/plugins/hook_implementations/workflows/csv_export.py +2 -3
- ert/plugins/hook_implementations/workflows/gen_data_rft_export.py +10 -2
- ert/plugins/hook_specifications/__init__.py +0 -10
- ert/plugins/hook_specifications/jobs.py +0 -9
- ert/plugins/plugin_manager.py +53 -124
- ert/resources/forward_models/run_reservoirsimulator.py +8 -4
- ert/resources/forward_models/template_render.py +10 -10
- ert/resources/shell_scripts/delete_directory.py +2 -2
- ert/run_models/__init__.py +24 -6
- ert/run_models/_create_run_path.py +133 -38
- ert/run_models/ensemble_experiment.py +10 -4
- ert/run_models/ensemble_information_filter.py +8 -1
- ert/run_models/ensemble_smoother.py +9 -3
- ert/run_models/evaluate_ensemble.py +8 -6
- ert/run_models/event.py +7 -3
- ert/run_models/everest_run_model.py +337 -113
- ert/run_models/initial_ensemble_run_model.py +25 -24
- ert/run_models/manual_update.py +6 -3
- ert/run_models/manual_update_enif.py +37 -0
- ert/run_models/model_factory.py +78 -18
- ert/run_models/multiple_data_assimilation.py +22 -11
- ert/run_models/run_model.py +72 -73
- ert/run_models/single_test_run.py +7 -4
- ert/run_models/update_run_model.py +4 -2
- ert/runpaths.py +5 -6
- ert/sample_prior.py +9 -4
- ert/scheduler/__init__.py +10 -5
- ert/scheduler/driver.py +40 -0
- ert/scheduler/event.py +3 -1
- ert/scheduler/job.py +23 -13
- ert/scheduler/lsf_driver.py +15 -5
- ert/scheduler/openpbs_driver.py +10 -4
- ert/scheduler/scheduler.py +5 -0
- ert/scheduler/slurm_driver.py +20 -5
- ert/services/__init__.py +2 -2
- ert/services/_base_service.py +37 -20
- ert/services/_storage_main.py +20 -18
- ert/services/ert_server.py +317 -0
- ert/shared/_doc_utils/__init__.py +4 -2
- ert/shared/_doc_utils/ert_jobs.py +1 -4
- ert/shared/net_utils.py +43 -18
- ert/shared/storage/connection.py +3 -3
- ert/shared/version.py +3 -3
- ert/storage/__init__.py +14 -1
- ert/storage/local_ensemble.py +44 -13
- ert/storage/local_experiment.py +54 -34
- ert/storage/local_storage.py +90 -58
- ert/storage/migration/to10.py +3 -2
- ert/storage/migration/to11.py +9 -10
- ert/storage/migration/to12.py +19 -20
- ert/storage/migration/to13.py +28 -27
- ert/storage/migration/to14.py +3 -3
- ert/storage/migration/to15.py +25 -0
- ert/storage/migration/to16.py +38 -0
- ert/storage/migration/to17.py +42 -0
- ert/storage/migration/to18.py +11 -0
- ert/storage/migration/to19.py +34 -0
- ert/storage/migration/to20.py +23 -0
- ert/storage/migration/to21.py +25 -0
- ert/storage/migration/to6.py +3 -2
- ert/storage/migration/to7.py +12 -13
- ert/storage/migration/to8.py +9 -11
- ert/storage/migration/to9.py +5 -4
- ert/storage/realization_storage_state.py +7 -7
- ert/substitutions.py +12 -28
- ert/validation/active_range.py +7 -7
- ert/validation/ensemble_realizations_argument.py +4 -2
- ert/validation/rangestring.py +16 -16
- ert/workflow_runner.py +6 -3
- {ert-16.0.9.dist-info → ert-19.0.0rc2.dist-info}/METADATA +21 -15
- ert-19.0.0rc2.dist-info/RECORD +524 -0
- {ert-16.0.9.dist-info → ert-19.0.0rc2.dist-info}/WHEEL +1 -1
- everest/api/everest_data_api.py +14 -1
- everest/assets/everest_logo.svg +406 -0
- everest/bin/config_branch_script.py +30 -14
- everest/bin/everconfigdump_script.py +2 -10
- everest/bin/everest_script.py +53 -33
- everest/bin/everlint_script.py +3 -5
- everest/bin/kill_script.py +7 -5
- everest/bin/main.py +11 -24
- everest/bin/monitor_script.py +64 -35
- everest/bin/utils.py +58 -43
- everest/bin/visualization_script.py +23 -13
- everest/config/__init__.py +4 -1
- everest/config/control_config.py +81 -6
- everest/config/control_variable_config.py +4 -3
- everest/config/everest_config.py +102 -79
- everest/config/forward_model_config.py +5 -3
- everest/config/install_data_config.py +7 -5
- everest/config/install_job_config.py +45 -3
- everest/config/install_template_config.py +3 -3
- everest/config/optimization_config.py +19 -6
- everest/config/output_constraint_config.py +8 -2
- everest/config/server_config.py +6 -55
- everest/config/simulator_config.py +62 -17
- everest/config/utils.py +25 -105
- everest/config/validation_utils.py +34 -15
- everest/config_file_loader.py +30 -21
- everest/detached/__init__.py +0 -6
- everest/detached/client.py +7 -52
- everest/detached/everserver.py +19 -45
- everest/everest_storage.py +24 -40
- everest/gui/everest_client.py +2 -3
- everest/gui/main_window.py +2 -2
- everest/optimizer/everest2ropt.py +68 -42
- everest/optimizer/opt_model_transforms.py +15 -20
- everest/optimizer/utils.py +0 -29
- everest/plugins/hook_specs.py +0 -24
- everest/strings.py +1 -6
- everest/util/__init__.py +3 -1
- ert/config/everest_objective_config.py +0 -95
- ert/config/ext_param_config.py +0 -107
- ert/gui/tools/export/__init__.py +0 -3
- ert/gui/tools/export/export_panel.py +0 -83
- ert/gui/tools/export/export_tool.py +0 -67
- ert/gui/tools/export/exporter.py +0 -36
- ert/plugins/hook_specifications/ecl_config.py +0 -29
- ert/services/storage_service.py +0 -127
- ert/summary_key_type.py +0 -234
- ert-16.0.9.dist-info/RECORD +0 -521
- everest/bin/everexport_script.py +0 -53
- everest/config/sampler_config.py +0 -103
- everest/simulator/__init__.py +0 -88
- everest/simulator/everest_to_ert.py +0 -252
- /ert/gui/{suggestor → ertwidgets/suggestor}/__init__.py +0 -0
- /ert/gui/{suggestor → ertwidgets/suggestor}/_colors.py +0 -0
- {ert-16.0.9.dist-info → ert-19.0.0rc2.dist-info}/entry_points.txt +0 -0
- {ert-16.0.9.dist-info → ert-19.0.0rc2.dist-info}/licenses/COPYING +0 -0
- {ert-16.0.9.dist-info → ert-19.0.0rc2.dist-info}/top_level.txt +0 -0
ert/scheduler/lsf_driver.py
CHANGED
|
@@ -19,10 +19,15 @@ from typing import (
|
|
|
19
19
|
get_args,
|
|
20
20
|
)
|
|
21
21
|
|
|
22
|
-
from .driver import
|
|
22
|
+
from .driver import (
|
|
23
|
+
_POLL_PERIOD,
|
|
24
|
+
SIGNAL_OFFSET,
|
|
25
|
+
Driver,
|
|
26
|
+
FailedSubmit,
|
|
27
|
+
create_submit_script,
|
|
28
|
+
)
|
|
23
29
|
from .event import DriverEvent, FinishedEvent, StartedEvent
|
|
24
30
|
|
|
25
|
-
_POLL_PERIOD = 2.0 # seconds
|
|
26
31
|
LSF_FAILED_JOB = SIGNAL_OFFSET + 65 # first non signal returncode
|
|
27
32
|
"""Return code we use when lsf reports failed jobs"""
|
|
28
33
|
|
|
@@ -257,6 +262,7 @@ class LsfDriver(Driver):
|
|
|
257
262
|
bkill_cmd: str | None = None,
|
|
258
263
|
bhist_cmd: str | None = None,
|
|
259
264
|
activate_script: str = "",
|
|
265
|
+
poll_period: float = _POLL_PERIOD,
|
|
260
266
|
) -> None:
|
|
261
267
|
super().__init__(activate_script)
|
|
262
268
|
self._queue_name = queue_name
|
|
@@ -277,7 +283,7 @@ class LsfDriver(Driver):
|
|
|
277
283
|
self._sleep_time_between_cmd_retries = 3
|
|
278
284
|
self._max_bsub_attempts = 10
|
|
279
285
|
|
|
280
|
-
self._poll_period =
|
|
286
|
+
self._poll_period = poll_period
|
|
281
287
|
|
|
282
288
|
self._bhist_cmd = Path(bhist_cmd or shutil.which("bhist") or "bhist")
|
|
283
289
|
self._bhist_cache: dict[str, dict[str, int]] | None = None
|
|
@@ -438,6 +444,7 @@ class LsfDriver(Driver):
|
|
|
438
444
|
|
|
439
445
|
async def poll(self) -> None:
|
|
440
446
|
while True:
|
|
447
|
+
await self._warn_evaluator_if_polling_has_failed_for_some_time()
|
|
441
448
|
if not self._jobs.keys():
|
|
442
449
|
await asyncio.sleep(self._poll_period)
|
|
443
450
|
continue
|
|
@@ -455,6 +462,7 @@ class LsfDriver(Driver):
|
|
|
455
462
|
)
|
|
456
463
|
except OSError as e:
|
|
457
464
|
logger.error(str(e))
|
|
465
|
+
self._last_polling_error_message = str(e)
|
|
458
466
|
await asyncio.sleep(self._poll_period)
|
|
459
467
|
continue
|
|
460
468
|
|
|
@@ -462,10 +470,11 @@ class LsfDriver(Driver):
|
|
|
462
470
|
if process.returncode:
|
|
463
471
|
# bjobs may give nonzero return code even when it is providing
|
|
464
472
|
# at least some correct information
|
|
473
|
+
error_msg = stderr.decode()
|
|
465
474
|
logger.warning(
|
|
466
|
-
f"bjobs gave returncode {process.returncode} "
|
|
467
|
-
f"and error {stderr.decode()}"
|
|
475
|
+
f"bjobs gave returncode {process.returncode} and error {error_msg}"
|
|
468
476
|
)
|
|
477
|
+
self._last_polling_error_message = error_msg
|
|
469
478
|
bjobs_states = _parse_jobs_dict(parse_bjobs(stdout.decode(errors="ignore")))
|
|
470
479
|
self.update_and_log_exec_hosts(
|
|
471
480
|
parse_bjobs_exec_hosts(stdout.decode(errors="ignore"))
|
|
@@ -497,6 +506,7 @@ class LsfDriver(Driver):
|
|
|
497
506
|
"bhist did not give status for job_ids "
|
|
498
507
|
f"{missing_in_bhist_and_bjobs}, giving up for now."
|
|
499
508
|
)
|
|
509
|
+
self._last_successful_poll = time.time()
|
|
500
510
|
await asyncio.sleep(self._poll_period)
|
|
501
511
|
|
|
502
512
|
async def _process_job_update(self, job_id: str, new_state: AnyJob) -> None:
|
ert/scheduler/openpbs_driver.py
CHANGED
|
@@ -5,17 +5,17 @@ import json
|
|
|
5
5
|
import logging
|
|
6
6
|
import shlex
|
|
7
7
|
import shutil
|
|
8
|
+
import time
|
|
8
9
|
from collections.abc import Iterable, Mapping, MutableMapping
|
|
9
10
|
from dataclasses import dataclass
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
from typing import Any, Literal, cast, get_type_hints
|
|
12
13
|
|
|
13
|
-
from .driver import Driver, FailedSubmit, create_submit_script
|
|
14
|
+
from .driver import _POLL_PERIOD, Driver, FailedSubmit, create_submit_script
|
|
14
15
|
from .event import DriverEvent, FinishedEvent, StartedEvent
|
|
15
16
|
|
|
16
17
|
logger = logging.getLogger(__name__)
|
|
17
18
|
|
|
18
|
-
_POLL_PERIOD = 2.0 # seconds
|
|
19
19
|
JOB_STATES = [
|
|
20
20
|
"B", # Begun
|
|
21
21
|
"E", # Exiting with or without errors
|
|
@@ -130,6 +130,7 @@ class OpenPBSDriver(Driver):
|
|
|
130
130
|
qstat_cmd: str | None = None,
|
|
131
131
|
qdel_cmd: str | None = None,
|
|
132
132
|
activate_script: str = "",
|
|
133
|
+
poll_period: float = _POLL_PERIOD,
|
|
133
134
|
) -> None:
|
|
134
135
|
super().__init__(activate_script)
|
|
135
136
|
|
|
@@ -140,7 +141,7 @@ class OpenPBSDriver(Driver):
|
|
|
140
141
|
self._job_prefix = job_prefix
|
|
141
142
|
self._max_pbs_cmd_attempts = 10
|
|
142
143
|
self._sleep_time_between_cmd_retries = 2
|
|
143
|
-
self._poll_period =
|
|
144
|
+
self._poll_period = poll_period
|
|
144
145
|
|
|
145
146
|
self._qsub_cmd = Path(qsub_cmd or shutil.which("qsub") or "qsub")
|
|
146
147
|
self._qstat_cmd = Path(qstat_cmd or shutil.which("qstat") or "qstat")
|
|
@@ -264,6 +265,7 @@ class OpenPBSDriver(Driver):
|
|
|
264
265
|
|
|
265
266
|
async def poll(self) -> None:
|
|
266
267
|
while True:
|
|
268
|
+
await self._warn_evaluator_if_polling_has_failed_for_some_time()
|
|
267
269
|
if not self._jobs:
|
|
268
270
|
await asyncio.sleep(self._poll_period)
|
|
269
271
|
continue
|
|
@@ -280,6 +282,7 @@ class OpenPBSDriver(Driver):
|
|
|
280
282
|
)
|
|
281
283
|
except OSError as e:
|
|
282
284
|
logger.error(str(e))
|
|
285
|
+
self._last_polling_error_message = str(e)
|
|
283
286
|
await asyncio.sleep(self._poll_period)
|
|
284
287
|
continue
|
|
285
288
|
stdout, stderr = await process.communicate()
|
|
@@ -289,10 +292,12 @@ class OpenPBSDriver(Driver):
|
|
|
289
292
|
await asyncio.sleep(self._poll_period)
|
|
290
293
|
continue
|
|
291
294
|
if process.returncode == QSTAT_UNKNOWN_JOB_ID:
|
|
295
|
+
error_msg = stderr.decode(errors="ignore")
|
|
292
296
|
logger.debug(
|
|
293
297
|
f"qstat gave returncode {QSTAT_UNKNOWN_JOB_ID} "
|
|
294
|
-
f"with message {
|
|
298
|
+
f"with message {error_msg}"
|
|
295
299
|
)
|
|
300
|
+
self._last_polling_error_message = error_msg
|
|
296
301
|
parsed_jobs = _parse_jobs_dict(
|
|
297
302
|
parse_qstat(stdout.decode(errors="ignore"))
|
|
298
303
|
)
|
|
@@ -330,6 +335,7 @@ class OpenPBSDriver(Driver):
|
|
|
330
335
|
for job_id, job in parsed_jobs_dict.items():
|
|
331
336
|
await self._process_job_update(job_id, job)
|
|
332
337
|
|
|
338
|
+
self._last_successful_poll = time.time()
|
|
333
339
|
await asyncio.sleep(self._poll_period)
|
|
334
340
|
|
|
335
341
|
async def _process_job_update(self, job_id: str, new_state: AnyJob) -> None:
|
ert/scheduler/scheduler.py
CHANGED
|
@@ -15,6 +15,7 @@ import orjson
|
|
|
15
15
|
from pydantic.dataclasses import dataclass
|
|
16
16
|
|
|
17
17
|
from _ert.events import (
|
|
18
|
+
EnsembleEvaluationWarning,
|
|
18
19
|
ForwardModelStepChecksum,
|
|
19
20
|
RealizationEvent,
|
|
20
21
|
RealizationFailed,
|
|
@@ -343,6 +344,10 @@ class Scheduler:
|
|
|
343
344
|
async def _process_event_queue(self) -> None:
|
|
344
345
|
while True:
|
|
345
346
|
event = await self.driver.event_queue.get()
|
|
347
|
+
if isinstance(event, EnsembleEvaluationWarning):
|
|
348
|
+
if self._ensemble_evaluator_queue:
|
|
349
|
+
await self._ensemble_evaluator_queue.put(event)
|
|
350
|
+
continue
|
|
346
351
|
job = self._jobs[event.iens]
|
|
347
352
|
|
|
348
353
|
# Any event implies the job has at least started
|
ert/scheduler/slurm_driver.py
CHANGED
|
@@ -14,7 +14,13 @@ from enum import Enum, auto
|
|
|
14
14
|
from pathlib import Path
|
|
15
15
|
from tempfile import NamedTemporaryFile
|
|
16
16
|
|
|
17
|
-
from .driver import
|
|
17
|
+
from .driver import (
|
|
18
|
+
_POLL_PERIOD,
|
|
19
|
+
SIGNAL_OFFSET,
|
|
20
|
+
Driver,
|
|
21
|
+
FailedSubmit,
|
|
22
|
+
create_submit_script,
|
|
23
|
+
)
|
|
18
24
|
from .event import DriverEvent, FinishedEvent, StartedEvent
|
|
19
25
|
|
|
20
26
|
SLURM_FAILED_EXIT_CODE_FETCH = SIGNAL_OFFSET + 66
|
|
@@ -77,9 +83,10 @@ class SlurmDriver(Driver):
|
|
|
77
83
|
user: str | None = None,
|
|
78
84
|
queue_name: str | None = None,
|
|
79
85
|
max_runtime: float | None = None,
|
|
80
|
-
squeue_timeout: float =
|
|
86
|
+
squeue_timeout: float | None = None,
|
|
81
87
|
project_code: str | None = None,
|
|
82
88
|
activate_script: str = "",
|
|
89
|
+
poll_period: float = _POLL_PERIOD,
|
|
83
90
|
) -> None:
|
|
84
91
|
super().__init__(activate_script)
|
|
85
92
|
self._submit_locks: dict[int, asyncio.Lock] = {}
|
|
@@ -109,7 +116,11 @@ class SlurmDriver(Driver):
|
|
|
109
116
|
|
|
110
117
|
self._sleep_time_between_cmd_retries = 3
|
|
111
118
|
self._sleep_time_between_kills = 30
|
|
112
|
-
|
|
119
|
+
|
|
120
|
+
self._poll_period = (
|
|
121
|
+
squeue_timeout if squeue_timeout is not None else poll_period
|
|
122
|
+
)
|
|
123
|
+
|
|
113
124
|
self._project_code = project_code
|
|
114
125
|
|
|
115
126
|
def _submit_cmd(
|
|
@@ -251,6 +262,7 @@ class SlurmDriver(Driver):
|
|
|
251
262
|
|
|
252
263
|
async def poll(self) -> None:
|
|
253
264
|
while True:
|
|
265
|
+
await self._warn_evaluator_if_polling_has_failed_for_some_time()
|
|
254
266
|
if not self._jobs.keys():
|
|
255
267
|
await asyncio.sleep(self._poll_period)
|
|
256
268
|
continue
|
|
@@ -266,14 +278,16 @@ class SlurmDriver(Driver):
|
|
|
266
278
|
)
|
|
267
279
|
except OSError as e:
|
|
268
280
|
logger.error(str(e))
|
|
281
|
+
self._last_polling_error_message = str(e)
|
|
269
282
|
await asyncio.sleep(self._poll_period)
|
|
270
283
|
continue
|
|
271
284
|
stdout, stderr = await process.communicate()
|
|
272
285
|
if process.returncode:
|
|
286
|
+
error_msg = stderr.decode()
|
|
273
287
|
logger.warning(
|
|
274
|
-
f"squeue gave returncode {process.returncode} "
|
|
275
|
-
f"and error {stderr.decode()}"
|
|
288
|
+
f"squeue gave returncode {process.returncode} and error {error_msg}"
|
|
276
289
|
)
|
|
290
|
+
self._last_polling_error_message = error_msg
|
|
277
291
|
squeue_states = dict(_parse_squeue_output(stdout.decode(errors="ignore")))
|
|
278
292
|
|
|
279
293
|
job_ids_found_in_squeue_output = set(squeue_states.keys())
|
|
@@ -306,6 +320,7 @@ class SlurmDriver(Driver):
|
|
|
306
320
|
"scontrol did not give status for job_ids "
|
|
307
321
|
f"{missing_in_squeue_and_scontrol}, giving up for now."
|
|
308
322
|
)
|
|
323
|
+
self._last_successful_poll = time.time()
|
|
309
324
|
await asyncio.sleep(self._poll_period)
|
|
310
325
|
|
|
311
326
|
async def _process_job_update(self, job_id: str, new_info: JobInfo) -> None:
|
ert/services/__init__.py
CHANGED
ert/services/_base_service.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This file contains a more generic version of "ert services", and
|
|
3
|
+
is scheduled for removal when WebvizErt is removed.
|
|
4
|
+
"""
|
|
5
|
+
|
|
1
6
|
from __future__ import annotations
|
|
2
7
|
|
|
3
8
|
import contextlib
|
|
@@ -7,6 +12,7 @@ import os
|
|
|
7
12
|
import signal
|
|
8
13
|
import sys
|
|
9
14
|
import threading
|
|
15
|
+
import types
|
|
10
16
|
from collections.abc import Callable, Mapping, Sequence
|
|
11
17
|
from logging import Logger, getLogger
|
|
12
18
|
from pathlib import Path
|
|
@@ -15,13 +21,21 @@ from subprocess import Popen, TimeoutExpired
|
|
|
15
21
|
from tempfile import NamedTemporaryFile
|
|
16
22
|
from time import sleep
|
|
17
23
|
from types import FrameType
|
|
18
|
-
from typing import TYPE_CHECKING, Any, Generic, Self, TypeVar
|
|
24
|
+
from typing import TYPE_CHECKING, Any, Generic, Self, TypedDict, TypeVar
|
|
19
25
|
|
|
20
26
|
if TYPE_CHECKING:
|
|
21
|
-
|
|
27
|
+
pass
|
|
22
28
|
|
|
23
29
|
T = TypeVar("T", bound="BaseService")
|
|
24
|
-
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ErtServerConnectionInfo(TypedDict):
|
|
33
|
+
urls: list[str]
|
|
34
|
+
authtoken: str
|
|
35
|
+
host: str
|
|
36
|
+
port: str
|
|
37
|
+
cert: str
|
|
38
|
+
auth: str
|
|
25
39
|
|
|
26
40
|
|
|
27
41
|
SERVICE_CONF_PATHS: set[str] = set()
|
|
@@ -74,9 +88,9 @@ class _Context(Generic[T]):
|
|
|
74
88
|
|
|
75
89
|
def __exit__(
|
|
76
90
|
self,
|
|
77
|
-
exc_type: type[BaseException],
|
|
78
|
-
exc_value: BaseException,
|
|
79
|
-
traceback:
|
|
91
|
+
exc_type: type[BaseException] | None,
|
|
92
|
+
exc_value: BaseException | None,
|
|
93
|
+
traceback: types.TracebackType | None,
|
|
80
94
|
) -> bool:
|
|
81
95
|
self._service.shutdown()
|
|
82
96
|
return exc_type is None
|
|
@@ -88,7 +102,9 @@ class _Proc(threading.Thread):
|
|
|
88
102
|
service_name: str,
|
|
89
103
|
exec_args: Sequence[str],
|
|
90
104
|
timeout: int,
|
|
91
|
-
|
|
105
|
+
on_connection_info_received: Callable[
|
|
106
|
+
[ErtServerConnectionInfo | Exception | None], None
|
|
107
|
+
],
|
|
92
108
|
project: Path,
|
|
93
109
|
) -> None:
|
|
94
110
|
super().__init__()
|
|
@@ -98,7 +114,7 @@ class _Proc(threading.Thread):
|
|
|
98
114
|
self._service_name = service_name
|
|
99
115
|
self._exec_args = exec_args
|
|
100
116
|
self._timeout = timeout
|
|
101
|
-
self.
|
|
117
|
+
self._propagate_connection_info_from_childproc = on_connection_info_received
|
|
102
118
|
self._service_config_path = project / f"{self._service_name}_server.json"
|
|
103
119
|
|
|
104
120
|
fd_read, fd_write = os.pipe()
|
|
@@ -119,13 +135,13 @@ class _Proc(threading.Thread):
|
|
|
119
135
|
os.close(fd_write)
|
|
120
136
|
|
|
121
137
|
def run(self) -> None:
|
|
122
|
-
comm = self.
|
|
138
|
+
comm = self._read_connection_info_from_process(self._childproc)
|
|
123
139
|
|
|
124
140
|
if comm is None:
|
|
125
|
-
self.
|
|
141
|
+
self._propagate_connection_info_from_childproc(TimeoutError())
|
|
126
142
|
return # _read_conn_info() has already cleaned up in this case
|
|
127
143
|
|
|
128
|
-
conn_info:
|
|
144
|
+
conn_info: ErtServerConnectionInfo | Exception | None = None
|
|
129
145
|
try:
|
|
130
146
|
conn_info = json.loads(comm)
|
|
131
147
|
except json.JSONDecodeError:
|
|
@@ -134,7 +150,7 @@ class _Proc(threading.Thread):
|
|
|
134
150
|
conn_info = exc
|
|
135
151
|
|
|
136
152
|
try:
|
|
137
|
-
self.
|
|
153
|
+
self._propagate_connection_info_from_childproc(conn_info)
|
|
138
154
|
|
|
139
155
|
while True:
|
|
140
156
|
if self._childproc.poll() is not None:
|
|
@@ -148,15 +164,16 @@ class _Proc(threading.Thread):
|
|
|
148
164
|
self.logger.exception(e)
|
|
149
165
|
|
|
150
166
|
finally:
|
|
151
|
-
self.
|
|
167
|
+
self._ensure_connection_info_file_is_deleted()
|
|
152
168
|
|
|
153
169
|
def shutdown(self) -> int:
|
|
154
170
|
"""Shutdown the server."""
|
|
155
171
|
self._shutdown.set()
|
|
156
172
|
self.join()
|
|
173
|
+
|
|
157
174
|
return self._childproc.returncode
|
|
158
175
|
|
|
159
|
-
def
|
|
176
|
+
def _read_connection_info_from_process(self, proc: Popen[bytes]) -> str | None:
|
|
160
177
|
comm_buf = io.StringIO()
|
|
161
178
|
first_iter = True
|
|
162
179
|
while first_iter or proc.poll() is None:
|
|
@@ -166,7 +183,7 @@ class _Proc(threading.Thread):
|
|
|
166
183
|
# Timeout reached, exit with a failure
|
|
167
184
|
if ready == ([], [], []):
|
|
168
185
|
self._do_shutdown()
|
|
169
|
-
self.
|
|
186
|
+
self._ensure_connection_info_file_is_deleted()
|
|
170
187
|
return None
|
|
171
188
|
|
|
172
189
|
x = self._comm_pipe.read(PIPE_BUF)
|
|
@@ -190,7 +207,7 @@ class _Proc(threading.Thread):
|
|
|
190
207
|
f"waiting for child-process exceeded timeout {self._timeout}s"
|
|
191
208
|
)
|
|
192
209
|
|
|
193
|
-
def
|
|
210
|
+
def _ensure_connection_info_file_is_deleted(self) -> None:
|
|
194
211
|
"""
|
|
195
212
|
Ensure that the JSON connection information file is deleted
|
|
196
213
|
"""
|
|
@@ -241,14 +258,14 @@ class BaseService:
|
|
|
241
258
|
self,
|
|
242
259
|
exec_args: Sequence[str] = (),
|
|
243
260
|
timeout: int = 120,
|
|
244
|
-
conn_info:
|
|
261
|
+
conn_info: ErtServerConnectionInfo | Exception | None = None,
|
|
245
262
|
project: str | None = None,
|
|
246
263
|
) -> None:
|
|
247
264
|
self._exec_args = exec_args
|
|
248
265
|
self._timeout = timeout
|
|
249
266
|
|
|
250
267
|
self._proc: _Proc | None = None
|
|
251
|
-
self._conn_info:
|
|
268
|
+
self._conn_info: ErtServerConnectionInfo | Exception | None = conn_info
|
|
252
269
|
self._conn_info_event = threading.Event()
|
|
253
270
|
self._project = Path(project) if project is not None else Path.cwd()
|
|
254
271
|
|
|
@@ -261,7 +278,7 @@ class BaseService:
|
|
|
261
278
|
)
|
|
262
279
|
|
|
263
280
|
@classmethod
|
|
264
|
-
def start_server(cls
|
|
281
|
+
def start_server(cls, *args: Any, **kwargs: Any) -> _Context[Self]:
|
|
265
282
|
if cls._instance is not None:
|
|
266
283
|
raise RuntimeError("Server already running")
|
|
267
284
|
cls._instance = obj = cls(*args, **kwargs)
|
|
@@ -313,7 +330,7 @@ class BaseService:
|
|
|
313
330
|
if self._proc is not None:
|
|
314
331
|
self._proc.join()
|
|
315
332
|
|
|
316
|
-
def set_conn_info(self, info:
|
|
333
|
+
def set_conn_info(self, info: ErtServerConnectionInfo | Exception | None) -> None:
|
|
317
334
|
if self._conn_info is not None:
|
|
318
335
|
raise ValueError("Connection information already set")
|
|
319
336
|
if info is None:
|
ert/services/_storage_main.py
CHANGED
|
@@ -28,7 +28,7 @@ from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapProp
|
|
|
28
28
|
from uvicorn.supervisors import ChangeReload
|
|
29
29
|
|
|
30
30
|
from ert.logging import STORAGE_LOG_CONFIG
|
|
31
|
-
from ert.plugins import
|
|
31
|
+
from ert.plugins import setup_site_logging
|
|
32
32
|
from ert.services._base_service import BaseServiceExit
|
|
33
33
|
from ert.shared import __file__ as ert_shared_path
|
|
34
34
|
from ert.shared import find_available_socket, get_machine_name
|
|
@@ -274,10 +274,12 @@ def main() -> None:
|
|
|
274
274
|
"ssl_keyfile_password": key_pw,
|
|
275
275
|
"ssl_version": ssl.PROTOCOL_TLS_SERVER,
|
|
276
276
|
}
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
277
|
+
|
|
278
|
+
logging_conf = yaml.safe_load(
|
|
279
|
+
Path(args.logging_config or STORAGE_LOG_CONFIG).read_text(encoding="utf-8")
|
|
280
|
+
)
|
|
281
|
+
logging.config.dictConfig(logging_conf)
|
|
282
|
+
config_args.update(log_config=logging_conf)
|
|
281
283
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
|
282
284
|
|
|
283
285
|
if args.debug:
|
|
@@ -300,19 +302,19 @@ def main() -> None:
|
|
|
300
302
|
terminate_on_parent_death_thread = threading.Thread(
|
|
301
303
|
target=terminate_on_parent_death, args=[stopped, args.parent_pid, 1.0]
|
|
302
304
|
)
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
305
|
+
setup_site_logging(logging.getLogger())
|
|
306
|
+
terminate_on_parent_death_thread.start()
|
|
307
|
+
with tracer.start_as_current_span("run_storage_server", ctx):
|
|
308
|
+
logger = logging.getLogger("ert.shared.storage.info")
|
|
309
|
+
try:
|
|
310
|
+
logger.info("Starting dark storage")
|
|
311
|
+
logger.info(f"Started dark storage with parent {args.parent_pid}")
|
|
312
|
+
run_server(args, debug=False, uvicorn_config=uvicorn_config)
|
|
313
|
+
except (SystemExit, BaseServiceExit):
|
|
314
|
+
logger.info("Stopping dark storage")
|
|
315
|
+
finally:
|
|
316
|
+
stopped.set()
|
|
317
|
+
_join_terminate_thread(terminate_on_parent_death_thread)
|
|
316
318
|
|
|
317
319
|
|
|
318
320
|
if __name__ == "__main__":
|