ert 17.0.0__py3-none-any.whl → 19.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _ert/events.py +19 -2
- _ert/forward_model_runner/client.py +6 -2
- ert/__main__.py +28 -13
- ert/analysis/_enif_update.py +8 -4
- ert/analysis/_es_update.py +19 -6
- ert/analysis/_update_commons.py +16 -6
- ert/cli/main.py +13 -6
- ert/cli/monitor.py +7 -0
- ert/config/__init__.py +15 -6
- ert/config/_create_observation_dataframes.py +117 -20
- ert/config/_get_num_cpu.py +1 -1
- ert/config/_observations.py +91 -2
- ert/config/_read_summary.py +8 -6
- ert/config/design_matrix.py +51 -24
- ert/config/distribution.py +1 -1
- ert/config/ensemble_config.py +9 -17
- ert/config/ert_config.py +103 -19
- ert/config/everest_control.py +234 -0
- ert/config/{everest_objective_config.py → everest_response.py} +24 -15
- ert/config/field.py +96 -84
- ert/config/forward_model_step.py +122 -17
- ert/config/gen_data_config.py +5 -10
- ert/config/gen_kw_config.py +5 -35
- ert/config/known_response_types.py +14 -0
- ert/config/parameter_config.py +1 -33
- ert/config/parsing/_option_dict.py +10 -2
- ert/config/parsing/config_keywords.py +2 -0
- ert/config/parsing/config_schema.py +23 -3
- ert/config/parsing/config_schema_deprecations.py +3 -14
- ert/config/parsing/config_schema_item.py +26 -11
- ert/config/parsing/context_values.py +3 -3
- ert/config/parsing/file_context_token.py +1 -1
- ert/config/parsing/observations_parser.py +6 -2
- ert/config/parsing/queue_system.py +9 -0
- ert/config/parsing/schema_item_type.py +1 -0
- ert/config/queue_config.py +4 -5
- ert/config/response_config.py +0 -8
- ert/config/rft_config.py +275 -0
- ert/config/summary_config.py +3 -8
- ert/config/surface_config.py +59 -16
- ert/config/workflow_fixtures.py +2 -1
- ert/dark_storage/client/__init__.py +2 -2
- ert/dark_storage/client/_session.py +4 -4
- ert/dark_storage/client/client.py +2 -2
- ert/dark_storage/common.py +1 -1
- ert/dark_storage/compute/misfits.py +11 -7
- ert/dark_storage/endpoints/compute/misfits.py +6 -4
- ert/dark_storage/endpoints/experiment_server.py +12 -9
- ert/dark_storage/endpoints/experiments.py +2 -2
- ert/dark_storage/endpoints/observations.py +8 -6
- ert/dark_storage/endpoints/parameters.py +2 -18
- ert/dark_storage/endpoints/responses.py +24 -5
- ert/dark_storage/json_schema/experiment.py +1 -1
- ert/data/_measured_data.py +6 -5
- ert/ensemble_evaluator/__init__.py +8 -1
- ert/ensemble_evaluator/config.py +2 -1
- ert/ensemble_evaluator/evaluator.py +81 -29
- ert/ensemble_evaluator/event.py +6 -0
- ert/ensemble_evaluator/snapshot.py +3 -1
- ert/ensemble_evaluator/state.py +1 -0
- ert/field_utils/__init__.py +8 -0
- ert/field_utils/field_utils.py +212 -3
- ert/field_utils/roff_io.py +1 -1
- ert/gui/__init__.py +5 -2
- ert/gui/ertnotifier.py +1 -1
- ert/gui/ertwidgets/__init__.py +23 -16
- ert/gui/ertwidgets/analysismoduleedit.py +2 -2
- ert/gui/ertwidgets/checklist.py +1 -1
- ert/gui/ertwidgets/create_experiment_dialog.py +3 -1
- ert/gui/ertwidgets/ensembleselector.py +2 -2
- ert/gui/ertwidgets/models/__init__.py +2 -0
- ert/gui/ertwidgets/models/activerealizationsmodel.py +2 -1
- ert/gui/ertwidgets/models/path_model.py +1 -1
- ert/gui/ertwidgets/models/targetensemblemodel.py +2 -1
- ert/gui/ertwidgets/models/text_model.py +1 -1
- ert/gui/ertwidgets/pathchooser.py +0 -3
- ert/gui/ertwidgets/searchbox.py +13 -4
- ert/gui/{suggestor → ertwidgets/suggestor}/_suggestor_message.py +13 -4
- ert/gui/{suggestor → ertwidgets/suggestor}/suggestor.py +63 -30
- ert/gui/main.py +37 -8
- ert/gui/main_window.py +1 -7
- ert/gui/simulation/ensemble_experiment_panel.py +1 -1
- ert/gui/simulation/ensemble_information_filter_panel.py +1 -1
- ert/gui/simulation/ensemble_smoother_panel.py +1 -1
- ert/gui/simulation/evaluate_ensemble_panel.py +1 -1
- ert/gui/simulation/experiment_panel.py +16 -3
- ert/gui/simulation/manual_update_panel.py +31 -8
- ert/gui/simulation/multiple_data_assimilation_panel.py +12 -8
- ert/gui/simulation/run_dialog.py +27 -20
- ert/gui/simulation/single_test_run_panel.py +2 -2
- ert/gui/summarypanel.py +20 -1
- ert/gui/tools/load_results/load_results_panel.py +1 -1
- ert/gui/tools/manage_experiments/export_dialog.py +136 -0
- ert/gui/tools/manage_experiments/storage_info_widget.py +121 -16
- ert/gui/tools/manage_experiments/storage_widget.py +1 -2
- ert/gui/tools/plot/plot_api.py +37 -25
- ert/gui/tools/plot/plot_widget.py +10 -2
- ert/gui/tools/plot/plot_window.py +38 -18
- ert/gui/tools/plot/plottery/plot_config.py +2 -0
- ert/gui/tools/plot/plottery/plot_context.py +14 -0
- ert/gui/tools/plot/plottery/plots/__init__.py +2 -0
- ert/gui/tools/plot/plottery/plots/cesp.py +3 -1
- ert/gui/tools/plot/plottery/plots/distribution.py +6 -1
- ert/gui/tools/plot/plottery/plots/ensemble.py +12 -3
- ert/gui/tools/plot/plottery/plots/gaussian_kde.py +12 -2
- ert/gui/tools/plot/plottery/plots/histogram.py +3 -1
- ert/gui/tools/plot/plottery/plots/misfits.py +436 -0
- ert/gui/tools/plot/plottery/plots/observations.py +18 -4
- ert/gui/tools/plot/plottery/plots/statistics.py +62 -20
- ert/gui/tools/plot/plottery/plots/std_dev.py +3 -1
- ert/mode_definitions.py +2 -0
- ert/plugins/__init__.py +0 -1
- ert/plugins/hook_implementations/workflows/csv_export.py +2 -3
- ert/plugins/hook_implementations/workflows/gen_data_rft_export.py +10 -2
- ert/plugins/hook_specifications/__init__.py +0 -2
- ert/plugins/hook_specifications/jobs.py +0 -9
- ert/plugins/plugin_manager.py +6 -33
- ert/resources/forward_models/run_reservoirsimulator.py +8 -3
- ert/resources/shell_scripts/delete_directory.py +2 -2
- ert/run_models/__init__.py +18 -5
- ert/run_models/_create_run_path.py +131 -37
- ert/run_models/ensemble_experiment.py +10 -4
- ert/run_models/ensemble_information_filter.py +8 -1
- ert/run_models/ensemble_smoother.py +9 -3
- ert/run_models/evaluate_ensemble.py +8 -6
- ert/run_models/event.py +7 -3
- ert/run_models/everest_run_model.py +159 -46
- ert/run_models/initial_ensemble_run_model.py +25 -24
- ert/run_models/manual_update.py +6 -3
- ert/run_models/manual_update_enif.py +37 -0
- ert/run_models/model_factory.py +81 -21
- ert/run_models/multiple_data_assimilation.py +22 -11
- ert/run_models/run_model.py +64 -55
- ert/run_models/single_test_run.py +7 -4
- ert/run_models/update_run_model.py +4 -2
- ert/runpaths.py +5 -6
- ert/sample_prior.py +9 -4
- ert/scheduler/driver.py +37 -0
- ert/scheduler/event.py +3 -1
- ert/scheduler/job.py +23 -13
- ert/scheduler/lsf_driver.py +6 -2
- ert/scheduler/openpbs_driver.py +7 -1
- ert/scheduler/scheduler.py +5 -0
- ert/scheduler/slurm_driver.py +6 -2
- ert/services/__init__.py +2 -2
- ert/services/_base_service.py +37 -20
- ert/services/ert_server.py +317 -0
- ert/shared/_doc_utils/__init__.py +4 -2
- ert/shared/_doc_utils/ert_jobs.py +1 -4
- ert/shared/net_utils.py +43 -18
- ert/shared/storage/connection.py +3 -3
- ert/shared/version.py +3 -3
- ert/storage/__init__.py +2 -0
- ert/storage/local_ensemble.py +38 -12
- ert/storage/local_experiment.py +8 -16
- ert/storage/local_storage.py +68 -42
- ert/storage/migration/to11.py +1 -1
- ert/storage/migration/to16.py +38 -0
- ert/storage/migration/to17.py +42 -0
- ert/storage/migration/to18.py +11 -0
- ert/storage/migration/to19.py +34 -0
- ert/storage/migration/to20.py +23 -0
- ert/storage/migration/to21.py +25 -0
- ert/storage/migration/to8.py +4 -4
- ert/substitutions.py +12 -28
- ert/validation/active_range.py +7 -7
- ert/validation/rangestring.py +16 -16
- ert/workflow_runner.py +2 -1
- {ert-17.0.0.dist-info → ert-19.0.0rc2.dist-info}/METADATA +9 -8
- {ert-17.0.0.dist-info → ert-19.0.0rc2.dist-info}/RECORD +208 -205
- {ert-17.0.0.dist-info → ert-19.0.0rc2.dist-info}/WHEEL +1 -1
- everest/api/everest_data_api.py +14 -1
- everest/bin/config_branch_script.py +3 -6
- everest/bin/everconfigdump_script.py +1 -9
- everest/bin/everest_script.py +21 -11
- everest/bin/everlint_script.py +0 -2
- everest/bin/kill_script.py +2 -2
- everest/bin/monitor_script.py +2 -2
- everest/bin/utils.py +8 -4
- everest/bin/visualization_script.py +6 -14
- everest/config/__init__.py +4 -1
- everest/config/control_config.py +81 -6
- everest/config/control_variable_config.py +4 -3
- everest/config/everest_config.py +75 -42
- everest/config/forward_model_config.py +5 -3
- everest/config/install_data_config.py +7 -5
- everest/config/install_job_config.py +7 -3
- everest/config/install_template_config.py +3 -3
- everest/config/optimization_config.py +19 -6
- everest/config/output_constraint_config.py +8 -2
- everest/config/server_config.py +6 -49
- everest/config/utils.py +25 -105
- everest/config/validation_utils.py +17 -11
- everest/config_file_loader.py +13 -4
- everest/detached/client.py +3 -3
- everest/detached/everserver.py +7 -8
- everest/everest_storage.py +6 -12
- everest/gui/everest_client.py +2 -3
- everest/gui/main_window.py +2 -2
- everest/optimizer/everest2ropt.py +59 -32
- everest/optimizer/opt_model_transforms.py +12 -13
- everest/optimizer/utils.py +0 -29
- everest/strings.py +0 -5
- ert/config/everest_constraints_config.py +0 -95
- ert/config/ext_param_config.py +0 -106
- ert/gui/tools/export/__init__.py +0 -3
- ert/gui/tools/export/export_panel.py +0 -83
- ert/gui/tools/export/export_tool.py +0 -69
- ert/gui/tools/export/exporter.py +0 -36
- ert/services/storage_service.py +0 -127
- everest/config/sampler_config.py +0 -103
- everest/simulator/__init__.py +0 -88
- everest/simulator/everest_to_ert.py +0 -51
- /ert/gui/{suggestor → ertwidgets/suggestor}/__init__.py +0 -0
- /ert/gui/{suggestor → ertwidgets/suggestor}/_colors.py +0 -0
- {ert-17.0.0.dist-info → ert-19.0.0rc2.dist-info}/entry_points.txt +0 -0
- {ert-17.0.0.dist-info → ert-19.0.0rc2.dist-info}/licenses/COPYING +0 -0
- {ert-17.0.0.dist-info → ert-19.0.0rc2.dist-info}/top_level.txt +0 -0
ert/runpaths.py
CHANGED
|
@@ -101,14 +101,13 @@ class Runpaths:
|
|
|
101
101
|
with open(self.runpath_list_filename, "w", encoding="utf-8") as filehandle:
|
|
102
102
|
for iteration in iteration_numbers:
|
|
103
103
|
for realization in realization_numbers:
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
realization,
|
|
107
|
-
iteration,
|
|
104
|
+
real_iter_substituter = self.substitutions.real_iter_substituter(
|
|
105
|
+
realization, iteration
|
|
108
106
|
)
|
|
109
|
-
|
|
110
|
-
self.
|
|
107
|
+
job_name_or_eclbase = real_iter_substituter.substitute(
|
|
108
|
+
self._eclbase or self._jobname_format,
|
|
111
109
|
)
|
|
110
|
+
runpath = real_iter_substituter.substitute(self._runpath_format)
|
|
112
111
|
|
|
113
112
|
filehandle.write(
|
|
114
113
|
f"{realization:03d} {runpath} "
|
ert/sample_prior.py
CHANGED
|
@@ -33,6 +33,7 @@ def sample_prior(
|
|
|
33
33
|
parameter_configs = ensemble.experiment.parameter_configuration
|
|
34
34
|
if parameters is None:
|
|
35
35
|
parameters = list(parameter_configs.keys())
|
|
36
|
+
complete_dataset: pl.DataFrame | None = None
|
|
36
37
|
for parameter in parameters:
|
|
37
38
|
config_node = parameter_configs[parameter]
|
|
38
39
|
if config_node.forward_init:
|
|
@@ -75,13 +76,17 @@ def sample_prior(
|
|
|
75
76
|
if datasets:
|
|
76
77
|
dataset = pl.concat(datasets, how="vertical")
|
|
77
78
|
|
|
78
|
-
if
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
)
|
|
79
|
+
if complete_dataset is None:
|
|
80
|
+
complete_dataset = dataset
|
|
81
|
+
elif dataset is not None:
|
|
82
|
+
complete_dataset = complete_dataset.join(dataset, on="realization")
|
|
82
83
|
else:
|
|
83
84
|
for realization_nr in active_realizations:
|
|
84
85
|
ds = config_node.read_from_runpath(Path(), realization_nr, 0)
|
|
85
86
|
ensemble.save_parameters(ds, parameter, realization_nr)
|
|
86
87
|
|
|
88
|
+
if complete_dataset is not None:
|
|
89
|
+
ensemble.save_parameters(
|
|
90
|
+
dataset=complete_dataset,
|
|
91
|
+
)
|
|
87
92
|
ensemble.refresh_ensemble_state()
|
ert/scheduler/driver.py
CHANGED
|
@@ -3,10 +3,13 @@ from __future__ import annotations
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import logging
|
|
5
5
|
import shlex
|
|
6
|
+
import time
|
|
6
7
|
from abc import ABC, abstractmethod
|
|
7
8
|
from collections.abc import Iterable
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
|
|
11
|
+
from _ert.events import EnsembleEvaluationWarning
|
|
12
|
+
|
|
10
13
|
from .event import DriverEvent
|
|
11
14
|
|
|
12
15
|
SIGNAL_OFFSET = 128
|
|
@@ -34,12 +37,19 @@ class FailedSubmit(RuntimeError):
|
|
|
34
37
|
class Driver(ABC):
|
|
35
38
|
"""Adapter for the HPC cluster."""
|
|
36
39
|
|
|
40
|
+
POLLING_TIMEOUT_PERIOD = 600
|
|
41
|
+
|
|
37
42
|
def __init__(self, activate_script: str = "") -> None:
|
|
38
43
|
self._event_queue: asyncio.Queue[DriverEvent] | None = None
|
|
39
44
|
self._job_error_message_by_iens: dict[int, str] = {}
|
|
40
45
|
self.activate_script = activate_script
|
|
41
46
|
self._poll_period = _POLL_PERIOD
|
|
42
47
|
|
|
48
|
+
self._polling_timeout_period = Driver.POLLING_TIMEOUT_PERIOD
|
|
49
|
+
self._last_successful_poll = time.time()
|
|
50
|
+
self._last_polling_error_message: str | None = None
|
|
51
|
+
self._has_warned_evaluator_of_polling_error = False
|
|
52
|
+
|
|
43
53
|
@property
|
|
44
54
|
def event_queue(self) -> asyncio.Queue[DriverEvent]:
|
|
45
55
|
if self._event_queue is None:
|
|
@@ -178,3 +188,30 @@ class Driver(ABC):
|
|
|
178
188
|
)
|
|
179
189
|
logger.error(error_message)
|
|
180
190
|
return False, error_message
|
|
191
|
+
|
|
192
|
+
async def _warn_evaluator_if_polling_has_failed_for_some_time(self) -> None:
|
|
193
|
+
if (
|
|
194
|
+
(self._last_successful_poll < time.time() - self._polling_timeout_period)
|
|
195
|
+
and self._last_polling_error_message
|
|
196
|
+
and not self._has_warned_evaluator_of_polling_error
|
|
197
|
+
):
|
|
198
|
+
await self._warn_evaluator_about_polling_difficulties()
|
|
199
|
+
self._has_warned_evaluator_of_polling_error = True
|
|
200
|
+
|
|
201
|
+
async def _warn_evaluator_about_polling_difficulties(self) -> None:
|
|
202
|
+
last_polling_error_message = self._last_polling_error_message
|
|
203
|
+
logger = logging.getLogger(__name__)
|
|
204
|
+
logger.warning(
|
|
205
|
+
"Driver has not successfully polled statuses for "
|
|
206
|
+
f"{self._polling_timeout_period}s. The previous error "
|
|
207
|
+
f"was due to '{last_polling_error_message}'"
|
|
208
|
+
)
|
|
209
|
+
formatted_msg = (
|
|
210
|
+
"ert has not been able to update the job status for some time. This might "
|
|
211
|
+
"be resolved by itself, and it does not mean that the run has crashed.\n"
|
|
212
|
+
"Please check the runpath if it seems to still be running.\n"
|
|
213
|
+
f"The last error message was '{last_polling_error_message}'"
|
|
214
|
+
)
|
|
215
|
+
await self.event_queue.put(
|
|
216
|
+
EnsembleEvaluationWarning(warning_message=formatted_msg)
|
|
217
|
+
)
|
ert/scheduler/event.py
CHANGED
|
@@ -2,6 +2,8 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
|
|
5
|
+
from _ert.events import EnsembleEvaluationWarning
|
|
6
|
+
|
|
5
7
|
|
|
6
8
|
@dataclass
|
|
7
9
|
class StartedEvent:
|
|
@@ -16,4 +18,4 @@ class FinishedEvent:
|
|
|
16
18
|
exec_hosts: str = "-"
|
|
17
19
|
|
|
18
20
|
|
|
19
|
-
DriverEvent = StartedEvent | FinishedEvent
|
|
21
|
+
DriverEvent = StartedEvent | FinishedEvent | EnsembleEvaluationWarning
|
ert/scheduler/job.py
CHANGED
|
@@ -12,6 +12,7 @@ from enum import StrEnum
|
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
from typing import TYPE_CHECKING, assert_never
|
|
14
14
|
|
|
15
|
+
import anyio
|
|
15
16
|
from lxml import etree
|
|
16
17
|
from opentelemetry.trace import Status, StatusCode
|
|
17
18
|
|
|
@@ -31,7 +32,7 @@ from ert.storage import (
|
|
|
31
32
|
RealizationStorageState,
|
|
32
33
|
load_realization_parameters_and_responses,
|
|
33
34
|
)
|
|
34
|
-
from ert.trace import trace
|
|
35
|
+
from ert.trace import trace
|
|
35
36
|
from ert.warnings import PostSimulationWarning
|
|
36
37
|
|
|
37
38
|
from .driver import Driver, FailedSubmit
|
|
@@ -238,7 +239,6 @@ class Job:
|
|
|
238
239
|
f"{method_name} spent {elapsed_time} seconds waiting for files"
|
|
239
240
|
)
|
|
240
241
|
|
|
241
|
-
@tracer.start_as_current_span(f"{__name__}.run")
|
|
242
242
|
async def run(
|
|
243
243
|
self,
|
|
244
244
|
sem: asyncio.BoundedSemaphore,
|
|
@@ -335,8 +335,12 @@ class Job:
|
|
|
335
335
|
|
|
336
336
|
valid_checksums = [info for info in checksum.values() if "error" not in info]
|
|
337
337
|
|
|
338
|
-
|
|
339
|
-
|
|
338
|
+
async def all_paths_exist(paths: list[Path]) -> bool:
|
|
339
|
+
return all(
|
|
340
|
+
await asyncio.gather(*[anyio.Path(path).exists() for path in paths])
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
while not await all_paths_exist([info["path"] for info in valid_checksums]):
|
|
340
344
|
if timeout <= 0:
|
|
341
345
|
break
|
|
342
346
|
timeout -= DISK_SYNCHRONIZATION_POLLING_INTERVAL
|
|
@@ -344,17 +348,19 @@ class Job:
|
|
|
344
348
|
await asyncio.sleep(DISK_SYNCHRONIZATION_POLLING_INTERVAL)
|
|
345
349
|
async with checksum_lock:
|
|
346
350
|
for info in valid_checksums:
|
|
347
|
-
file_path = Path(info["path"])
|
|
351
|
+
file_path = anyio.Path(info["path"])
|
|
348
352
|
expected_md5sum = info.get("md5sum")
|
|
349
|
-
|
|
350
|
-
|
|
353
|
+
file_path_exists = await file_path.exists()
|
|
354
|
+
if file_path_exists and expected_md5sum:
|
|
355
|
+
file_bytes = await file_path.read_bytes()
|
|
356
|
+
actual_md5sum = hashlib.md5(file_bytes).hexdigest()
|
|
351
357
|
if expected_md5sum == actual_md5sum:
|
|
352
358
|
logger.debug(f"File {file_path} checksum successful.")
|
|
353
359
|
else:
|
|
354
360
|
logger.warning(
|
|
355
361
|
f"File {file_path} checksum verification failed."
|
|
356
362
|
)
|
|
357
|
-
elif
|
|
363
|
+
elif file_path_exists and expected_md5sum is None:
|
|
358
364
|
logger.warning(f"Checksum not received for file {file_path}")
|
|
359
365
|
else:
|
|
360
366
|
logger.error(f"Disk synchronization failed for {file_path}")
|
|
@@ -506,11 +512,12 @@ async def log_warnings_from_forward_model(
|
|
|
506
512
|
or "- ERROR - " in line
|
|
507
513
|
)
|
|
508
514
|
|
|
509
|
-
async def log_warnings_from_file(
|
|
515
|
+
async def log_warnings_from_file(
|
|
510
516
|
file: Path, iens: int, step: ForwardModelStep, step_idx: int, filetype: str
|
|
511
517
|
) -> None:
|
|
512
518
|
captured: list[str] = []
|
|
513
|
-
|
|
519
|
+
file_text = await anyio.Path(file).read_text(encoding="utf-8")
|
|
520
|
+
for line in file_text.splitlines():
|
|
514
521
|
if line_contains_warning(line):
|
|
515
522
|
captured.append(line[:max_length])
|
|
516
523
|
|
|
@@ -527,9 +534,12 @@ async def log_warnings_from_forward_model(
|
|
|
527
534
|
return 0
|
|
528
535
|
remaining_timeout = _timeout
|
|
529
536
|
for _ in range(_timeout):
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
537
|
+
file_path_exists = await anyio.Path(file_path).exists()
|
|
538
|
+
if file_path_exists:
|
|
539
|
+
st_mtime = (await anyio.Path(file_path).stat()).st_mtime
|
|
540
|
+
else:
|
|
541
|
+
st_mtime = 0
|
|
542
|
+
if not (file_path_exists and st_mtime >= job_submission_time):
|
|
533
543
|
remaining_timeout -= 1
|
|
534
544
|
await asyncio.sleep(1)
|
|
535
545
|
else:
|
ert/scheduler/lsf_driver.py
CHANGED
|
@@ -444,6 +444,7 @@ class LsfDriver(Driver):
|
|
|
444
444
|
|
|
445
445
|
async def poll(self) -> None:
|
|
446
446
|
while True:
|
|
447
|
+
await self._warn_evaluator_if_polling_has_failed_for_some_time()
|
|
447
448
|
if not self._jobs.keys():
|
|
448
449
|
await asyncio.sleep(self._poll_period)
|
|
449
450
|
continue
|
|
@@ -461,6 +462,7 @@ class LsfDriver(Driver):
|
|
|
461
462
|
)
|
|
462
463
|
except OSError as e:
|
|
463
464
|
logger.error(str(e))
|
|
465
|
+
self._last_polling_error_message = str(e)
|
|
464
466
|
await asyncio.sleep(self._poll_period)
|
|
465
467
|
continue
|
|
466
468
|
|
|
@@ -468,10 +470,11 @@ class LsfDriver(Driver):
|
|
|
468
470
|
if process.returncode:
|
|
469
471
|
# bjobs may give nonzero return code even when it is providing
|
|
470
472
|
# at least some correct information
|
|
473
|
+
error_msg = stderr.decode()
|
|
471
474
|
logger.warning(
|
|
472
|
-
f"bjobs gave returncode {process.returncode} "
|
|
473
|
-
f"and error {stderr.decode()}"
|
|
475
|
+
f"bjobs gave returncode {process.returncode} and error {error_msg}"
|
|
474
476
|
)
|
|
477
|
+
self._last_polling_error_message = error_msg
|
|
475
478
|
bjobs_states = _parse_jobs_dict(parse_bjobs(stdout.decode(errors="ignore")))
|
|
476
479
|
self.update_and_log_exec_hosts(
|
|
477
480
|
parse_bjobs_exec_hosts(stdout.decode(errors="ignore"))
|
|
@@ -503,6 +506,7 @@ class LsfDriver(Driver):
|
|
|
503
506
|
"bhist did not give status for job_ids "
|
|
504
507
|
f"{missing_in_bhist_and_bjobs}, giving up for now."
|
|
505
508
|
)
|
|
509
|
+
self._last_successful_poll = time.time()
|
|
506
510
|
await asyncio.sleep(self._poll_period)
|
|
507
511
|
|
|
508
512
|
async def _process_job_update(self, job_id: str, new_state: AnyJob) -> None:
|
ert/scheduler/openpbs_driver.py
CHANGED
|
@@ -5,6 +5,7 @@ import json
|
|
|
5
5
|
import logging
|
|
6
6
|
import shlex
|
|
7
7
|
import shutil
|
|
8
|
+
import time
|
|
8
9
|
from collections.abc import Iterable, Mapping, MutableMapping
|
|
9
10
|
from dataclasses import dataclass
|
|
10
11
|
from pathlib import Path
|
|
@@ -264,6 +265,7 @@ class OpenPBSDriver(Driver):
|
|
|
264
265
|
|
|
265
266
|
async def poll(self) -> None:
|
|
266
267
|
while True:
|
|
268
|
+
await self._warn_evaluator_if_polling_has_failed_for_some_time()
|
|
267
269
|
if not self._jobs:
|
|
268
270
|
await asyncio.sleep(self._poll_period)
|
|
269
271
|
continue
|
|
@@ -280,6 +282,7 @@ class OpenPBSDriver(Driver):
|
|
|
280
282
|
)
|
|
281
283
|
except OSError as e:
|
|
282
284
|
logger.error(str(e))
|
|
285
|
+
self._last_polling_error_message = str(e)
|
|
283
286
|
await asyncio.sleep(self._poll_period)
|
|
284
287
|
continue
|
|
285
288
|
stdout, stderr = await process.communicate()
|
|
@@ -289,10 +292,12 @@ class OpenPBSDriver(Driver):
|
|
|
289
292
|
await asyncio.sleep(self._poll_period)
|
|
290
293
|
continue
|
|
291
294
|
if process.returncode == QSTAT_UNKNOWN_JOB_ID:
|
|
295
|
+
error_msg = stderr.decode(errors="ignore")
|
|
292
296
|
logger.debug(
|
|
293
297
|
f"qstat gave returncode {QSTAT_UNKNOWN_JOB_ID} "
|
|
294
|
-
f"with message {
|
|
298
|
+
f"with message {error_msg}"
|
|
295
299
|
)
|
|
300
|
+
self._last_polling_error_message = error_msg
|
|
296
301
|
parsed_jobs = _parse_jobs_dict(
|
|
297
302
|
parse_qstat(stdout.decode(errors="ignore"))
|
|
298
303
|
)
|
|
@@ -330,6 +335,7 @@ class OpenPBSDriver(Driver):
|
|
|
330
335
|
for job_id, job in parsed_jobs_dict.items():
|
|
331
336
|
await self._process_job_update(job_id, job)
|
|
332
337
|
|
|
338
|
+
self._last_successful_poll = time.time()
|
|
333
339
|
await asyncio.sleep(self._poll_period)
|
|
334
340
|
|
|
335
341
|
async def _process_job_update(self, job_id: str, new_state: AnyJob) -> None:
|
ert/scheduler/scheduler.py
CHANGED
|
@@ -15,6 +15,7 @@ import orjson
|
|
|
15
15
|
from pydantic.dataclasses import dataclass
|
|
16
16
|
|
|
17
17
|
from _ert.events import (
|
|
18
|
+
EnsembleEvaluationWarning,
|
|
18
19
|
ForwardModelStepChecksum,
|
|
19
20
|
RealizationEvent,
|
|
20
21
|
RealizationFailed,
|
|
@@ -343,6 +344,10 @@ class Scheduler:
|
|
|
343
344
|
async def _process_event_queue(self) -> None:
|
|
344
345
|
while True:
|
|
345
346
|
event = await self.driver.event_queue.get()
|
|
347
|
+
if isinstance(event, EnsembleEvaluationWarning):
|
|
348
|
+
if self._ensemble_evaluator_queue:
|
|
349
|
+
await self._ensemble_evaluator_queue.put(event)
|
|
350
|
+
continue
|
|
346
351
|
job = self._jobs[event.iens]
|
|
347
352
|
|
|
348
353
|
# Any event implies the job has at least started
|
ert/scheduler/slurm_driver.py
CHANGED
|
@@ -262,6 +262,7 @@ class SlurmDriver(Driver):
|
|
|
262
262
|
|
|
263
263
|
async def poll(self) -> None:
|
|
264
264
|
while True:
|
|
265
|
+
await self._warn_evaluator_if_polling_has_failed_for_some_time()
|
|
265
266
|
if not self._jobs.keys():
|
|
266
267
|
await asyncio.sleep(self._poll_period)
|
|
267
268
|
continue
|
|
@@ -277,14 +278,16 @@ class SlurmDriver(Driver):
|
|
|
277
278
|
)
|
|
278
279
|
except OSError as e:
|
|
279
280
|
logger.error(str(e))
|
|
281
|
+
self._last_polling_error_message = str(e)
|
|
280
282
|
await asyncio.sleep(self._poll_period)
|
|
281
283
|
continue
|
|
282
284
|
stdout, stderr = await process.communicate()
|
|
283
285
|
if process.returncode:
|
|
286
|
+
error_msg = stderr.decode()
|
|
284
287
|
logger.warning(
|
|
285
|
-
f"squeue gave returncode {process.returncode} "
|
|
286
|
-
f"and error {stderr.decode()}"
|
|
288
|
+
f"squeue gave returncode {process.returncode} and error {error_msg}"
|
|
287
289
|
)
|
|
290
|
+
self._last_polling_error_message = error_msg
|
|
288
291
|
squeue_states = dict(_parse_squeue_output(stdout.decode(errors="ignore")))
|
|
289
292
|
|
|
290
293
|
job_ids_found_in_squeue_output = set(squeue_states.keys())
|
|
@@ -317,6 +320,7 @@ class SlurmDriver(Driver):
|
|
|
317
320
|
"scontrol did not give status for job_ids "
|
|
318
321
|
f"{missing_in_squeue_and_scontrol}, giving up for now."
|
|
319
322
|
)
|
|
323
|
+
self._last_successful_poll = time.time()
|
|
320
324
|
await asyncio.sleep(self._poll_period)
|
|
321
325
|
|
|
322
326
|
async def _process_job_update(self, job_id: str, new_info: JobInfo) -> None:
|
ert/services/__init__.py
CHANGED
ert/services/_base_service.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This file contains a more generic version of "ert services", and
|
|
3
|
+
is scheduled for removal when WebvizErt is removed.
|
|
4
|
+
"""
|
|
5
|
+
|
|
1
6
|
from __future__ import annotations
|
|
2
7
|
|
|
3
8
|
import contextlib
|
|
@@ -7,6 +12,7 @@ import os
|
|
|
7
12
|
import signal
|
|
8
13
|
import sys
|
|
9
14
|
import threading
|
|
15
|
+
import types
|
|
10
16
|
from collections.abc import Callable, Mapping, Sequence
|
|
11
17
|
from logging import Logger, getLogger
|
|
12
18
|
from pathlib import Path
|
|
@@ -15,13 +21,21 @@ from subprocess import Popen, TimeoutExpired
|
|
|
15
21
|
from tempfile import NamedTemporaryFile
|
|
16
22
|
from time import sleep
|
|
17
23
|
from types import FrameType
|
|
18
|
-
from typing import TYPE_CHECKING, Any, Generic, Self, TypeVar
|
|
24
|
+
from typing import TYPE_CHECKING, Any, Generic, Self, TypedDict, TypeVar
|
|
19
25
|
|
|
20
26
|
if TYPE_CHECKING:
|
|
21
|
-
|
|
27
|
+
pass
|
|
22
28
|
|
|
23
29
|
T = TypeVar("T", bound="BaseService")
|
|
24
|
-
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ErtServerConnectionInfo(TypedDict):
|
|
33
|
+
urls: list[str]
|
|
34
|
+
authtoken: str
|
|
35
|
+
host: str
|
|
36
|
+
port: str
|
|
37
|
+
cert: str
|
|
38
|
+
auth: str
|
|
25
39
|
|
|
26
40
|
|
|
27
41
|
SERVICE_CONF_PATHS: set[str] = set()
|
|
@@ -74,9 +88,9 @@ class _Context(Generic[T]):
|
|
|
74
88
|
|
|
75
89
|
def __exit__(
|
|
76
90
|
self,
|
|
77
|
-
exc_type: type[BaseException],
|
|
78
|
-
exc_value: BaseException,
|
|
79
|
-
traceback:
|
|
91
|
+
exc_type: type[BaseException] | None,
|
|
92
|
+
exc_value: BaseException | None,
|
|
93
|
+
traceback: types.TracebackType | None,
|
|
80
94
|
) -> bool:
|
|
81
95
|
self._service.shutdown()
|
|
82
96
|
return exc_type is None
|
|
@@ -88,7 +102,9 @@ class _Proc(threading.Thread):
|
|
|
88
102
|
service_name: str,
|
|
89
103
|
exec_args: Sequence[str],
|
|
90
104
|
timeout: int,
|
|
91
|
-
|
|
105
|
+
on_connection_info_received: Callable[
|
|
106
|
+
[ErtServerConnectionInfo | Exception | None], None
|
|
107
|
+
],
|
|
92
108
|
project: Path,
|
|
93
109
|
) -> None:
|
|
94
110
|
super().__init__()
|
|
@@ -98,7 +114,7 @@ class _Proc(threading.Thread):
|
|
|
98
114
|
self._service_name = service_name
|
|
99
115
|
self._exec_args = exec_args
|
|
100
116
|
self._timeout = timeout
|
|
101
|
-
self.
|
|
117
|
+
self._propagate_connection_info_from_childproc = on_connection_info_received
|
|
102
118
|
self._service_config_path = project / f"{self._service_name}_server.json"
|
|
103
119
|
|
|
104
120
|
fd_read, fd_write = os.pipe()
|
|
@@ -119,13 +135,13 @@ class _Proc(threading.Thread):
|
|
|
119
135
|
os.close(fd_write)
|
|
120
136
|
|
|
121
137
|
def run(self) -> None:
|
|
122
|
-
comm = self.
|
|
138
|
+
comm = self._read_connection_info_from_process(self._childproc)
|
|
123
139
|
|
|
124
140
|
if comm is None:
|
|
125
|
-
self.
|
|
141
|
+
self._propagate_connection_info_from_childproc(TimeoutError())
|
|
126
142
|
return # _read_conn_info() has already cleaned up in this case
|
|
127
143
|
|
|
128
|
-
conn_info:
|
|
144
|
+
conn_info: ErtServerConnectionInfo | Exception | None = None
|
|
129
145
|
try:
|
|
130
146
|
conn_info = json.loads(comm)
|
|
131
147
|
except json.JSONDecodeError:
|
|
@@ -134,7 +150,7 @@ class _Proc(threading.Thread):
|
|
|
134
150
|
conn_info = exc
|
|
135
151
|
|
|
136
152
|
try:
|
|
137
|
-
self.
|
|
153
|
+
self._propagate_connection_info_from_childproc(conn_info)
|
|
138
154
|
|
|
139
155
|
while True:
|
|
140
156
|
if self._childproc.poll() is not None:
|
|
@@ -148,15 +164,16 @@ class _Proc(threading.Thread):
|
|
|
148
164
|
self.logger.exception(e)
|
|
149
165
|
|
|
150
166
|
finally:
|
|
151
|
-
self.
|
|
167
|
+
self._ensure_connection_info_file_is_deleted()
|
|
152
168
|
|
|
153
169
|
def shutdown(self) -> int:
|
|
154
170
|
"""Shutdown the server."""
|
|
155
171
|
self._shutdown.set()
|
|
156
172
|
self.join()
|
|
173
|
+
|
|
157
174
|
return self._childproc.returncode
|
|
158
175
|
|
|
159
|
-
def
|
|
176
|
+
def _read_connection_info_from_process(self, proc: Popen[bytes]) -> str | None:
|
|
160
177
|
comm_buf = io.StringIO()
|
|
161
178
|
first_iter = True
|
|
162
179
|
while first_iter or proc.poll() is None:
|
|
@@ -166,7 +183,7 @@ class _Proc(threading.Thread):
|
|
|
166
183
|
# Timeout reached, exit with a failure
|
|
167
184
|
if ready == ([], [], []):
|
|
168
185
|
self._do_shutdown()
|
|
169
|
-
self.
|
|
186
|
+
self._ensure_connection_info_file_is_deleted()
|
|
170
187
|
return None
|
|
171
188
|
|
|
172
189
|
x = self._comm_pipe.read(PIPE_BUF)
|
|
@@ -190,7 +207,7 @@ class _Proc(threading.Thread):
|
|
|
190
207
|
f"waiting for child-process exceeded timeout {self._timeout}s"
|
|
191
208
|
)
|
|
192
209
|
|
|
193
|
-
def
|
|
210
|
+
def _ensure_connection_info_file_is_deleted(self) -> None:
|
|
194
211
|
"""
|
|
195
212
|
Ensure that the JSON connection information file is deleted
|
|
196
213
|
"""
|
|
@@ -241,14 +258,14 @@ class BaseService:
|
|
|
241
258
|
self,
|
|
242
259
|
exec_args: Sequence[str] = (),
|
|
243
260
|
timeout: int = 120,
|
|
244
|
-
conn_info:
|
|
261
|
+
conn_info: ErtServerConnectionInfo | Exception | None = None,
|
|
245
262
|
project: str | None = None,
|
|
246
263
|
) -> None:
|
|
247
264
|
self._exec_args = exec_args
|
|
248
265
|
self._timeout = timeout
|
|
249
266
|
|
|
250
267
|
self._proc: _Proc | None = None
|
|
251
|
-
self._conn_info:
|
|
268
|
+
self._conn_info: ErtServerConnectionInfo | Exception | None = conn_info
|
|
252
269
|
self._conn_info_event = threading.Event()
|
|
253
270
|
self._project = Path(project) if project is not None else Path.cwd()
|
|
254
271
|
|
|
@@ -261,7 +278,7 @@ class BaseService:
|
|
|
261
278
|
)
|
|
262
279
|
|
|
263
280
|
@classmethod
|
|
264
|
-
def start_server(cls
|
|
281
|
+
def start_server(cls, *args: Any, **kwargs: Any) -> _Context[Self]:
|
|
265
282
|
if cls._instance is not None:
|
|
266
283
|
raise RuntimeError("Server already running")
|
|
267
284
|
cls._instance = obj = cls(*args, **kwargs)
|
|
@@ -313,7 +330,7 @@ class BaseService:
|
|
|
313
330
|
if self._proc is not None:
|
|
314
331
|
self._proc.join()
|
|
315
332
|
|
|
316
|
-
def set_conn_info(self, info:
|
|
333
|
+
def set_conn_info(self, info: ErtServerConnectionInfo | Exception | None) -> None:
|
|
317
334
|
if self._conn_info is not None:
|
|
318
335
|
raise ValueError("Connection information already set")
|
|
319
336
|
if info is None:
|