ert-17.1.9-py3-none-any.whl → ert-18.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _ert/events.py +19 -2
- ert/__main__.py +8 -7
- ert/analysis/_update_commons.py +12 -3
- ert/cli/main.py +6 -3
- ert/cli/monitor.py +7 -0
- ert/config/__init__.py +13 -3
- ert/config/_create_observation_dataframes.py +60 -12
- ert/config/_observations.py +14 -1
- ert/config/_read_summary.py +8 -6
- ert/config/ensemble_config.py +6 -14
- ert/config/ert_config.py +19 -13
- ert/config/{everest_objective_config.py → everest_response.py} +23 -12
- ert/config/ext_param_config.py +133 -1
- ert/config/field.py +12 -8
- ert/config/forward_model_step.py +108 -6
- ert/config/gen_data_config.py +2 -6
- ert/config/gen_kw_config.py +0 -9
- ert/config/known_response_types.py +14 -0
- ert/config/parameter_config.py +0 -17
- ert/config/parsing/config_keywords.py +1 -0
- ert/config/parsing/config_schema.py +12 -0
- ert/config/parsing/config_schema_deprecations.py +11 -0
- ert/config/parsing/config_schema_item.py +1 -1
- ert/config/queue_config.py +4 -4
- ert/config/response_config.py +0 -7
- ert/config/rft_config.py +230 -0
- ert/config/summary_config.py +2 -6
- ert/config/violations.py +0 -0
- ert/config/workflow_fixtures.py +2 -1
- ert/dark_storage/client/__init__.py +2 -2
- ert/dark_storage/client/_session.py +4 -4
- ert/dark_storage/client/client.py +2 -2
- ert/dark_storage/compute/misfits.py +7 -6
- ert/dark_storage/endpoints/compute/misfits.py +2 -2
- ert/dark_storage/endpoints/observations.py +4 -4
- ert/dark_storage/endpoints/responses.py +15 -1
- ert/ensemble_evaluator/__init__.py +8 -1
- ert/ensemble_evaluator/evaluator.py +81 -29
- ert/ensemble_evaluator/event.py +6 -0
- ert/ensemble_evaluator/snapshot.py +3 -1
- ert/ensemble_evaluator/state.py +1 -0
- ert/field_utils/__init__.py +8 -0
- ert/field_utils/field_utils.py +211 -1
- ert/gui/ertwidgets/__init__.py +23 -16
- ert/gui/ertwidgets/analysismoduleedit.py +2 -2
- ert/gui/ertwidgets/checklist.py +1 -1
- ert/gui/ertwidgets/create_experiment_dialog.py +3 -1
- ert/gui/ertwidgets/ensembleselector.py +2 -2
- ert/gui/ertwidgets/models/__init__.py +2 -0
- ert/gui/ertwidgets/models/activerealizationsmodel.py +2 -1
- ert/gui/ertwidgets/models/path_model.py +1 -1
- ert/gui/ertwidgets/models/targetensemblemodel.py +2 -1
- ert/gui/ertwidgets/models/text_model.py +1 -1
- ert/gui/ertwidgets/searchbox.py +13 -4
- ert/gui/{suggestor → ertwidgets/suggestor}/_suggestor_message.py +13 -4
- ert/gui/main.py +11 -6
- ert/gui/main_window.py +1 -2
- ert/gui/simulation/ensemble_experiment_panel.py +1 -1
- ert/gui/simulation/ensemble_information_filter_panel.py +1 -1
- ert/gui/simulation/ensemble_smoother_panel.py +1 -1
- ert/gui/simulation/evaluate_ensemble_panel.py +1 -1
- ert/gui/simulation/experiment_panel.py +1 -1
- ert/gui/simulation/manual_update_panel.py +31 -8
- ert/gui/simulation/multiple_data_assimilation_panel.py +12 -8
- ert/gui/simulation/run_dialog.py +25 -4
- ert/gui/simulation/single_test_run_panel.py +2 -2
- ert/gui/summarypanel.py +1 -1
- ert/gui/tools/load_results/load_results_panel.py +1 -1
- ert/gui/tools/manage_experiments/storage_info_widget.py +7 -7
- ert/gui/tools/manage_experiments/storage_widget.py +1 -2
- ert/gui/tools/plot/plot_api.py +13 -10
- ert/gui/tools/plot/plot_window.py +12 -0
- ert/gui/tools/plot/plottery/plot_config.py +2 -0
- ert/gui/tools/plot/plottery/plot_context.py +14 -0
- ert/gui/tools/plot/plottery/plots/ensemble.py +9 -2
- ert/gui/tools/plot/plottery/plots/statistics.py +59 -19
- ert/mode_definitions.py +2 -0
- ert/plugins/__init__.py +0 -1
- ert/plugins/hook_implementations/workflows/gen_data_rft_export.py +10 -2
- ert/plugins/hook_specifications/__init__.py +0 -2
- ert/plugins/hook_specifications/jobs.py +0 -9
- ert/plugins/plugin_manager.py +2 -33
- ert/resources/shell_scripts/delete_directory.py +2 -2
- ert/run_models/__init__.py +18 -5
- ert/run_models/_create_run_path.py +33 -21
- ert/run_models/ensemble_experiment.py +10 -4
- ert/run_models/ensemble_information_filter.py +8 -1
- ert/run_models/ensemble_smoother.py +9 -3
- ert/run_models/evaluate_ensemble.py +8 -6
- ert/run_models/event.py +7 -3
- ert/run_models/everest_run_model.py +155 -44
- ert/run_models/initial_ensemble_run_model.py +23 -22
- ert/run_models/manual_update.py +4 -2
- ert/run_models/manual_update_enif.py +37 -0
- ert/run_models/model_factory.py +81 -22
- ert/run_models/multiple_data_assimilation.py +21 -10
- ert/run_models/run_model.py +54 -34
- ert/run_models/single_test_run.py +7 -4
- ert/run_models/update_run_model.py +4 -2
- ert/runpaths.py +5 -6
- ert/sample_prior.py +9 -4
- ert/scheduler/driver.py +37 -0
- ert/scheduler/event.py +3 -1
- ert/scheduler/job.py +23 -13
- ert/scheduler/lsf_driver.py +6 -2
- ert/scheduler/openpbs_driver.py +7 -1
- ert/scheduler/scheduler.py +5 -0
- ert/scheduler/slurm_driver.py +6 -2
- ert/services/__init__.py +2 -2
- ert/services/_base_service.py +31 -15
- ert/services/ert_server.py +317 -0
- ert/shared/_doc_utils/ert_jobs.py +1 -4
- ert/shared/storage/connection.py +3 -3
- ert/shared/version.py +3 -3
- ert/storage/local_ensemble.py +25 -5
- ert/storage/local_experiment.py +6 -14
- ert/storage/local_storage.py +35 -30
- ert/storage/migration/to18.py +12 -0
- ert/storage/migration/to8.py +4 -4
- ert/substitutions.py +12 -28
- ert/validation/active_range.py +7 -7
- ert/validation/rangestring.py +16 -16
- {ert-17.1.9.dist-info → ert-18.0.0.dist-info}/METADATA +8 -7
- {ert-17.1.9.dist-info → ert-18.0.0.dist-info}/RECORD +160 -159
- everest/api/everest_data_api.py +1 -14
- everest/bin/config_branch_script.py +3 -6
- everest/bin/everconfigdump_script.py +1 -9
- everest/bin/everest_script.py +21 -11
- everest/bin/kill_script.py +2 -2
- everest/bin/monitor_script.py +2 -2
- everest/bin/utils.py +6 -3
- everest/config/__init__.py +4 -1
- everest/config/control_config.py +61 -2
- everest/config/control_variable_config.py +2 -1
- everest/config/everest_config.py +38 -16
- everest/config/forward_model_config.py +5 -3
- everest/config/install_data_config.py +7 -5
- everest/config/install_job_config.py +7 -3
- everest/config/install_template_config.py +3 -3
- everest/config/optimization_config.py +19 -6
- everest/config/output_constraint_config.py +8 -2
- everest/config/server_config.py +6 -49
- everest/config/utils.py +25 -105
- everest/config/validation_utils.py +10 -10
- everest/config_file_loader.py +13 -2
- everest/detached/everserver.py +7 -8
- everest/everest_storage.py +6 -10
- everest/gui/everest_client.py +0 -1
- everest/gui/main_window.py +2 -2
- everest/optimizer/everest2ropt.py +59 -32
- everest/optimizer/opt_model_transforms.py +12 -13
- everest/optimizer/utils.py +0 -29
- everest/strings.py +0 -5
- ert/config/everest_constraints_config.py +0 -95
- ert/services/storage_service.py +0 -127
- everest/config/sampler_config.py +0 -103
- everest/simulator/__init__.py +0 -88
- everest/simulator/everest_to_ert.py +0 -51
- /ert/gui/{suggestor → ertwidgets/suggestor}/__init__.py +0 -0
- /ert/gui/{suggestor → ertwidgets/suggestor}/_colors.py +0 -0
- /ert/gui/{suggestor → ertwidgets/suggestor}/suggestor.py +0 -0
- {ert-17.1.9.dist-info → ert-18.0.0.dist-info}/WHEEL +0 -0
- {ert-17.1.9.dist-info → ert-18.0.0.dist-info}/entry_points.txt +0 -0
- {ert-17.1.9.dist-info → ert-18.0.0.dist-info}/licenses/COPYING +0 -0
- {ert-17.1.9.dist-info → ert-18.0.0.dist-info}/top_level.txt +0 -0
ert/sample_prior.py
CHANGED
@@ -33,6 +33,7 @@ def sample_prior(
     parameter_configs = ensemble.experiment.parameter_configuration
     if parameters is None:
         parameters = list(parameter_configs.keys())
+    complete_dataset: pl.DataFrame | None = None
     for parameter in parameters:
         config_node = parameter_configs[parameter]
         if config_node.forward_init:
@@ -75,13 +76,17 @@ def sample_prior(
         if datasets:
             dataset = pl.concat(datasets, how="vertical")

-            if
-
-
-            )
+            if complete_dataset is None:
+                complete_dataset = dataset
+            elif dataset is not None:
+                complete_dataset = complete_dataset.join(dataset, on="realization")
         else:
             for realization_nr in active_realizations:
                 ds = config_node.read_from_runpath(Path(), realization_nr, 0)
                 ensemble.save_parameters(ds, parameter, realization_nr)

+    if complete_dataset is not None:
+        ensemble.save_parameters(
+            dataset=complete_dataset,
+        )
     ensemble.refresh_ensemble_state()
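Note on the change above: instead of writing each parameter group's samples separately, sample_prior now accumulates the per-parameter dataframes, joins them on the realization column, and saves the combined frame once at the end. A minimal sketch of that join pattern, using made-up parameter columns rather than ert's real dataframes:

```python
# Illustrative only: two per-parameter frames sharing a "realization" column
# are folded into one wide frame, mirroring the accumulation loop above.
import polars as pl

complete_dataset: pl.DataFrame | None = None
for dataset in [
    pl.DataFrame({"realization": [0, 1], "PORO": [0.10, 0.25]}),
    pl.DataFrame({"realization": [0, 1], "PERMX": [120.0, 310.0]}),
]:
    if complete_dataset is None:
        complete_dataset = dataset
    else:
        complete_dataset = complete_dataset.join(dataset, on="realization")

print(complete_dataset)  # one row per realization, one column per parameter
```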
ert/scheduler/driver.py
CHANGED
@@ -3,10 +3,13 @@ from __future__ import annotations
 import asyncio
 import logging
 import shlex
+import time
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
 from pathlib import Path

+from _ert.events import EnsembleEvaluationWarning
+
 from .event import DriverEvent

 SIGNAL_OFFSET = 128
@@ -34,12 +37,19 @@ class FailedSubmit(RuntimeError):
 class Driver(ABC):
     """Adapter for the HPC cluster."""

+    POLLING_TIMEOUT_PERIOD = 600
+
     def __init__(self, activate_script: str = "") -> None:
         self._event_queue: asyncio.Queue[DriverEvent] | None = None
         self._job_error_message_by_iens: dict[int, str] = {}
         self.activate_script = activate_script
         self._poll_period = _POLL_PERIOD

+        self._polling_timeout_period = Driver.POLLING_TIMEOUT_PERIOD
+        self._last_successful_poll = time.time()
+        self._last_polling_error_message: str | None = None
+        self._has_warned_evaluator_of_polling_error = False
+
     @property
     def event_queue(self) -> asyncio.Queue[DriverEvent]:
         if self._event_queue is None:
@@ -178,3 +188,30 @@ class Driver(ABC):
         )
         logger.error(error_message)
         return False, error_message
+
+    async def _warn_evaluator_if_polling_has_failed_for_some_time(self) -> None:
+        if (
+            (self._last_successful_poll < time.time() - self._polling_timeout_period)
+            and self._last_polling_error_message
+            and not self._has_warned_evaluator_of_polling_error
+        ):
+            await self._warn_evaluator_about_polling_difficulties()
+            self._has_warned_evaluator_of_polling_error = True
+
+    async def _warn_evaluator_about_polling_difficulties(self) -> None:
+        last_polling_error_message = self._last_polling_error_message
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            "Driver has not successfully polled statuses for "
+            f"{self._polling_timeout_period}s. The previous error "
+            f"was due to '{last_polling_error_message}'"
+        )
+        formatted_msg = (
+            "ert has not been able to update the job status for some time. This might "
+            "be resolved by itself, and it does not mean that the run has crashed.\n"
+            "Please check the runpath if it seems to still be running.\n"
+            f"The last error message was '{last_polling_error_message}'"
+        )
+        await self.event_queue.put(
+            EnsembleEvaluationWarning(warning_message=formatted_msg)
+        )
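The two new Driver methods implement a warn-once health check for cluster polling: concrete drivers record the time of each successful poll and the text of the last polling error, and the base class emits a single EnsembleEvaluationWarning once polling has been failing for longer than POLLING_TIMEOUT_PERIOD (600 s). A standalone sketch of that bookkeeping, with illustrative names rather than ert's own:

```python
# Warn-once bookkeeping sketch: warn only if an error has been seen, no poll
# has succeeded within the timeout period, and no warning was sent yet.
import time

class PollHealth:
    def __init__(self, timeout_period: float = 600.0) -> None:
        self.timeout_period = timeout_period
        self.last_successful_poll = time.time()
        self.last_error: str | None = None
        self.has_warned = False

    def record_success(self) -> None:
        self.last_successful_poll = time.time()

    def record_error(self, message: str) -> None:
        self.last_error = message

    def should_warn(self) -> bool:
        overdue = self.last_successful_poll < time.time() - self.timeout_period
        if overdue and self.last_error and not self.has_warned:
            self.has_warned = True
            return True
        return False
```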
ert/scheduler/event.py
CHANGED
@@ -2,6 +2,8 @@ from __future__ import annotations

 from dataclasses import dataclass

+from _ert.events import EnsembleEvaluationWarning
+

 @dataclass
 class StartedEvent:
@@ -16,4 +18,4 @@ class FinishedEvent:
     exec_hosts: str = "-"


-DriverEvent = StartedEvent | FinishedEvent
+DriverEvent = StartedEvent | FinishedEvent | EnsembleEvaluationWarning
ert/scheduler/job.py
CHANGED
@@ -12,6 +12,7 @@ from enum import StrEnum
 from pathlib import Path
 from typing import TYPE_CHECKING, assert_never

+import anyio
 from lxml import etree
 from opentelemetry.trace import Status, StatusCode

@@ -31,7 +32,7 @@ from ert.storage import (
     RealizationStorageState,
     load_realization_parameters_and_responses,
 )
-from ert.trace import trace
+from ert.trace import trace
 from ert.warnings import PostSimulationWarning

 from .driver import Driver, FailedSubmit
@@ -238,7 +239,6 @@ class Job:
             f"{method_name} spent {elapsed_time} seconds waiting for files"
         )

-    @tracer.start_as_current_span(f"{__name__}.run")
     async def run(
         self,
         sem: asyncio.BoundedSemaphore,
@@ -335,8 +335,12 @@

    valid_checksums = [info for info in checksum.values() if "error" not in info]

-
-
+    async def all_paths_exist(paths: list[Path]) -> bool:
+        return all(
+            await asyncio.gather(*[anyio.Path(path).exists() for path in paths])
+        )
+
+    while not await all_paths_exist([info["path"] for info in valid_checksums]):
        if timeout <= 0:
            break
        timeout -= DISK_SYNCHRONIZATION_POLLING_INTERVAL
@@ -344,17 +348,19 @@
        await asyncio.sleep(DISK_SYNCHRONIZATION_POLLING_INTERVAL)
    async with checksum_lock:
        for info in valid_checksums:
-            file_path = Path(info["path"])
+            file_path = anyio.Path(info["path"])
            expected_md5sum = info.get("md5sum")
-
-
+            file_path_exists = await file_path.exists()
+            if file_path_exists and expected_md5sum:
+                file_bytes = await file_path.read_bytes()
+                actual_md5sum = hashlib.md5(file_bytes).hexdigest()
                if expected_md5sum == actual_md5sum:
                    logger.debug(f"File {file_path} checksum successful.")
                else:
                    logger.warning(
                        f"File {file_path} checksum verification failed."
                    )
-            elif
+            elif file_path_exists and expected_md5sum is None:
                logger.warning(f"Checksum not received for file {file_path}")
            else:
                logger.error(f"Disk synchronization failed for {file_path}")
@@ -506,11 +512,12 @@ async def log_warnings_from_forward_model(
            or "- ERROR - " in line
        )

-    async def log_warnings_from_file(
+    async def log_warnings_from_file(
        file: Path, iens: int, step: ForwardModelStep, step_idx: int, filetype: str
    ) -> None:
        captured: list[str] = []
-
+        file_text = await anyio.Path(file).read_text(encoding="utf-8")
+        for line in file_text.splitlines():
            if line_contains_warning(line):
                captured.append(line[:max_length])

@@ -527,9 +534,12 @@ async def log_warnings_from_forward_model(
            return 0
        remaining_timeout = _timeout
        for _ in range(_timeout):
-
-
-
+            file_path_exists = await anyio.Path(file_path).exists()
+            if file_path_exists:
+                st_mtime = (await anyio.Path(file_path).stat()).st_mtime
+            else:
+                st_mtime = 0
+            if not (file_path_exists and st_mtime >= job_submission_time):
                remaining_timeout -= 1
                await asyncio.sleep(1)
            else:
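The job.py changes replace blocking pathlib calls with anyio.Path, so existence checks, checksum reads, and mtime lookups await file I/O instead of stalling the event loop. A small sketch of the same pattern (file name and checksum here are hypothetical):

```python
# Non-blocking existence + md5 verification with anyio.Path; mirrors the
# checksum loop above in simplified form.
import asyncio
import hashlib

import anyio

async def verify_file(path: str, expected_md5: str | None) -> bool:
    file_path = anyio.Path(path)
    if not await file_path.exists():
        return False
    if expected_md5 is None:
        return True  # nothing to compare against
    data = await file_path.read_bytes()
    return hashlib.md5(data).hexdigest() == expected_md5

if __name__ == "__main__":
    print(asyncio.run(verify_file("does_not_exist.txt", None)))  # False
```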
ert/scheduler/lsf_driver.py
CHANGED
@@ -444,6 +444,7 @@ class LsfDriver(Driver):

     async def poll(self) -> None:
         while True:
+            await self._warn_evaluator_if_polling_has_failed_for_some_time()
             if not self._jobs.keys():
                 await asyncio.sleep(self._poll_period)
                 continue
@@ -461,6 +462,7 @@
                 )
             except OSError as e:
                 logger.error(str(e))
+                self._last_polling_error_message = str(e)
                 await asyncio.sleep(self._poll_period)
                 continue

@@ -468,10 +470,11 @@
             if process.returncode:
                 # bjobs may give nonzero return code even when it is providing
                 # at least some correct information
+                error_msg = stderr.decode()
                 logger.warning(
-                    f"bjobs gave returncode {process.returncode} "
-                    f"and error {stderr.decode()}"
+                    f"bjobs gave returncode {process.returncode} and error {error_msg}"
                 )
+                self._last_polling_error_message = error_msg
             bjobs_states = _parse_jobs_dict(parse_bjobs(stdout.decode(errors="ignore")))
             self.update_and_log_exec_hosts(
                 parse_bjobs_exec_hosts(stdout.decode(errors="ignore"))
@@ -503,6 +506,7 @@
                     "bhist did not give status for job_ids "
                     f"{missing_in_bhist_and_bjobs}, giving up for now."
                 )
+            self._last_successful_poll = time.time()
             await asyncio.sleep(self._poll_period)

     async def _process_job_update(self, job_id: str, new_state: AnyJob) -> None:
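The LSF, OpenPBS, and Slurm drivers all wire the new bookkeeping into their poll loops the same way: check whether a warning is due at the top of each iteration, store the error text whenever the status command fails, and stamp the time after a successful pass. A hypothetical miniature driver showing where those three hooks sit (the real drivers shell out to bjobs/qstat/squeue instead of _query_cluster):

```python
# Hypothetical poll-loop skeleton; _query_cluster stands in for bjobs/qstat/squeue.
import asyncio
import time

class SketchDriver:
    def __init__(self) -> None:
        self._last_successful_poll = time.time()
        self._last_polling_error_message: str | None = None

    async def _warn_if_polling_unhealthy(self) -> None:
        pass  # see the Driver bookkeeping sketch above

    async def _query_cluster(self) -> None:
        pass  # real drivers run the cluster status command here

    async def poll_once(self) -> None:
        await self._warn_if_polling_unhealthy()
        try:
            await self._query_cluster()
        except OSError as err:
            self._last_polling_error_message = str(err)  # remember why it failed
            return
        self._last_successful_poll = time.time()  # mark this pass as healthy

asyncio.run(SketchDriver().poll_once())
```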
ert/scheduler/openpbs_driver.py
CHANGED
@@ -5,6 +5,7 @@ import json
 import logging
 import shlex
 import shutil
+import time
 from collections.abc import Iterable, Mapping, MutableMapping
 from dataclasses import dataclass
 from pathlib import Path
@@ -264,6 +265,7 @@ class OpenPBSDriver(Driver):

     async def poll(self) -> None:
         while True:
+            await self._warn_evaluator_if_polling_has_failed_for_some_time()
             if not self._jobs:
                 await asyncio.sleep(self._poll_period)
                 continue
@@ -280,6 +282,7 @@
                 )
             except OSError as e:
                 logger.error(str(e))
+                self._last_polling_error_message = str(e)
                 await asyncio.sleep(self._poll_period)
                 continue
             stdout, stderr = await process.communicate()
@@ -289,10 +292,12 @@
                 await asyncio.sleep(self._poll_period)
                 continue
             if process.returncode == QSTAT_UNKNOWN_JOB_ID:
+                error_msg = stderr.decode(errors="ignore")
                 logger.debug(
                     f"qstat gave returncode {QSTAT_UNKNOWN_JOB_ID} "
-                    f"with message {
+                    f"with message {error_msg}"
                 )
+                self._last_polling_error_message = error_msg
             parsed_jobs = _parse_jobs_dict(
                 parse_qstat(stdout.decode(errors="ignore"))
             )
@@ -330,6 +335,7 @@
             for job_id, job in parsed_jobs_dict.items():
                 await self._process_job_update(job_id, job)

+            self._last_successful_poll = time.time()
             await asyncio.sleep(self._poll_period)

     async def _process_job_update(self, job_id: str, new_state: AnyJob) -> None:
ert/scheduler/scheduler.py
CHANGED
@@ -15,6 +15,7 @@ import orjson
 from pydantic.dataclasses import dataclass

 from _ert.events import (
+    EnsembleEvaluationWarning,
     ForwardModelStepChecksum,
     RealizationEvent,
     RealizationFailed,
@@ -343,6 +344,10 @@ class Scheduler:
     async def _process_event_queue(self) -> None:
         while True:
             event = await self.driver.event_queue.get()
+            if isinstance(event, EnsembleEvaluationWarning):
+                if self._ensemble_evaluator_queue:
+                    await self._ensemble_evaluator_queue.put(event)
+                continue
             job = self._jobs[event.iens]

             # Any event implies the job has at least started
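In the scheduler, warnings arriving on the driver event queue carry no realization index, so they are forwarded to the ensemble evaluator queue and skipped by the per-job handling. A reduced sketch of that isinstance-based routing over the widened DriverEvent union (simplified event classes, not ert's):

```python
# Routing sketch: warning events are forwarded to a separate queue; everything
# else is treated as a per-realization job event.
import asyncio
from dataclasses import dataclass

@dataclass
class StartedEvent:
    iens: int

@dataclass
class EvaluationWarning:
    warning_message: str

DriverEvent = StartedEvent | EvaluationWarning

async def process_one(driver_queue: asyncio.Queue, evaluator_queue: asyncio.Queue) -> None:
    event = await driver_queue.get()
    if isinstance(event, EvaluationWarning):
        await evaluator_queue.put(event)  # forwarded, not handled as a job event
        return
    print(f"job event for realization {event.iens}")

async def main() -> None:
    driver_queue: asyncio.Queue = asyncio.Queue()
    evaluator_queue: asyncio.Queue = asyncio.Queue()
    await driver_queue.put(EvaluationWarning("cluster polling degraded"))
    await process_one(driver_queue, evaluator_queue)
    print(evaluator_queue.qsize())  # 1

asyncio.run(main())
```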
ert/scheduler/slurm_driver.py
CHANGED
@@ -262,6 +262,7 @@ class SlurmDriver(Driver):

     async def poll(self) -> None:
         while True:
+            await self._warn_evaluator_if_polling_has_failed_for_some_time()
             if not self._jobs.keys():
                 await asyncio.sleep(self._poll_period)
                 continue
@@ -277,14 +278,16 @@
                 )
             except OSError as e:
                 logger.error(str(e))
+                self._last_polling_error_message = str(e)
                 await asyncio.sleep(self._poll_period)
                 continue
             stdout, stderr = await process.communicate()
             if process.returncode:
+                error_msg = stderr.decode()
                 logger.warning(
-                    f"squeue gave returncode {process.returncode} "
-                    f"and error {stderr.decode()}"
+                    f"squeue gave returncode {process.returncode} and error {error_msg}"
                 )
+                self._last_polling_error_message = error_msg
             squeue_states = dict(_parse_squeue_output(stdout.decode(errors="ignore")))

             job_ids_found_in_squeue_output = set(squeue_states.keys())
@@ -317,6 +320,7 @@
                     "scontrol did not give status for job_ids "
                     f"{missing_in_squeue_and_scontrol}, giving up for now."
                 )
+            self._last_successful_poll = time.time()
             await asyncio.sleep(self._poll_period)

     async def _process_job_update(self, job_id: str, new_info: JobInfo) -> None:
ert/services/__init__.py
CHANGED
ert/services/_base_service.py
CHANGED
@@ -1,3 +1,8 @@
+"""
+This file contains a more generic version of "ert services", and
+is scheduled for removal when WebvizErt is removed.
+"""
+
 from __future__ import annotations

 import contextlib
@@ -15,13 +20,21 @@ from subprocess import Popen, TimeoutExpired
 from tempfile import NamedTemporaryFile
 from time import sleep
 from types import FrameType
-from typing import TYPE_CHECKING, Any, Generic, Self, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Self, TypedDict, TypeVar

 if TYPE_CHECKING:
     from inspect import Traceback

 T = TypeVar("T", bound="BaseService")
-
+
+
+class ErtServerConnectionInfo(TypedDict):
+    urls: list[str]
+    authtoken: str
+    host: str
+    port: str
+    cert: str
+    auth: str


 SERVICE_CONF_PATHS: set[str] = set()
@@ -88,7 +101,9 @@ class _Proc(threading.Thread):
         service_name: str,
         exec_args: Sequence[str],
         timeout: int,
-
+        on_connection_info_received: Callable[
+            [ErtServerConnectionInfo | Exception | None], None
+        ],
         project: Path,
     ) -> None:
         super().__init__()
@@ -98,7 +113,7 @@ class _Proc(threading.Thread):
         self._service_name = service_name
         self._exec_args = exec_args
         self._timeout = timeout
-        self.
+        self._propagate_connection_info_from_childproc = on_connection_info_received
         self._service_config_path = project / f"{self._service_name}_server.json"

         fd_read, fd_write = os.pipe()
@@ -119,13 +134,13 @@ class _Proc(threading.Thread):
         os.close(fd_write)

     def run(self) -> None:
-        comm = self.
+        comm = self._read_connection_info_from_process(self._childproc)

         if comm is None:
-            self.
+            self._propagate_connection_info_from_childproc(TimeoutError())
             return  # _read_conn_info() has already cleaned up in this case

-        conn_info:
+        conn_info: ErtServerConnectionInfo | Exception | None = None
         try:
             conn_info = json.loads(comm)
         except json.JSONDecodeError:
@@ -134,7 +149,7 @@ class _Proc(threading.Thread):
             conn_info = exc

         try:
-            self.
+            self._propagate_connection_info_from_childproc(conn_info)

             while True:
                 if self._childproc.poll() is not None:
@@ -148,15 +163,16 @@ class _Proc(threading.Thread):
             self.logger.exception(e)

         finally:
-            self.
+            self._ensure_connection_info_file_is_deleted()

     def shutdown(self) -> int:
         """Shutdown the server."""
         self._shutdown.set()
         self.join()
+
         return self._childproc.returncode

-    def
+    def _read_connection_info_from_process(self, proc: Popen[bytes]) -> str | None:
         comm_buf = io.StringIO()
         first_iter = True
         while first_iter or proc.poll() is None:
@@ -166,7 +182,7 @@ class _Proc(threading.Thread):
             # Timeout reached, exit with a failure
             if ready == ([], [], []):
                 self._do_shutdown()
-                self.
+                self._ensure_connection_info_file_is_deleted()
                 return None

             x = self._comm_pipe.read(PIPE_BUF)
@@ -190,7 +206,7 @@ class _Proc(threading.Thread):
             f"waiting for child-process exceeded timeout {self._timeout}s"
         )

-    def
+    def _ensure_connection_info_file_is_deleted(self) -> None:
         """
         Ensure that the JSON connection information file is deleted
         """
@@ -241,14 +257,14 @@ class BaseService:
         self,
         exec_args: Sequence[str] = (),
         timeout: int = 120,
-        conn_info:
+        conn_info: ErtServerConnectionInfo | Exception | None = None,
         project: str | None = None,
     ) -> None:
         self._exec_args = exec_args
         self._timeout = timeout

         self._proc: _Proc | None = None
-        self._conn_info:
+        self._conn_info: ErtServerConnectionInfo | Exception | None = conn_info
         self._conn_info_event = threading.Event()
         self._project = Path(project) if project is not None else Path.cwd()

@@ -313,7 +329,7 @@ class BaseService:
         if self._proc is not None:
             self._proc.join()

-    def set_conn_info(self, info:
+    def set_conn_info(self, info: ErtServerConnectionInfo | Exception | None) -> None:
         if self._conn_info is not None:
             raise ValueError("Connection information already set")
         if info is None:
if info is None:
|