mlrun 1.7.0rc34__py3-none-any.whl → 1.7.0rc36__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/artifacts/base.py +1 -0
- mlrun/common/schemas/__init__.py +0 -1
- mlrun/common/schemas/api_gateway.py +1 -1
- mlrun/common/schemas/model_monitoring/__init__.py +1 -2
- mlrun/common/schemas/model_monitoring/constants.py +3 -16
- mlrun/common/schemas/notification.py +1 -1
- mlrun/common/types.py +1 -0
- mlrun/config.py +7 -7
- mlrun/datastore/sources.py +8 -4
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/db/base.py +2 -3
- mlrun/db/httpdb.py +3 -3
- mlrun/feature_store/api.py +19 -1
- mlrun/feature_store/steps.py +8 -0
- mlrun/model.py +1 -1
- mlrun/model_monitoring/api.py +23 -6
- mlrun/model_monitoring/applications/_application_steps.py +4 -0
- mlrun/model_monitoring/applications/base.py +8 -0
- mlrun/model_monitoring/applications/evidently_base.py +27 -27
- mlrun/model_monitoring/controller.py +5 -1
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +5 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +2 -2
- mlrun/model_monitoring/db/tsdb/base.py +6 -3
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +22 -3
- mlrun/model_monitoring/stream_processing.py +25 -153
- mlrun/projects/pipelines.py +76 -73
- mlrun/run.py +4 -0
- mlrun/runtimes/nuclio/api_gateway.py +1 -1
- mlrun/runtimes/nuclio/application/application.py +25 -2
- mlrun/runtimes/nuclio/function.py +5 -0
- mlrun/runtimes/nuclio/serving.py +1 -1
- mlrun/runtimes/pod.py +2 -4
- mlrun/runtimes/utils.py +18 -0
- mlrun/serving/states.py +10 -3
- mlrun/serving/v2_serving.py +5 -2
- mlrun/utils/db.py +15 -0
- mlrun/utils/helpers.py +27 -14
- mlrun/utils/http.py +1 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc36.dist-info}/METADATA +3 -1
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc36.dist-info}/RECORD +46 -47
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc36.dist-info}/WHEEL +1 -1
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc36.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc36.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc36.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py
CHANGED

@@ -11,7 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+
+from typing import Any
 
 import mlrun.feature_store.steps
 from mlrun.common.schemas.model_monitoring import (
@@ -21,6 +22,24 @@ from mlrun.common.schemas.model_monitoring import (
 )
 
 
+def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
+    """
+    Normalize user defined keys - input data to a model and its predictions,
+    to a form V3IO frames tolerates.
+
+    The dictionary keys should conform to '^[a-zA-Z_:]([a-zA-Z0-9_:])*$'.
+    """
+    prefix = "_"
+
+    def norm_key(key: str) -> str:
+        key = key.replace("-", "_")  # hyphens `-` are not allowed
+        if key and key[0].isdigit():  # starting with a digit is not allowed
+            return prefix + key
+        return key
+
+    return {norm_key(k): v for k, v in event.items()}
+
+
 class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
     def __init__(self, **kwargs):
         """
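To see what the new helper does in isolation, here is a minimal standalone sketch (the helper body is copied from the hunk above; the sample event keys are invented):

    from typing import Any

    def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
        # Keys must conform to '^[a-zA-Z_:]([a-zA-Z0-9_:])*$' to be accepted
        # by V3IO frames.
        prefix = "_"

        def norm_key(key: str) -> str:
            key = key.replace("-", "_")  # hyphens are not allowed
            if key and key[0].isdigit():  # a leading digit is not allowed
                return prefix + key
            return key

        return {norm_key(k): v for k, v in event.items()}

    # Invented feature keys as they might arrive with a model event:
    print(_normalize_dict_for_v3io_frames({"f1": 0.2, "petal-width": 1.3, "3rd": 7}))
    # -> {'f1': 0.2, 'petal_width': 1.3, '_3rd': 7}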
@@ -68,8 +87,8 @@ class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
         # endpoint_features includes the event values of each feature and prediction
         endpoint_features = {
             EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
-            **event[EventFieldType.NAMED_PREDICTIONS],
-            **event[EventFieldType.NAMED_FEATURES],
+            **_normalize_dict_for_v3io_frames(event[EventFieldType.NAMED_PREDICTIONS]),
+            **_normalize_dict_for_v3io_frames(event[EventFieldType.NAMED_FEATURES]),
             **base_event,
         }
         # Create a dictionary that includes both base_metrics and endpoint_features
mlrun/model_monitoring/stream_processing.py
CHANGED

@@ -27,7 +27,6 @@ import mlrun.datastore.targets
 import mlrun.feature_store as fstore
 import mlrun.feature_store.steps
 import mlrun.model_monitoring.db
-import mlrun.model_monitoring.prometheus
 import mlrun.serving.states
 import mlrun.utils
 from mlrun.common.schemas.model_monitoring.constants import (
@@ -37,7 +36,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
     FileTargetKind,
     ModelEndpointTarget,
     ProjectSecretKeys,
-    PrometheusEndpoints,
 )
 from mlrun.utils import logger
 
@@ -172,39 +170,12 @@ class EventStreamProcessor:
             fn.set_topology(mlrun.serving.states.StepKinds.flow),
         )
 
-        # Event routing based on the provided path
-        def apply_event_routing():
-            typing.cast(
-                mlrun.serving.TaskStep,
-                graph.add_step(
-                    "EventRouting",
-                    full_event=True,
-                    project=self.project,
-                ),
-            ).respond()
-
-        apply_event_routing()
-
-        # Filter out events with '-' in the path basename from going forward
-        # through the next steps of the stream graph
-        def apply_storey_filter_stream_events():
-            # Filter events with Prometheus endpoints path
-            graph.add_step(
-                "storey.Filter",
-                "filter_stream_event",
-                _fn=f"(event.path not in {PrometheusEndpoints.list()})",
-                full_event=True,
-            )
-
-        apply_storey_filter_stream_events()
-
         # Process endpoint event: splitting into sub-events and validate event data
         def apply_process_endpoint_event():
             graph.add_step(
                 "ProcessEndpointEvent",
                 full_event=True,
                 project=self.project,
-                after="filter_stream_event",
             )
 
         apply_process_endpoint_event()
@@ -324,33 +295,10 @@ class EventStreamProcessor:
 
         apply_storey_sample_window()
 
-
-
-
-
-            )
-            tsdb_connector.apply_monitoring_stream_steps(graph=graph)
-
-        else:
-            # Prometheus
-            # Increase the prediction counter by 1 and update the latency value
-            graph.add_step(
-                "IncCounter",
-                name="IncCounter",
-                after="MapFeatureNames",
-                project=self.project,
-            )
-
-            # Record a sample of features and labels
-            def apply_record_features_to_prometheus():
-                graph.add_step(
-                    "RecordFeatures",
-                    name="RecordFeaturesToPrometheus",
-                    after="sample",
-                    project=self.project,
-                )
-
-            apply_record_features_to_prometheus()
+        tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
+            project=self.project, secret_provider=secret_provider
+        )
+        tsdb_connector.apply_monitoring_stream_steps(graph=graph)
 
         # Parquet branch
         # Filter and validate different keys before writing the data to Parquet target
@@ -542,11 +490,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         error = event.get("error")
         if error:
             self.error_count[endpoint_id] += 1
-
-                project=self.project,
-                endpoint_id=event["endpoint_id"],
-                model_name=event["model"],
-            )
+            # TODO: write to tsdb / kv once in a while
             raise mlrun.errors.MLRunInvalidArgumentError(str(error))
 
         # Validate event fields
@@ -613,6 +557,26 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
 
         # Separate each model invocation into sub events that will be stored as dictionary
         # in list of events. This list will be used as the body for the storey event.
+        if not isinstance(features, list):
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Model's inputs must be a list"
+            )
+        features = (
+            features
+            if not any(not isinstance(feat, list) for feat in features)
+            else [features]
+        )
+        if not isinstance(predictions, list):
+            predictions = [[predictions]]
+        elif isinstance(predictions, list) and len(predictions) == len(features):
+            pass  # predictions are already in the right format
+        else:
+            predictions = (
+                predictions
+                if not any(not isinstance(pred, list) for pred in predictions)
+                else [predictions]
+            )
+
         events = []
         for i, (feature, prediction) in enumerate(zip(features, predictions)):
             if not isinstance(prediction, list):
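The added validation coerces both inputs and predictions into a list of lists, one inner list per model invocation. A standalone restatement of that logic, with ValueError standing in for mlrun.errors.MLRunInvalidArgumentError and invented sample values:

    def normalize_inputs(features, predictions):
        if not isinstance(features, list):
            raise ValueError("Model's inputs must be a list")
        # A flat feature vector becomes a single-invocation batch: [1, 2] -> [[1, 2]]
        if any(not isinstance(feat, list) for feat in features):
            features = [features]
        if not isinstance(predictions, list):
            # A scalar prediction becomes a one-row batch.
            predictions = [[predictions]]
        elif len(predictions) != len(features) and any(
            not isinstance(pred, list) for pred in predictions
        ):
            # A flat prediction vector that does not align row-for-row with
            # the features is treated as a single invocation.
            predictions = [predictions]
        return features, predictions

    # One invocation with two features and a scalar prediction:
    print(normalize_inputs([1.5, 2.5], 0.9))  # -> ([[1.5, 2.5]], [[0.9]])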
@@ -973,98 +937,6 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
         return event
 
 
-class EventRouting(mlrun.feature_store.steps.MapClass):
-    """
-    Router the event according to the configured path under event.path. Please note that this step returns the result
-    to the caller. At the moment there are several paths:
-
-    - /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
-    to scrape the results from the monitoring stream memory.
-
-    - /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
-    statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
-    metrics.
-
-    - /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
-
-    """
-
-    def __init__(
-        self,
-        project: str,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.project: str = project
-
-    def do(self, event):
-        if event.path == PrometheusEndpoints.MODEL_MONITORING_METRICS:
-            # Return a parsed Prometheus registry file
-            event.body = mlrun.model_monitoring.prometheus.get_registry()
-        elif event.path == PrometheusEndpoints.MONITORING_BATCH_METRICS:
-            # Update statistical metrics
-            for event_metric in event.body:
-                mlrun.model_monitoring.prometheus.write_drift_metrics(
-                    project=self.project,
-                    endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
-                    metric=event_metric[EventFieldType.METRIC],
-                    value=event_metric[EventFieldType.VALUE],
-                )
-        elif event.path == PrometheusEndpoints.MONITORING_DRIFT_STATUS:
-            # Update drift status
-            mlrun.model_monitoring.prometheus.write_drift_status(
-                project=self.project,
-                endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
-                drift_status=event.body[EventFieldType.DRIFT_STATUS],
-            )
-
-        return event
-
-
-class IncCounter(mlrun.feature_store.steps.MapClass):
-    """Increase prediction counter by 1 and update the total latency value"""
-
-    def __init__(self, project: str, **kwargs):
-        super().__init__(**kwargs)
-        self.project: str = project
-
-    def do(self, event):
-        # Compute prediction per second
-
-        mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
-            project=self.project,
-            endpoint_id=event[EventFieldType.ENDPOINT_ID],
-            latency=event[EventFieldType.LATENCY],
-            model_name=event[EventFieldType.MODEL],
-            endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
-        )
-
-        return event
-
-
-class RecordFeatures(mlrun.feature_store.steps.MapClass):
-    """Record a sample of features and labels in Prometheus registry"""
-
-    def __init__(self, project: str, **kwargs):
-        super().__init__(**kwargs)
-        self.project: str = project
-
-    def do(self, event):
-        # Generate a dictionary of features and predictions
-        features = {
-            **event[EventFieldType.NAMED_PREDICTIONS],
-            **event[EventFieldType.NAMED_FEATURES],
-        }
-
-        mlrun.model_monitoring.prometheus.write_income_features(
-            project=self.project,
-            endpoint_id=event[EventFieldType.ENDPOINT_ID],
-            features=features,
-        )
-
-        return event
-
-
 def update_endpoint_record(
     project: str,
     endpoint_id: str,
mlrun/projects/pipelines.py
CHANGED

@@ -404,12 +404,15 @@ class _PipelineRunStatus:
         return self._exc
 
     def wait_for_completion(self, timeout=None, expected_statuses=None):
-
-            self
+        returned_state = self._engine.wait_for_completion(
+            self,
             project=self.project,
             timeout=timeout,
             expected_statuses=expected_statuses,
         )
+        # TODO: returning a state is optional until all runners implement wait_for_completion
+        if returned_state:
+            self._state = returned_state
         return self._state
 
     def __str__(self):
@@ -458,6 +461,48 @@ class _PipelineRunner(abc.ABC):
     def get_state(run_id, project=None):
         pass
 
+    @staticmethod
+    def get_run_status(
+        project,
+        run: _PipelineRunStatus,
+        timeout=None,
+        expected_statuses=None,
+        notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
+        **kwargs,
+    ):
+        timeout = timeout or 60 * 60
+        raise_error = None
+        state = ""
+        try:
+            if timeout:
+                state = run.wait_for_completion(
+                    timeout=timeout, expected_statuses=expected_statuses
+                )
+        except RuntimeError as exc:
+            # push runs table also when we have errors
+            raise_error = exc
+
+        mldb = mlrun.db.get_run_db(secrets=project._secrets)
+        runs = mldb.list_runs(project=project.name, labels=f"workflow={run.run_id}")
+
+        # TODO: The below section duplicates notifiers.push_pipeline_run_results() logic. We should use it instead.
+        errors_counter = 0
+        for r in runs:
+            if r["status"].get("state", "") == "error":
+                errors_counter += 1
+
+        text = _PipelineRunner._generate_workflow_finished_message(
+            run.run_id, errors_counter, run._state
+        )
+
+        notifiers = notifiers or project.notifiers
+        if notifiers:
+            notifiers.push(text, "info", runs)
+
+        if raise_error:
+            raise raise_error
+        return state or run._state, errors_counter, text
+
     @staticmethod
     def _get_handler(workflow_handler, workflow_spec, project, secrets):
         if not (workflow_handler and callable(workflow_handler)):
@@ -474,16 +519,13 @@ class _PipelineRunner(abc.ABC):
         return workflow_handler
 
     @staticmethod
-
-
-
-
-
-
-
-        **kwargs,
-    ):
-        pass
+    def _generate_workflow_finished_message(run_id, errors_counter, state):
+        text = f"Workflow {run_id} finished"
+        if errors_counter:
+            text += f" with {errors_counter} errors"
+        if state:
+            text += f", state={state}"
+        return text
 
 
 class _KFPRunner(_PipelineRunner):
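The message helper extracted in the hunk above can be sanity-checked in isolation (the run id, error count, and state below are invented):

    def _generate_workflow_finished_message(run_id, errors_counter, state):
        # Copied from the hunk above.
        text = f"Workflow {run_id} finished"
        if errors_counter:
            text += f" with {errors_counter} errors"
        if state:
            text += f", state={state}"
        return text

    print(_generate_workflow_finished_message("abc123", 2, "Failed"))
    # -> Workflow abc123 finished with 2 errors, state=Failed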
@@ -585,12 +627,14 @@ class _KFPRunner(_PipelineRunner):
         return _PipelineRunStatus(run_id, cls, project=project, workflow=workflow_spec)
 
     @staticmethod
-    def wait_for_completion(
-
-
+    def wait_for_completion(run, project=None, timeout=None, expected_statuses=None):
+        logger.info(
+            "Waiting for pipeline run completion", run_id=run.run_id, project=project
+        )
+        timeout = timeout or 60 * 60
         project_name = project.metadata.name if project else ""
         run_info = wait_for_pipeline_completion(
-            run_id,
+            run.run_id,
             timeout=timeout,
             expected_statuses=expected_statuses,
             project=project_name,
@@ -608,51 +652,6 @@ class _KFPRunner(_PipelineRunner):
             return resp["run"].get("status", "")
         return ""
 
-    @staticmethod
-    def get_run_status(
-        project,
-        run,
-        timeout=None,
-        expected_statuses=None,
-        notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
-        **kwargs,
-    ):
-        if timeout is None:
-            timeout = 60 * 60
-        state = ""
-        raise_error = None
-        try:
-            if timeout:
-                logger.info("Waiting for pipeline run completion")
-                state = run.wait_for_completion(
-                    timeout=timeout, expected_statuses=expected_statuses
-                )
-        except RuntimeError as exc:
-            # push runs table also when we have errors
-            raise_error = exc
-
-        mldb = mlrun.db.get_run_db(secrets=project._secrets)
-        runs = mldb.list_runs(project=project.name, labels=f"workflow={run.run_id}")
-
-        # TODO: The below section duplicates notifiers.push_pipeline_run_results() logic. We should use it instead.
-        had_errors = 0
-        for r in runs:
-            if r["status"].get("state", "") == "error":
-                had_errors += 1
-
-        text = f"Workflow {run.run_id} finished"
-        if had_errors:
-            text += f" with {had_errors} errors"
-        if state:
-            text += f", state={state}"
-
-        notifiers = notifiers or project.notifiers
-        notifiers.push(text, "info", runs)
-
-        if raise_error:
-            raise raise_error
-        return state, had_errors, text
-
 
 class _LocalRunner(_PipelineRunner):
     """local pipelines runner"""
@@ -732,18 +731,10 @@ class _LocalRunner(_PipelineRunner):
         return ""
 
     @staticmethod
-    def wait_for_completion(
-
-
-
-    def get_run_status(
-        project,
-        run,
-        timeout=None,
-        expected_statuses=None,
-        notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
-        **kwargs,
-    ):
+    def wait_for_completion(run, project=None, timeout=None, expected_statuses=None):
+        # TODO: local runner blocks for the duration of the pipeline.
+        # Therefore usually there will be nothing to wait for.
+        # However, users may run functions with watch=False and then it can be useful to wait for the runs here.
         pass
 
 
@@ -924,13 +915,25 @@ class _RemoteRunner(_PipelineRunner):
         elif inner_engine.engine == _LocalRunner.engine:
             mldb = mlrun.db.get_run_db(secrets=project._secrets)
             pipeline_runner_run = mldb.read_run(run.run_id, project=project.name)
+
             pipeline_runner_run = mlrun.run.RunObject.from_dict(pipeline_runner_run)
+
+            # here we are waiting for the pipeline run to complete and refreshing after that the pipeline run from the
+            # db
+            # TODO: do it with timeout
             pipeline_runner_run.logs(db=mldb)
             pipeline_runner_run.refresh()
             run._state = mlrun.common.runtimes.constants.RunStates.run_state_to_pipeline_run_status(
                 pipeline_runner_run.status.state
             )
             run._exc = pipeline_runner_run.status.error
+            return _LocalRunner.get_run_status(
+                project,
+                run,
+                timeout,
+                expected_statuses,
+                notifiers=notifiers,
+            )
 
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(
mlrun/run.py
CHANGED

@@ -791,6 +791,10 @@ def code_to_function(
         raise ValueError("code_output option is only used with notebooks")
 
     if is_nuclio:
+        mlrun.utils.helpers.validate_single_def_handler(
+            function_kind=sub_kind, code=code
+        )
+
         runtime = RuntimeKinds.resolve_nuclio_runtime(kind, sub_kind)
         # default_handler is only used in :mlrun sub kind, determine the handler to invoke in function.run()
         runtime.spec.default_handler = handler if sub_kind == "mlrun" else ""
mlrun/runtimes/nuclio/api_gateway.py
CHANGED

@@ -657,7 +657,7 @@ class APIGateway(ModelObj):
         host = self.spec.host
         if not self.spec.host.startswith("http"):
             host = f"https://{self.spec.host}"
-        return urljoin(host, self.spec.path)
+        return urljoin(host, self.spec.path).rstrip("/")
 
     @staticmethod
     def _generate_basic_auth(username: str, password: str):
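The added rstrip("/") keeps the gateway invocation URL canonical when the configured path carries a trailing slash; a quick standalone check with invented host and path values:

    from urllib.parse import urljoin

    host = "https://gateway.example.com"
    # urljoin preserves a trailing slash from the path, which would otherwise
    # yield "https://gateway.example.com/ping/".
    print(urljoin(host, "/ping/").rstrip("/"))
    # -> https://gateway.example.com/ping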
mlrun/runtimes/nuclio/application/application.py
CHANGED

@@ -27,7 +27,7 @@ from mlrun.runtimes.nuclio.api_gateway import (
     APIGatewaySpec,
 )
 from mlrun.runtimes.nuclio.function import NuclioSpec, NuclioStatus
-from mlrun.utils import logger
+from mlrun.utils import logger, update_in
 
 
 class ApplicationSpec(NuclioSpec):
@@ -293,7 +293,7 @@ class ApplicationRuntime(RemoteRuntime):
 
         :return: True if the function is ready (deployed)
         """
-        if self.requires_build() or force_build:
+        if (self.requires_build() and not self.spec.image) or force_build:
             self._fill_credentials()
             self._build_application_image(
                 builder_env=builder_env,
@@ -367,6 +367,12 @@ class ApplicationRuntime(RemoteRuntime):
         )
 
     def from_image(self, image):
+        """
+        Deploy the function with an existing nuclio processor image.
+        This applies only for the reverse proxy and not the application image.
+
+        :param image: image name
+        """
         super().from_image(image)
         # nuclio implementation detail - when providing the image and emptying out the source code and build source,
         # nuclio skips rebuilding the image and simply takes the prebuilt image
@@ -374,6 +380,17 @@ class ApplicationRuntime(RemoteRuntime):
         self.status.application_source = self.spec.build.source
         self.spec.build.source = ""
 
+        # save the image in the status, so we won't repopulate the function source code
+        self.status.container_image = image
+
+        # ensure golang runtime and handler for the reverse proxy
+        self.spec.nuclio_runtime = "golang"
+        update_in(
+            self.spec.base_spec,
+            "spec.handler",
+            "main:Handler",
+        )
+
     @classmethod
     def get_filename_and_handler(cls) -> (str, str):
         reverse_proxy_file_path = pathlib.Path(__file__).parent / "reverse_proxy.go"
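update_in (imported in the first application.py hunk above) sets a nested key by a dotted path. A minimal sketch of the assumed semantics, using an invented spec dict:

    def update_in(obj: dict, path: str, value):
        # Assumed behavior of mlrun.utils.update_in: walk/create nested dicts
        # along the dotted path and set the final key.
        keys = path.split(".")
        for key in keys[:-1]:
            obj = obj.setdefault(key, {})
        obj[keys[-1]] = value

    spec = {"spec": {"runtime": "golang"}}
    update_in(spec, "spec.handler", "main:Handler")
    print(spec)  # -> {'spec': {'runtime': 'golang', 'handler': 'main:Handler'}}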
@@ -549,6 +566,12 @@ class ApplicationRuntime(RemoteRuntime):
         self.set_env("SIDECAR_PORT", self.spec.internal_application_port)
         self.set_env("SIDECAR_HOST", "http://localhost")
 
+        # configure the sidecar container as the default container for logging purposes
+        self.set_config(
+            "metadata.annotations",
+            {"kubectl.kubernetes.io/default-container": self.status.sidecar_name},
+        )
+
     def _sync_api_gateway(self):
         if not self.status.api_gateway_name:
             return
mlrun/runtimes/nuclio/function.py
CHANGED

@@ -446,6 +446,11 @@ class RemoteRuntime(KubeResource):
         return self
 
     def from_image(self, image):
+        """
+        Deploy the function with an existing nuclio processor image.
+
+        :param image: image name
+        """
         config = nuclio.config.new_config()
         update_in(
             config,
mlrun/runtimes/nuclio/serving.py
CHANGED

@@ -480,7 +480,7 @@ class ServingRuntime(RemoteRuntime):
         trigger_args = stream.trigger_args or {}
 
         engine = self.spec.graph.engine or "async"
-        if mlrun.mlconf.
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             trigger_args["explicit_ack_mode"] = trigger_args.get(
                 "explicit_ack_mode", "explicitOnly"
             )
mlrun/runtimes/pod.py
CHANGED

@@ -215,9 +215,7 @@ class KubeResourceSpec(FunctionSpec):
             image_pull_secret or mlrun.mlconf.function.spec.image_pull_secret.default
         )
         self.node_name = node_name
-        self.node_selector = (
-            node_selector or mlrun.mlconf.get_default_function_node_selector()
-        )
+        self.node_selector = node_selector or {}
         self._affinity = affinity
         self.priority_class_name = (
             priority_class_name or mlrun.mlconf.default_function_priority_class_name
@@ -532,7 +530,7 @@ class KubeResourceSpec(FunctionSpec):
             return
 
         # merge node selectors - precedence to existing node selector
-        self.node_selector = mlrun.utils.helpers.
+        self.node_selector = mlrun.utils.helpers.merge_dicts_with_precedence(
             node_selector, self.node_selector
         )
mlrun/runtimes/utils.py
CHANGED

@@ -445,3 +445,21 @@ def enrich_run_labels(
         if label.value not in labels and enrichment:
             labels[label.value] = enrichment
     return labels
+
+
+def resolve_node_selectors(
+    project_node_selector: dict, instance_node_selector: dict
+) -> dict:
+    config_node_selector = mlrun.mlconf.get_default_function_node_selector()
+    if project_node_selector or config_node_selector:
+        mlrun.utils.logger.debug(
+            "Enriching node selector from project and mlrun config",
+            project_node_selector=project_node_selector,
+            config_node_selector=config_node_selector,
+        )
+        return mlrun.utils.helpers.merge_dicts_with_precedence(
+            config_node_selector,
+            project_node_selector,
+            instance_node_selector,
+        )
+    return instance_node_selector
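The new resolve_node_selectors relies on merge_dicts_with_precedence, also touched in mlrun/runtimes/pod.py above. A minimal sketch of the assumed semantics, where later arguments win on key collisions and the selector values are invented:

    def merge_dicts_with_precedence(*dicts: dict) -> dict:
        # Assumed behavior: later dicts override earlier ones on collisions.
        merged: dict = {}
        for d in dicts:
            merged.update(d or {})
        return merged

    config = {"zone": "us-east-1a", "tier": "cpu"}  # mlrun config default
    project = {"tier": "gpu"}                       # project-level selector
    instance = {"disk": "ssd"}                      # function-level selector
    print(merge_dicts_with_precedence(config, project, instance))
    # -> {'zone': 'us-east-1a', 'tier': 'gpu', 'disk': 'ssd'}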
mlrun/serving/states.py
CHANGED

@@ -872,7 +872,8 @@ class QueueStep(BaseStep):
             return event
 
         if self._stream:
-
+            full_event = self.options.get("full_event")
+            if full_event or full_event is None and self.next:
                 data = storey.utils.wrap_event_for_serialization(event, data)
             self._stream.push(data)
             event.terminated = True
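The new default in QueueStep (mirrored in _init_async_objects in the next hunk) reads as: an explicit full_event setting wins, otherwise wrap the event only when the queue has downstream steps. A standalone restatement with invented names:

    def resolve_full_event(full_event, has_next_step: bool) -> bool:
        # Mirrors `full_event or full_event is None and self.next`.
        return bool(full_event or (full_event is None and has_next_step))

    assert resolve_full_event(True, False) is True    # explicit opt-in wins
    assert resolve_full_event(False, True) is False   # explicit opt-out wins
    assert resolve_full_event(None, True) is True     # default: downstream steps exist
    assert resolve_full_event(None, False) is False   # default: nothing downstream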
@@ -1630,7 +1631,11 @@ def _init_async_objects(context, steps):
         if step.path and not skip_stream:
             stream_path = step.path
             endpoint = None
-
+            # in case of a queue, we default to a full_event=True
+            full_event = step.options.get("full_event")
+            options = {
+                "full_event": full_event or full_event is None and step.next
+            }
             options.update(step.options)
 
             kafka_brokers = get_kafka_brokers_from_dict(options, pop=True)

@@ -1684,7 +1689,9 @@
             wait_for_result = True
 
     source_args = context.get_param("source_args", {})
-    explicit_ack =
+    explicit_ack = (
+        is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack_enabled()
+    )
 
     # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
     default_source = storey.SyncEmitSource(