mlrun 1.7.0rc13__py3-none-any.whl → 1.7.0rc15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (85)
  1. mlrun/__main__.py +0 -105
  2. mlrun/artifacts/__init__.py +1 -2
  3. mlrun/artifacts/base.py +8 -250
  4. mlrun/artifacts/dataset.py +1 -190
  5. mlrun/artifacts/manager.py +2 -41
  6. mlrun/artifacts/model.py +1 -140
  7. mlrun/artifacts/plots.py +1 -375
  8. mlrun/common/schemas/model_monitoring/__init__.py +4 -0
  9. mlrun/common/schemas/model_monitoring/constants.py +24 -3
  10. mlrun/common/schemas/model_monitoring/model_endpoints.py +13 -1
  11. mlrun/common/schemas/project.py +1 -0
  12. mlrun/config.py +14 -4
  13. mlrun/data_types/to_pandas.py +4 -4
  14. mlrun/datastore/base.py +41 -9
  15. mlrun/datastore/datastore_profile.py +50 -3
  16. mlrun/datastore/hdfs.py +5 -0
  17. mlrun/datastore/inmem.py +2 -2
  18. mlrun/datastore/sources.py +43 -2
  19. mlrun/datastore/store_resources.py +2 -6
  20. mlrun/datastore/targets.py +125 -6
  21. mlrun/datastore/v3io.py +1 -1
  22. mlrun/db/auth_utils.py +152 -0
  23. mlrun/db/base.py +1 -1
  24. mlrun/db/httpdb.py +69 -33
  25. mlrun/feature_store/__init__.py +0 -2
  26. mlrun/feature_store/api.py +12 -47
  27. mlrun/feature_store/feature_set.py +9 -0
  28. mlrun/feature_store/retrieval/base.py +9 -4
  29. mlrun/feature_store/retrieval/conversion.py +4 -4
  30. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  31. mlrun/feature_store/retrieval/job.py +2 -0
  32. mlrun/feature_store/retrieval/local_merger.py +2 -0
  33. mlrun/feature_store/retrieval/spark_merger.py +5 -0
  34. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +5 -10
  35. mlrun/kfpops.py +5 -10
  36. mlrun/launcher/base.py +1 -1
  37. mlrun/launcher/client.py +1 -1
  38. mlrun/lists.py +2 -2
  39. mlrun/model.py +36 -9
  40. mlrun/model_monitoring/api.py +41 -18
  41. mlrun/model_monitoring/application.py +5 -305
  42. mlrun/model_monitoring/applications/__init__.py +11 -0
  43. mlrun/model_monitoring/applications/_application_steps.py +158 -0
  44. mlrun/model_monitoring/applications/base.py +282 -0
  45. mlrun/model_monitoring/applications/context.py +214 -0
  46. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  47. mlrun/model_monitoring/applications/histogram_data_drift.py +92 -77
  48. mlrun/model_monitoring/applications/results.py +99 -0
  49. mlrun/model_monitoring/controller.py +3 -1
  50. mlrun/model_monitoring/db/stores/sqldb/models/base.py +7 -6
  51. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +1 -1
  52. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +67 -4
  53. mlrun/model_monitoring/evidently_application.py +6 -118
  54. mlrun/model_monitoring/helpers.py +1 -1
  55. mlrun/model_monitoring/model_endpoint.py +3 -2
  56. mlrun/model_monitoring/stream_processing.py +2 -3
  57. mlrun/model_monitoring/writer.py +69 -39
  58. mlrun/platforms/iguazio.py +2 -2
  59. mlrun/projects/pipelines.py +24 -7
  60. mlrun/projects/project.py +130 -65
  61. mlrun/render.py +2 -10
  62. mlrun/run.py +1 -4
  63. mlrun/runtimes/__init__.py +3 -3
  64. mlrun/runtimes/base.py +3 -3
  65. mlrun/runtimes/funcdoc.py +0 -28
  66. mlrun/runtimes/local.py +1 -1
  67. mlrun/runtimes/mpijob/__init__.py +0 -20
  68. mlrun/runtimes/mpijob/v1.py +1 -1
  69. mlrun/runtimes/nuclio/api_gateway.py +275 -153
  70. mlrun/runtimes/nuclio/function.py +1 -1
  71. mlrun/runtimes/pod.py +5 -5
  72. mlrun/runtimes/utils.py +1 -1
  73. mlrun/serving/states.py +53 -2
  74. mlrun/utils/helpers.py +27 -40
  75. mlrun/utils/notifications/notification/slack.py +31 -8
  76. mlrun/utils/notifications/notification_pusher.py +133 -14
  77. mlrun/utils/version/version.json +2 -2
  78. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/METADATA +2 -2
  79. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/RECORD +84 -79
  80. mlrun/runtimes/mpijob/v1alpha1.py +0 -29
  81. /mlrun/{runtimes → common/runtimes}/constants.py +0 -0
  82. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/LICENSE +0 -0
  83. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/WHEEL +0 -0
  84. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/entry_points.txt +0 -0
  85. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc15.dist-info}/top_level.txt +0 -0
@@ -409,7 +409,7 @@ class SQLStoreBase(mlrun.model_monitoring.db.StoreBase):
             + "_"
             + event[mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME]
             + "_"
-            + event[mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME]
+            + event[mlrun.common.schemas.model_monitoring.ResultData.RESULT_NAME]
         )

     def get_last_analyzed(self, endpoint_id: str, application_name: str) -> int:
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#

 import json
 import os
@@ -41,7 +40,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
     client and usually the KV table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/.
     """

-    def __init__(self, project: str, access_key: str):
+    def __init__(self, project: str, access_key: typing.Optional[str] = None) -> None:
         super().__init__(project=project)
         # Initialize a V3IO client instance
         self.access_key = access_key or os.environ.get("V3IO_ACCESS_KEY")
@@ -410,7 +409,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
             mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME
         )
         metric_name = event.pop(
-            mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME
+            mlrun.common.schemas.model_monitoring.ResultData.RESULT_NAME
         )
         attributes = {metric_name: json.dumps(event)}

@@ -446,7 +445,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
         """Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
         fields = [
             {
-                "name": mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME,
+                "name": mlrun.common.schemas.model_monitoring.ResultData.RESULT_NAME,
                 "type": "string",
                 "nullable": False,
             }
@@ -703,3 +702,67 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
     @staticmethod
     def _get_monitoring_schedules_container(project_name: str) -> str:
         return f"users/pipelines/{project_name}/monitoring-schedules/functions"
+
+    def _extract_metrics_from_items(
+        self, app_items: list[dict[str, str]]
+    ) -> list[mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric]:
+        metrics: list[
+            mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric
+        ] = []
+        for app_item in app_items:
+            # See https://www.iguazio.com/docs/latest-release/services/data-layer/reference/system-attributes/#sys-attr-__name
+            app_name = app_item.pop("__name")
+            if app_name == ".#schema":
+                continue
+            for result_name in app_item:
+                metrics.append(
+                    mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric(
+                        project=self.project,
+                        app=app_name,
+                        type=mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetricType.RESULT,
+                        name=result_name,
+                        full_name=".".join(
+                            [
+                                self.project,
+                                app_name,
+                                mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetricType.RESULT,
+                                result_name,
+                            ]
+                        ),
+                    )
+                )
+        return metrics
+
+    def get_model_endpoint_metrics(
+        self, endpoint_id: str
+    ) -> list[mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric]:
+        """Get model monitoring results and metrics on the endpoint"""
+        metrics: list[
+            mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric
+        ] = []
+        container = self.get_v3io_monitoring_apps_container(self.project)
+        try:
+            response = self.client.kv.scan(container=container, table_path=endpoint_id)
+        except v3io.dataplane.response.HttpResponseError as err:
+            if err.status_code == HTTPStatus.NOT_FOUND:
+                logger.warning(
+                    "Attempt getting metrics and results - no data. Check the "
+                    "project name, endpoint, or wait for the applications to start.",
+                    container=container,
+                    table_path=endpoint_id,
+                )
+                return []
+            raise
+
+        while True:
+            metrics.extend(self._extract_metrics_from_items(response.output.items))
+            if response.output.last:
+                break
+            # TODO: Use AIO client: `v3io.aio.dataplane.client.Client`
+            response = self.client.kv.scan(
+                container=container,
+                table_path=endpoint_id,
+                marker=response.output.next_marker,
+            )
+
+        return metrics
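
Note: the new `get_model_endpoint_metrics` API pages through the endpoint's KV table with `kv.scan` and turns every non-schema attribute into a `ModelEndpointMonitoringMetric`. A minimal usage sketch follows; the import path is inferred from the file list above, and the project/endpoint values are placeholders:

from mlrun.model_monitoring.db.stores.v3io_kv.kv_store import KVStoreBase

# access_key falls back to the V3IO_ACCESS_KEY environment variable when omitted
store = KVStoreBase(project="my-project")
for metric in store.get_model_endpoint_metrics(endpoint_id="my-endpoint-id"):
    # full_name is built as "<project>.<app>.<type>.<result name>", per _extract_metrics_from_items above
    print(metric.full_name)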
@@ -12,121 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import uuid
-import warnings
-from typing import Union
-
-import pandas as pd
-import semver
-
-from mlrun.errors import MLRunIncompatibleVersionError
-from mlrun.model_monitoring.application import ModelMonitoringApplicationBase
-
-SUPPORTED_EVIDENTLY_VERSION = semver.Version.parse("0.4.11")
-
-
-def _check_evidently_version(*, cur: semver.Version, ref: semver.Version) -> None:
-    if ref.is_compatible(cur) or (
-        cur.major == ref.major == 0 and cur.minor == ref.minor and cur.patch > ref.patch
-    ):
-        return
-    if cur.major == ref.major == 0 and cur.minor > ref.minor:
-        warnings.warn(
-            f"Evidently version {cur} is not compatible with the tested "
-            f"version {ref}, use at your own risk."
-        )
-    else:
-        raise MLRunIncompatibleVersionError(
-            f"Evidently version {cur} is not supported, please change to "
-            f"{ref} (or another compatible version)."
-        )
-
-
-_HAS_EVIDENTLY = False
-try:
-    import evidently  # noqa: F401
-
-    _check_evidently_version(
-        cur=semver.Version.parse(evidently.__version__),
-        ref=SUPPORTED_EVIDENTLY_VERSION,
-    )
-    _HAS_EVIDENTLY = True
-except ModuleNotFoundError:
-    pass
-
-
-if _HAS_EVIDENTLY:
-    from evidently.renderers.notebook_utils import determine_template
-    from evidently.report.report import Report
-    from evidently.suite.base_suite import Suite
-    from evidently.ui.type_aliases import STR_UUID
-    from evidently.ui.workspace import Workspace
-    from evidently.utils.dashboard import TemplateParams
-
-
-class EvidentlyModelMonitoringApplicationBase(ModelMonitoringApplicationBase):
-    def __init__(
-        self, evidently_workspace_path: str, evidently_project_id: "STR_UUID"
-    ) -> None:
-        """
-        A class for integrating Evidently for mlrun model monitoring within a monitoring application.
-        Note: evidently is not installed by default in the mlrun/mlrun image.
-        It must be installed separately to use this class.
-
-        :param evidently_workspace_path: (str) The path to the Evidently workspace.
-        :param evidently_project_id: (str) The ID of the Evidently project.
-
-        """
-        if not _HAS_EVIDENTLY:
-            raise ModuleNotFoundError("Evidently is not installed - the app cannot run")
-        self.evidently_workspace = Workspace.create(evidently_workspace_path)
-        self.evidently_project_id = evidently_project_id
-        self.evidently_project = self.evidently_workspace.get_project(
-            evidently_project_id
-        )
-
-    def log_evidently_object(
-        self, evidently_object: Union["Report", "Suite"], artifact_name: str
-    ):
-        """
-        Logs an Evidently report or suite as an artifact.
-
-        :param evidently_object: (Union[Report, Suite]) The Evidently report or suite object.
-        :param artifact_name: (str) The name for the logged artifact.
-        """
-        evidently_object_html = evidently_object.get_html()
-        self.context.log_artifact(
-            artifact_name, body=evidently_object_html.encode("utf-8"), format="html"
-        )
-
-    def log_project_dashboard(
-        self,
-        timestamp_start: pd.Timestamp,
-        timestamp_end: pd.Timestamp,
-        artifact_name: str = "dashboard",
-    ):
-        """
-        Logs an Evidently project dashboard.
-
-        :param timestamp_start: (pd.Timestamp) The start timestamp for the dashboard data.
-        :param timestamp_end: (pd.Timestamp) The end timestamp for the dashboard data.
-        :param artifact_name: (str) The name for the logged artifact.
-        """
-
-        dashboard_info = self.evidently_project.build_dashboard_info(
-            timestamp_start, timestamp_end
-        )
-        template_params = TemplateParams(
-            dashboard_id="pd_" + str(uuid.uuid4()).replace("-", ""),
-            dashboard_info=dashboard_info,
-            additional_graphs={},
-        )
-
-        dashboard_html = self._render(determine_template("inline"), template_params)
-        self.context.log_artifact(
-            artifact_name, body=dashboard_html.encode("utf-8"), format="html"
-        )
-
-    @staticmethod
-    def _render(temple_func, template_params: "TemplateParams"):
-        return temple_func(params=template_params)
+# TODO : delete this file in 1.9.0
+from mlrun.model_monitoring.applications import (  # noqa: F401
+    _HAS_EVIDENTLY,
+    SUPPORTED_EVIDENTLY_VERSION,
+    EvidentlyModelMonitoringApplicationBase,
+)
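
Note: the Evidently integration now lives in the new `mlrun.model_monitoring.applications` package (files 42-48 above); this module is kept only as a backward-compatible re-export until 1.9.0. A sketch of the import going forward; the constructor arguments (Evidently workspace path and project ID) are assumed unchanged from the removed implementation above:

# Preferred import path; the old mlrun.model_monitoring.evidently_application module
# keeps working via the re-export above until it is removed.
from mlrun.model_monitoring.applications import EvidentlyModelMonitoringApplicationBase


class MyEvidentlyApp(EvidentlyModelMonitoringApplicationBase):
    # application logic goes here
    ...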
@@ -215,7 +215,7 @@ def update_model_endpoint_last_request(

 def calculate_inputs_statistics(
     sample_set_statistics: dict, inputs: pd.DataFrame
-) -> dict:
+) -> mlrun.common.model_monitoring.helpers.FeatureStats:
     """
     Calculate the inputs data statistics for drift monitoring purpose.

@@ -17,6 +17,7 @@ from dataclasses import dataclass, field
 from typing import Any

 import mlrun.model
+from mlrun.common.model_monitoring.helpers import FeatureStats
 from mlrun.common.schemas.model_monitoring.constants import (
     EndpointType,
     EventKeyMetrics,
@@ -42,8 +43,8 @@ class ModelEndpointSpec(mlrun.model.ModelObj):

 @dataclass
 class ModelEndpointStatus(mlrun.model.ModelObj):
-    feature_stats: dict = field(default_factory=dict)
-    current_stats: dict = field(default_factory=dict)
+    feature_stats: FeatureStats = field(default_factory=dict)
+    current_stats: FeatureStats = field(default_factory=dict)
     first_request: str = ""
     last_request: str = ""
     error_count: int = 0
@@ -40,7 +40,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
     ProjectSecretKeys,
     PrometheusEndpoints,
 )
-from mlrun.model_monitoring.helpers import get_endpoint_record
 from mlrun.utils import logger


@@ -807,7 +806,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         # left them
         if endpoint_id not in self.endpoints:
             logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record = get_endpoint_record(
+            endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
                 project=self.project,
                 endpoint_id=endpoint_id,
             )
@@ -940,7 +939,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         label_values = event[EventFieldType.PREDICTION]
         # Get feature names and label columns
         if endpoint_id not in self.feature_names:
-            endpoint_record = get_endpoint_record(
+            endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
                 project=self.project,
                 endpoint_id=endpoint_id,
             )
@@ -23,14 +23,18 @@ from v3io_frames.errors import Error as V3IOFramesError
 from v3io_frames.frames_pb2 import IGNORE

 import mlrun.common.model_monitoring
+import mlrun.common.schemas
 import mlrun.common.schemas.alert as alert_constants
 import mlrun.model_monitoring
 import mlrun.model_monitoring.db.stores
 import mlrun.utils.v3io_clients
 from mlrun.common.schemas.model_monitoring.constants import (
     EventFieldType,
+    MetricData,
+    ResultData,
     ResultStatusApp,
     WriterEvent,
+    WriterEventKind,
 )
 from mlrun.common.schemas.notification import NotificationKind, NotificationSeverity
 from mlrun.model_monitoring.helpers import get_endpoint_record
@@ -75,20 +79,20 @@ class _Notifier:
         self._severity = severity

     def _should_send_event(self) -> bool:
-        return self._event[WriterEvent.RESULT_STATUS] >= ResultStatusApp.detected
+        return self._event[ResultData.RESULT_STATUS] >= ResultStatusApp.detected.value

     def _generate_message(self) -> str:
         return f"""\
 The monitoring app `{self._event[WriterEvent.APPLICATION_NAME]}` \
-of kind `{self._event[WriterEvent.RESULT_KIND]}` \
+of kind `{self._event[ResultData.RESULT_KIND]}` \
 detected a problem in model endpoint ID `{self._event[WriterEvent.ENDPOINT_ID]}` \
 at time `{self._event[WriterEvent.START_INFER_TIME]}`.

 Result data:
-Name: `{self._event[WriterEvent.RESULT_NAME]}`
-Value: `{self._event[WriterEvent.RESULT_VALUE]}`
-Status: `{self._event[WriterEvent.RESULT_STATUS]}`
-Extra data: `{self._event[WriterEvent.RESULT_EXTRA_DATA]}`\
+Name: `{self._event[ResultData.RESULT_NAME]}`
+Value: `{self._event[ResultData.RESULT_VALUE]}`
+Status: `{self._event[ResultData.RESULT_STATUS]}`
+Extra data: `{self._event[ResultData.RESULT_EXTRA_DATA]}`\
 """

     def notify(self) -> None:
@@ -144,19 +148,25 @@ class ModelMonitoringWriter(StepToDict):
             rate=_TSDB_RATE,
         )

-    def _update_kv_db(self, event: _AppResultEvent) -> None:
+    def _update_kv_db(self, event: _AppResultEvent, kind: str = "result") -> None:
+        if kind == "metric":
+            # TODO : Implement the logic for writing metrics to KV
+            return
         event = _AppResultEvent(event.copy())
         application_result_store = mlrun.model_monitoring.get_store_object(
             project=self.project
         )
         application_result_store.write_application_result(event=event)

-    def _update_tsdb(self, event: _AppResultEvent) -> None:
+    def _update_tsdb(self, event: _AppResultEvent, kind: str = "result") -> None:
+        if kind == "metric":
+            # TODO : Implement the logic for writing metrics to TSDB
+            return
         event = _AppResultEvent(event.copy())
         event[WriterEvent.END_INFER_TIME] = datetime.datetime.fromisoformat(
             event[WriterEvent.END_INFER_TIME]
         )
-        del event[WriterEvent.RESULT_EXTRA_DATA]
+        del event[ResultData.RESULT_EXTRA_DATA]
         try:
             self._tsdb_client.write(
                 backend=_TSDB_BE,
@@ -166,7 +176,7 @@ class ModelMonitoringWriter(StepToDict):
                     WriterEvent.END_INFER_TIME,
                     WriterEvent.ENDPOINT_ID,
                     WriterEvent.APPLICATION_NAME,
-                    WriterEvent.RESULT_NAME,
+                    ResultData.RESULT_NAME,
                 ],
             )
             logger.info("Updated V3IO TSDB successfully", table=_TSDB_TABLE)
@@ -180,20 +190,21 @@ class ModelMonitoringWriter(StepToDict):

     @staticmethod
     def _generate_event_on_drift(
-        uid: str, drift_status: str, event_value: dict, project_name: str
-    ):
+        model_endpoint: str, drift_status: str, event_value: dict, project_name: str
+    ) -> None:
         if (
-            drift_status == ResultStatusApp.detected
-            or drift_status == ResultStatusApp.potential_detection
+            drift_status == ResultStatusApp.detected.value
+            or drift_status == ResultStatusApp.potential_detection.value
         ):
+            logger.info("Sending an alert")
             entity = {
                 "kind": alert_constants.EventEntityKind.MODEL,
                 "project": project_name,
-                "id": uid,
+                "model_endpoint": model_endpoint,
             }
             event_kind = (
                 alert_constants.EventKind.DRIFT_DETECTED
-                if drift_status == ResultStatusApp.detected
+                if drift_status == ResultStatusApp.detected.value
                 else alert_constants.EventKind.DRIFT_SUSPECTED
             )
             event_data = mlrun.common.schemas.Event(
@@ -202,37 +213,57 @@
         mlrun.get_run_db().generate_event(event_kind, event_data)

     @staticmethod
-    def _reconstruct_event(event: _RawEvent) -> _AppResultEvent:
+    def _reconstruct_event(event: _RawEvent) -> tuple[_AppResultEvent, str]:
         """
         Modify the raw event into the expected monitoring application event
         schema as defined in `mlrun.common.schemas.model_monitoring.constants.WriterEvent`
         """
-        try:
-            result_event = _AppResultEvent(
-                {key: event[key] for key in WriterEvent.list()}
+        if not isinstance(event, dict):
+            raise _WriterEventTypeError(
+                f"The event is of type: {type(event)}, expected a dictionary"
             )
-            result_event[WriterEvent.CURRENT_STATS] = json.loads(
-                event[WriterEvent.CURRENT_STATS]
+        kind = event.pop(WriterEvent.EVENT_KIND, WriterEventKind.RESULT)
+        result_event = _AppResultEvent(json.loads(event.pop(WriterEvent.DATA, "{}")))
+        if not result_event:  # BC for < 1.7.0, can be removed in 1.9.0
+            result_event = _AppResultEvent(event)
+        else:
+            result_event.update(_AppResultEvent(event))
+
+        expected_keys = list(
+            set(WriterEvent.list()).difference(
+                [WriterEvent.EVENT_KIND, WriterEvent.DATA]
             )
-            return result_event
-        except KeyError as err:
+        )
+        if kind == WriterEventKind.METRIC:
+            expected_keys.extend(MetricData.list())
+        elif kind == WriterEventKind.RESULT:
+            expected_keys.extend(ResultData.list())
+        else:
             raise _WriterEventValueError(
-                "The received event misses some keys compared to the expected "
-                "monitoring application event schema"
-            ) from err
-        except TypeError as err:
-            raise _WriterEventTypeError(
-                f"The event is of type: {type(event)}, expected a dictionary"
-            ) from err
+                f"Unknown event kind: {kind}, expected one of: {WriterEventKind.list()}"
+            )
+        missing_keys = [key for key in expected_keys if key not in result_event]
+        if missing_keys:
+            raise _WriterEventValueError(
+                f"The received event misses some keys compared to the expected "
+                f"monitoring application event schema: {missing_keys}"
+            )
+
+        return result_event, kind

     def do(self, event: _RawEvent) -> None:
-        event = self._reconstruct_event(event)
+        event, kind = self._reconstruct_event(event)
         logger.info("Starting to write event", event=event)
-        self._update_tsdb(event)
-        self._update_kv_db(event)
+
+        self._update_tsdb(event, kind)
+        self._update_kv_db(event, kind)
+        logger.info("Completed event DB writes")
         _Notifier(event=event, notification_pusher=self._custom_notifier).notify()

-        if mlrun.mlconf.alerts.mode == mlrun.common.schemas.alert.AlertsModes.enabled:
+        if (
+            mlrun.mlconf.alerts.mode == mlrun.common.schemas.alert.AlertsModes.enabled
+            and kind == WriterEventKind.RESULT
+        ):
             endpoint_id = event[WriterEvent.ENDPOINT_ID]
             endpoint_record = self._endpoints_records.setdefault(
                 endpoint_id,
@@ -242,13 +273,12 @@
                 "app_name": event[WriterEvent.APPLICATION_NAME],
                 "model": endpoint_record.get(EventFieldType.MODEL),
                 "model_endpoint_id": event[WriterEvent.ENDPOINT_ID],
-                "result_name": event[WriterEvent.RESULT_NAME],
-                "result_value": event[WriterEvent.RESULT_VALUE],
+                "result_name": event[ResultData.RESULT_NAME],
+                "result_value": event[ResultData.RESULT_VALUE],
             }
             self._generate_event_on_drift(
                 event[WriterEvent.ENDPOINT_ID],
-                event[WriterEvent.RESULT_STATUS],
+                event[ResultData.RESULT_STATUS],
                 event_value,
                 self.project,
             )
-        logger.info("Completed event DB writes")
@@ -525,8 +525,8 @@ def add_or_refresh_credentials(
     # different access keys for the 2 usages
     token = (
         token
-        # can't use mlrun.runtimes.constants.FunctionEnvironmentVariables.auth_session cause this is running in the
-        # import execution path (when we're initializing the run db) and therefore we can't import mlrun.runtimes
+        # can't use mlrun.common.runtimes.constants.FunctionEnvironmentVariables.auth_session cause this is running
+        # in the import execution path (when we're initializing the run db) and therefore we can't import mlrun.runtimes
         or os.environ.get("MLRUN_AUTH_SESSION")
         or os.environ.get("V3IO_ACCESS_KEY")
     )
@@ -13,6 +13,7 @@
 # limitations under the License.
 import abc
 import builtins
+import http
 import importlib.util as imputil
 import os
 import tempfile
@@ -521,7 +522,7 @@ class _PipelineRunner(abc.ABC):
     @staticmethod
     def _get_handler(workflow_handler, workflow_spec, project, secrets):
         if not (workflow_handler and callable(workflow_handler)):
-            workflow_file = workflow_spec.get_source_file(project.spec.context)
+            workflow_file = workflow_spec.get_source_file(project.spec.get_code_path())
             workflow_handler = create_pipeline(
                 project,
                 workflow_file,
@@ -553,7 +554,7 @@ class _KFPRunner(_PipelineRunner):
     @classmethod
     def save(cls, project, workflow_spec: WorkflowSpec, target, artifact_path=None):
         pipeline_context.set(project, workflow_spec)
-        workflow_file = workflow_spec.get_source_file(project.spec.context)
+        workflow_file = workflow_spec.get_source_file(project.spec.get_code_path())
         functions = FunctionsDict(project)
         pipeline = create_pipeline(
             project,
@@ -882,17 +883,33 @@ class _RemoteRunner(_PipelineRunner):
                 get_workflow_id_timeout=get_workflow_id_timeout,
            )

+            def _get_workflow_id_or_bail():
+                try:
+                    return run_db.get_workflow_id(
+                        project=project.name,
+                        name=workflow_response.name,
+                        run_id=workflow_response.run_id,
+                        engine=workflow_spec.engine,
+                    )
+                except mlrun.errors.MLRunHTTPStatusError as get_wf_exc:
+                    # fail fast on specific errors
+                    if get_wf_exc.error_status_code in [
+                        http.HTTPStatus.PRECONDITION_FAILED
+                    ]:
+                        raise mlrun.errors.MLRunFatalFailureError(
+                            original_exception=get_wf_exc
+                        )
+
+                    # raise for a retry (on other errors)
+                    raise
+
             # Getting workflow id from run:
             response = retry_until_successful(
                 1,
                 get_workflow_id_timeout,
                 logger,
                 False,
-                run_db.get_workflow_id,
-                project=project.name,
-                name=workflow_response.name,
-                run_id=workflow_response.run_id,
-                engine=workflow_spec.engine,
+                _get_workflow_id_or_bail,
             )
             workflow_id = response.workflow_id
             # After fetching the workflow_id the workflow executed successfully
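
Note: the `_get_workflow_id_or_bail` wrapper exists so that the retry loop keeps polling for the workflow ID on transient errors but aborts immediately on a 412 Precondition Failed. A generic sketch of the same fail-fast pattern, assuming mlrun's retry helper semantics (raising `MLRunFatalFailureError` stops the retries) and an assumed import path for `retry_until_successful`; `fetch_resource` is a hypothetical callable:

import http

import mlrun.errors
from mlrun.utils import logger, retry_until_successful


def _fetch_or_bail():
    try:
        return fetch_resource()  # hypothetical call that may fail transiently
    except mlrun.errors.MLRunHTTPStatusError as exc:
        if exc.error_status_code == http.HTTPStatus.PRECONDITION_FAILED:
            # retrying will not help - wrap in a fatal error to stop the retry loop
            raise mlrun.errors.MLRunFatalFailureError(original_exception=exc)
        raise  # any other error is retried until the timeout


# retry every 1 second, for up to 60 seconds, without verbose logging
result = retry_until_successful(1, 60, logger, False, _fetch_or_bail)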