mlrun 1.7.0rc34__py3-none-any.whl → 1.7.0rc35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (34)
  1. mlrun/artifacts/base.py +1 -0
  2. mlrun/common/schemas/__init__.py +0 -1
  3. mlrun/common/schemas/model_monitoring/__init__.py +1 -2
  4. mlrun/common/schemas/model_monitoring/constants.py +3 -16
  5. mlrun/common/schemas/notification.py +1 -1
  6. mlrun/common/types.py +1 -0
  7. mlrun/config.py +6 -7
  8. mlrun/datastore/sources.py +8 -4
  9. mlrun/db/base.py +2 -3
  10. mlrun/db/httpdb.py +3 -3
  11. mlrun/model.py +1 -1
  12. mlrun/model_monitoring/applications/evidently_base.py +4 -5
  13. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +5 -0
  14. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +2 -2
  15. mlrun/model_monitoring/db/tsdb/base.py +6 -3
  16. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
  17. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +22 -3
  18. mlrun/model_monitoring/stream_processing.py +5 -153
  19. mlrun/projects/pipelines.py +76 -73
  20. mlrun/run.py +4 -0
  21. mlrun/runtimes/nuclio/application/application.py +25 -2
  22. mlrun/runtimes/nuclio/function.py +5 -0
  23. mlrun/runtimes/nuclio/serving.py +1 -1
  24. mlrun/runtimes/pod.py +2 -4
  25. mlrun/serving/states.py +3 -1
  26. mlrun/utils/helpers.py +27 -14
  27. mlrun/utils/version/version.json +2 -2
  28. {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/METADATA +3 -1
  29. {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/RECORD +33 -34
  30. {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/WHEEL +1 -1
  31. mlrun/model_monitoring/prometheus.py +0 -216
  32. {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/LICENSE +0 -0
  33. {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/entry_points.txt +0 -0
  34. {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/top_level.txt +0 -0
mlrun/artifacts/base.py CHANGED
@@ -632,6 +632,7 @@ class DirArtifactSpec(ArtifactSpec):
632
632
  "src_path",
633
633
  "target_path",
634
634
  "db_key",
635
+ "producer",
635
636
  ]
636
637
 
637
638
 
@@ -151,7 +151,6 @@ from .model_monitoring import (
151
151
  ModelMonitoringMode,
152
152
  ModelMonitoringStoreKinds,
153
153
  MonitoringFunctionNames,
154
- PrometheusEndpoints,
155
154
  TSDBTarget,
156
155
  V3IOTSDBTables,
157
156
  )
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from .constants import (
16
+ V3IO_MODEL_MONITORING_DB,
16
17
  ControllerPolicy,
17
18
  DriftStatus,
18
19
  EndpointType,
@@ -31,8 +32,6 @@ from .constants import (
31
32
  MonitoringFunctionNames,
32
33
  PredictionsQueryConstants,
33
34
  ProjectSecretKeys,
34
- PrometheusEndpoints,
35
- PrometheusMetric,
36
35
  ResultData,
37
36
  ResultKindApp,
38
37
  SchedulingKeys,
@@ -170,7 +170,6 @@ class StreamKind(MonitoringStrEnum):
170
170
  class TSDBTarget(MonitoringStrEnum):
171
171
  V3IO_TSDB = "v3io-tsdb"
172
172
  TDEngine = "tdengine"
173
- PROMETHEUS = "prometheus"
174
173
 
175
174
 
176
175
  class ProjectSecretKeys:
@@ -231,21 +230,6 @@ class EndpointType(IntEnum):
231
230
  LEAF_EP = 3 # end point that is a child of a router
232
231
 
233
232
 
234
- class PrometheusMetric:
235
- PREDICTIONS_TOTAL = "predictions_total"
236
- MODEL_LATENCY_SECONDS = "model_latency_seconds"
237
- INCOME_FEATURES = "income_features"
238
- ERRORS_TOTAL = "errors_total"
239
- DRIFT_METRICS = "drift_metrics"
240
- DRIFT_STATUS = "drift_status"
241
-
242
-
243
- class PrometheusEndpoints(MonitoringStrEnum):
244
- MODEL_MONITORING_METRICS = "/model-monitoring-metrics"
245
- MONITORING_BATCH_METRICS = "/monitoring-batch-metrics"
246
- MONITORING_DRIFT_STATUS = "/monitoring-drift-status"
247
-
248
-
249
233
  class MonitoringFunctionNames(MonitoringStrEnum):
250
234
  STREAM = "model-monitoring-stream"
251
235
  APPLICATION_CONTROLLER = "model-monitoring-controller"
@@ -381,3 +365,6 @@ class SpecialApps:
381
365
 
382
366
 
383
367
  _RESERVED_FUNCTION_NAMES = MonitoringFunctionNames.list() + [SpecialApps.MLRUN_INFRA]
368
+
369
+
370
+ V3IO_MODEL_MONITORING_DB = "v3io"
@@ -55,7 +55,7 @@ class Notification(pydantic.BaseModel):
55
55
  message: str
56
56
  severity: NotificationSeverity
57
57
  when: list[str]
58
- condition: str
58
+ condition: str = None
59
59
  params: dict[str, typing.Any] = None
60
60
  status: NotificationStatus = None
61
61
  sent_time: typing.Union[str, datetime.datetime] = None
mlrun/common/types.py CHANGED
@@ -30,6 +30,7 @@ class HTTPMethod(StrEnum):
30
30
  GET = "GET"
31
31
  POST = "POST"
32
32
  DELETE = "DELETE"
33
+ PATCH = "PATCH"
33
34
 
34
35
 
35
36
  class Operation(StrEnum):
mlrun/config.py CHANGED
@@ -485,10 +485,10 @@ default_config = {
485
485
  # pip install <requirement_specifier>, e.g. mlrun==0.5.4, mlrun~=0.5,
486
486
  # git+https://github.com/mlrun/mlrun@development. by default uses the version
487
487
  "mlrun_version_specifier": "",
488
- "kaniko_image": "gcr.io/kaniko-project/executor:v1.21.1", # kaniko builder image
488
+ "kaniko_image": "gcr.io/kaniko-project/executor:v1.23.2", # kaniko builder image
489
489
  "kaniko_init_container_image": "alpine:3.18",
490
490
  # image for kaniko init container when docker registry is ECR
491
- "kaniko_aws_cli_image": "amazon/aws-cli:2.7.10",
491
+ "kaniko_aws_cli_image": "amazon/aws-cli:2.17.16",
492
492
  # kaniko sometimes fails to get filesystem from image, this is a workaround to retry the process
493
493
  # a known issue in Kaniko - https://github.com/GoogleContainerTools/kaniko/issues/1717
494
494
  "kaniko_image_fs_extraction_retries": "3",
@@ -1237,12 +1237,11 @@ class Config:
1237
1237
 
1238
1238
  return storage_options
1239
1239
 
1240
- def is_explicit_ack(self, version=None) -> bool:
1241
- if not version:
1242
- version = self.nuclio_version
1240
+ def is_explicit_ack_enabled(self) -> bool:
1243
1241
  return self.httpdb.nuclio.explicit_ack == "enabled" and (
1244
- not version
1245
- or semver.VersionInfo.parse(version) >= semver.VersionInfo.parse("1.12.10")
1242
+ not self.nuclio_version
1243
+ or semver.VersionInfo.parse(self.nuclio_version)
1244
+ >= semver.VersionInfo.parse("1.12.10")
1246
1245
  )
1247
1246
 
1248
1247
 
@@ -85,7 +85,8 @@ class BaseSourceDriver(DataSource):
85
85
  )
86
86
 
87
87
  explicit_ack = (
88
- is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
88
+ is_explicit_ack_supported(context)
89
+ and mlrun.mlconf.is_explicit_ack_enabled()
89
90
  )
90
91
  return storey.SyncEmitSource(
91
92
  context=context,
@@ -944,7 +945,8 @@ class OnlineSource(BaseSourceDriver):
944
945
 
945
946
  source_args = self.attributes.get("source_args", {})
946
947
  explicit_ack = (
947
- is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
948
+ is_explicit_ack_supported(context)
949
+ and mlrun.mlconf.is_explicit_ack_enabled()
948
950
  )
949
951
  # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
950
952
  src_class = storey.SyncEmitSource(
@@ -1029,7 +1031,8 @@ class StreamSource(OnlineSource):
1029
1031
  engine = "async"
1030
1032
  if hasattr(function.spec, "graph") and function.spec.graph.engine:
1031
1033
  engine = function.spec.graph.engine
1032
- if mlrun.mlconf.is_explicit_ack() and engine == "async":
1034
+
1035
+ if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
1033
1036
  kwargs["explicit_ack_mode"] = "explicitOnly"
1034
1037
  kwargs["worker_allocation_mode"] = "static"
1035
1038
 
@@ -1116,7 +1119,8 @@ class KafkaSource(OnlineSource):
1116
1119
  engine = "async"
1117
1120
  if hasattr(function.spec, "graph") and function.spec.graph.engine:
1118
1121
  engine = function.spec.graph.engine
1119
- if mlrun.mlconf.is_explicit_ack() and engine == "async":
1122
+
1123
+ if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
1120
1124
  explicit_ack_mode = "explicitOnly"
1121
1125
  extra_attributes["workerAllocationMode"] = extra_attributes.get(
1122
1126
  "worker_allocation_mode", "static"
mlrun/db/base.py CHANGED
@@ -242,9 +242,8 @@ class RunDBInterface(ABC):
242
242
  )
243
243
  artifact_identifiers.append(
244
244
  mlrun.common.schemas.ArtifactIdentifier(
245
- key=mlrun.utils.get_in_artifact(artifact_obj, "key"),
246
- # we are passing tree as uid when storing an artifact, so if uid is not defined,
247
- # pass the tree as uid
245
+ # we pass the db_key and not the key so the API will be able to find the artifact in the db
246
+ key=mlrun.utils.get_in_artifact(artifact_obj, "db_key"),
248
247
  uid=mlrun.utils.get_in_artifact(artifact_obj, "uid"),
249
248
  producer_id=mlrun.utils.get_in_artifact(artifact_obj, "tree"),
250
249
  kind=mlrun.utils.get_in_artifact(artifact_obj, "kind"),
mlrun/db/httpdb.py CHANGED
@@ -1015,7 +1015,7 @@ class HTTPRunDB(RunDBInterface):
1015
1015
  "format": format_,
1016
1016
  "tag": tag,
1017
1017
  "tree": tree,
1018
- "object_uid": uid,
1018
+ "object-uid": uid,
1019
1019
  }
1020
1020
  if iter is not None:
1021
1021
  params["iter"] = str(iter)
@@ -1051,7 +1051,7 @@ class HTTPRunDB(RunDBInterface):
1051
1051
  "key": key,
1052
1052
  "tag": tag,
1053
1053
  "tree": tree,
1054
- "object_uid": uid,
1054
+ "object-uid": uid,
1055
1055
  "iter": iter,
1056
1056
  "deletion_strategy": deletion_strategy,
1057
1057
  }
@@ -3380,7 +3380,7 @@ class HTTPRunDB(RunDBInterface):
3380
3380
  By default, the image is mlrun/mlrun.
3381
3381
  """
3382
3382
  self.api_call(
3383
- method=mlrun.common.types.HTTPMethod.POST,
3383
+ method=mlrun.common.types.HTTPMethod.PATCH,
3384
3384
  path=f"projects/{project}/model-monitoring/model-monitoring-controller",
3385
3385
  params={
3386
3386
  "base_period": base_period,
mlrun/model.py CHANGED
@@ -754,7 +754,7 @@ class Notification(ModelObj):
754
754
  "Both 'secret_params' and 'params' are empty, at least one must be defined."
755
755
  )
756
756
 
757
- notification_class.validate_params(secret_params or params)
757
+ notification_class.validate_params(secret_params | params)
758
758
 
759
759
  @staticmethod
760
760
  def validate_notification_uniqueness(notifications: list["Notification"]):
@@ -23,7 +23,7 @@ import mlrun.model_monitoring.applications.base as mm_base
23
23
  import mlrun.model_monitoring.applications.context as mm_context
24
24
  from mlrun.errors import MLRunIncompatibleVersionError
25
25
 
26
- SUPPORTED_EVIDENTLY_VERSION = semver.Version.parse("0.4.11")
26
+ SUPPORTED_EVIDENTLY_VERSION = semver.Version.parse("0.4.32")
27
27
 
28
28
 
29
29
  def _check_evidently_version(*, cur: semver.Version, ref: semver.Version) -> None:
@@ -57,12 +57,11 @@ except ModuleNotFoundError:
57
57
 
58
58
 
59
59
  if _HAS_EVIDENTLY:
60
- from evidently.renderers.notebook_utils import determine_template
61
60
  from evidently.report.report import Report
62
61
  from evidently.suite.base_suite import Suite
63
62
  from evidently.ui.type_aliases import STR_UUID
64
63
  from evidently.ui.workspace import Workspace
65
- from evidently.utils.dashboard import TemplateParams
64
+ from evidently.utils.dashboard import TemplateParams, file_html_template
66
65
 
67
66
 
68
67
  class EvidentlyModelMonitoringApplicationBase(mm_base.ModelMonitoringApplicationBase):
@@ -123,7 +122,7 @@ class EvidentlyModelMonitoringApplicationBase(mm_base.ModelMonitoringApplication
123
122
  additional_graphs={},
124
123
  )
125
124
 
126
- dashboard_html = self._render(determine_template("inline"), template_params)
125
+ dashboard_html = self._render(file_html_template, template_params)
127
126
  self.context.log_artifact(
128
127
  artifact_name, body=dashboard_html.encode("utf-8"), format="html"
129
128
  )
@@ -201,7 +200,7 @@ class EvidentlyModelMonitoringApplicationBaseV2(
201
200
  additional_graphs={},
202
201
  )
203
202
 
204
- dashboard_html = self._render(determine_template("inline"), template_params)
203
+ dashboard_html = self._render(file_html_template, template_params)
205
204
  monitoring_context.log_artifact(
206
205
  artifact_name, body=dashboard_html.encode("utf-8"), format="html"
207
206
  )
@@ -177,6 +177,11 @@ class SQLStoreBase(StoreBase):
177
177
  param table: SQLAlchemy declarative table.
178
178
  :param criteria: A list of binary expressions that filter the query.
179
179
  """
180
+ if not self._engine.has_table(table.__tablename__):
181
+ logger.debug(
182
+ f"Table {table.__tablename__} does not exist in the database. Skipping deletion."
183
+ )
184
+ return
180
185
  with create_session(dsn=self._sql_connection_string) as session:
181
186
  # Generate and commit the delete query
182
187
  session.query(
@@ -408,14 +408,14 @@ class KVStoreBase(StoreBase):
408
408
 
409
409
  """
410
410
  try:
411
- data = self.client.kv.get(
411
+ response = self.client.kv.get(
412
412
  container=self._get_monitoring_schedules_container(
413
413
  project_name=self.project
414
414
  ),
415
415
  table_path=endpoint_id,
416
416
  key=application_name,
417
417
  )
418
- return data.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
418
+ return response.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
419
419
  except v3io.dataplane.response.HttpResponseError as err:
420
420
  logger.debug("Error while getting last analyzed time", err=err)
421
421
  raise mlrun.errors.MLRunNotFoundError(
@@ -27,7 +27,7 @@ from mlrun.utils import logger
27
27
  class TSDBConnector(ABC):
28
28
  type: typing.ClassVar[str]
29
29
 
30
- def __init__(self, project: str):
30
+ def __init__(self, project: str) -> None:
31
31
  """
32
32
  Initialize a new TSDB connector. The connector is used to interact with the TSDB and store monitoring data.
33
33
  At the moment we have 3 different types of monitoring data:
@@ -42,10 +42,10 @@ class TSDBConnector(ABC):
42
42
  writer.
43
43
 
44
44
  :param project: the name of the project.
45
-
46
45
  """
47
46
  self.project = project
48
47
 
48
+ @abstractmethod
49
49
  def apply_monitoring_stream_steps(self, graph):
50
50
  """
51
51
  Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
@@ -58,6 +58,7 @@ class TSDBConnector(ABC):
58
58
  """
59
59
  pass
60
60
 
61
+ @abstractmethod
61
62
  def write_application_event(
62
63
  self,
63
64
  event: dict,
@@ -69,13 +70,14 @@ class TSDBConnector(ABC):
69
70
  :raise mlrun.errors.MLRunRuntimeError: If an error occurred while writing the event.
70
71
  """
71
72
 
73
+ @abstractmethod
72
74
  def delete_tsdb_resources(self):
73
75
  """
74
76
  Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
75
77
  """
76
-
77
78
  pass
78
79
 
80
+ @abstractmethod
79
81
  def get_model_endpoint_real_time_metrics(
80
82
  self,
81
83
  endpoint_id: str,
@@ -102,6 +104,7 @@ class TSDBConnector(ABC):
102
104
  """
103
105
  pass
104
106
 
107
+ @abstractmethod
105
108
  def create_tables(self) -> None:
106
109
  """
107
110
  Create the TSDB tables using the TSDB connector. At the moment we support 3 types of tables:
@@ -11,7 +11,6 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- #
15
14
 
16
15
  import json
17
16
 
@@ -21,8 +20,6 @@ from mlrun.common.schemas.model_monitoring import (
21
20
  EventKeyMetrics,
22
21
  )
23
22
 
24
- _TABLE_COLUMN = "table_column"
25
-
26
23
 
27
24
  class ProcessBeforeTDEngine(mlrun.feature_store.steps.MapClass):
28
25
  def __init__(self, **kwargs):
@@ -11,7 +11,8 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- #
14
+
15
+ from typing import Any
15
16
 
16
17
  import mlrun.feature_store.steps
17
18
  from mlrun.common.schemas.model_monitoring import (
@@ -21,6 +22,24 @@ from mlrun.common.schemas.model_monitoring import (
21
22
  )
22
23
 
23
24
 
25
+ def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
26
+ """
27
+ Normalize user defined keys - input data to a model and its predictions,
28
+ to a form V3IO frames tolerates.
29
+
30
+ The dictionary keys should conform to '^[a-zA-Z_:]([a-zA-Z0-9_:])*$'.
31
+ """
32
+ prefix = "_"
33
+
34
+ def norm_key(key: str) -> str:
35
+ key = key.replace("-", "_") # hyphens `-` are not allowed
36
+ if key and key[0].isdigit(): # starting with a digit is not allowed
37
+ return prefix + key
38
+ return key
39
+
40
+ return {norm_key(k): v for k, v in event.items()}
41
+
42
+
24
43
  class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
25
44
  def __init__(self, **kwargs):
26
45
  """
@@ -68,8 +87,8 @@ class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
68
87
  # endpoint_features includes the event values of each feature and prediction
69
88
  endpoint_features = {
70
89
  EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
71
- **event[EventFieldType.NAMED_PREDICTIONS],
72
- **event[EventFieldType.NAMED_FEATURES],
90
+ **_normalize_dict_for_v3io_frames(event[EventFieldType.NAMED_PREDICTIONS]),
91
+ **_normalize_dict_for_v3io_frames(event[EventFieldType.NAMED_FEATURES]),
73
92
  **base_event,
74
93
  }
75
94
  # Create a dictionary that includes both base_metrics and endpoint_features
@@ -27,7 +27,6 @@ import mlrun.datastore.targets
27
27
  import mlrun.feature_store as fstore
28
28
  import mlrun.feature_store.steps
29
29
  import mlrun.model_monitoring.db
30
- import mlrun.model_monitoring.prometheus
31
30
  import mlrun.serving.states
32
31
  import mlrun.utils
33
32
  from mlrun.common.schemas.model_monitoring.constants import (
@@ -37,7 +36,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
37
36
  FileTargetKind,
38
37
  ModelEndpointTarget,
39
38
  ProjectSecretKeys,
40
- PrometheusEndpoints,
41
39
  )
42
40
  from mlrun.utils import logger
43
41
 
@@ -172,39 +170,12 @@ class EventStreamProcessor:
172
170
  fn.set_topology(mlrun.serving.states.StepKinds.flow),
173
171
  )
174
172
 
175
- # Event routing based on the provided path
176
- def apply_event_routing():
177
- typing.cast(
178
- mlrun.serving.TaskStep,
179
- graph.add_step(
180
- "EventRouting",
181
- full_event=True,
182
- project=self.project,
183
- ),
184
- ).respond()
185
-
186
- apply_event_routing()
187
-
188
- # Filter out events with '-' in the path basename from going forward
189
- # through the next steps of the stream graph
190
- def apply_storey_filter_stream_events():
191
- # Filter events with Prometheus endpoints path
192
- graph.add_step(
193
- "storey.Filter",
194
- "filter_stream_event",
195
- _fn=f"(event.path not in {PrometheusEndpoints.list()})",
196
- full_event=True,
197
- )
198
-
199
- apply_storey_filter_stream_events()
200
-
201
173
  # Process endpoint event: splitting into sub-events and validate event data
202
174
  def apply_process_endpoint_event():
203
175
  graph.add_step(
204
176
  "ProcessEndpointEvent",
205
177
  full_event=True,
206
178
  project=self.project,
207
- after="filter_stream_event",
208
179
  )
209
180
 
210
181
  apply_process_endpoint_event()
@@ -324,33 +295,10 @@ class EventStreamProcessor:
324
295
 
325
296
  apply_storey_sample_window()
326
297
 
327
- # TSDB branch (skip to Prometheus if in CE env)
328
- if not mlrun.mlconf.is_ce_mode():
329
- tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
330
- project=self.project, secret_provider=secret_provider
331
- )
332
- tsdb_connector.apply_monitoring_stream_steps(graph=graph)
333
-
334
- else:
335
- # Prometheus
336
- # Increase the prediction counter by 1 and update the latency value
337
- graph.add_step(
338
- "IncCounter",
339
- name="IncCounter",
340
- after="MapFeatureNames",
341
- project=self.project,
342
- )
343
-
344
- # Record a sample of features and labels
345
- def apply_record_features_to_prometheus():
346
- graph.add_step(
347
- "RecordFeatures",
348
- name="RecordFeaturesToPrometheus",
349
- after="sample",
350
- project=self.project,
351
- )
352
-
353
- apply_record_features_to_prometheus()
298
+ tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
299
+ project=self.project, secret_provider=secret_provider
300
+ )
301
+ tsdb_connector.apply_monitoring_stream_steps(graph=graph)
354
302
 
355
303
  # Parquet branch
356
304
  # Filter and validate different keys before writing the data to Parquet target
@@ -542,11 +490,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
542
490
  error = event.get("error")
543
491
  if error:
544
492
  self.error_count[endpoint_id] += 1
545
- mlrun.model_monitoring.prometheus.write_errors(
546
- project=self.project,
547
- endpoint_id=event["endpoint_id"],
548
- model_name=event["model"],
549
- )
493
+ # TODO: write to tsdb / kv once in a while
550
494
  raise mlrun.errors.MLRunInvalidArgumentError(str(error))
551
495
 
552
496
  # Validate event fields
@@ -973,98 +917,6 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
973
917
  return event
974
918
 
975
919
 
976
- class EventRouting(mlrun.feature_store.steps.MapClass):
977
- """
978
- Router the event according to the configured path under event.path. Please note that this step returns the result
979
- to the caller. At the moment there are several paths:
980
-
981
- - /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
982
- to scrape the results from the monitoring stream memory.
983
-
984
- - /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
985
- statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
986
- metrics.
987
-
988
- - /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
989
-
990
- """
991
-
992
- def __init__(
993
- self,
994
- project: str,
995
- **kwargs,
996
- ):
997
- super().__init__(**kwargs)
998
- self.project: str = project
999
-
1000
- def do(self, event):
1001
- if event.path == PrometheusEndpoints.MODEL_MONITORING_METRICS:
1002
- # Return a parsed Prometheus registry file
1003
- event.body = mlrun.model_monitoring.prometheus.get_registry()
1004
- elif event.path == PrometheusEndpoints.MONITORING_BATCH_METRICS:
1005
- # Update statistical metrics
1006
- for event_metric in event.body:
1007
- mlrun.model_monitoring.prometheus.write_drift_metrics(
1008
- project=self.project,
1009
- endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
1010
- metric=event_metric[EventFieldType.METRIC],
1011
- value=event_metric[EventFieldType.VALUE],
1012
- )
1013
- elif event.path == PrometheusEndpoints.MONITORING_DRIFT_STATUS:
1014
- # Update drift status
1015
- mlrun.model_monitoring.prometheus.write_drift_status(
1016
- project=self.project,
1017
- endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
1018
- drift_status=event.body[EventFieldType.DRIFT_STATUS],
1019
- )
1020
-
1021
- return event
1022
-
1023
-
1024
- class IncCounter(mlrun.feature_store.steps.MapClass):
1025
- """Increase prediction counter by 1 and update the total latency value"""
1026
-
1027
- def __init__(self, project: str, **kwargs):
1028
- super().__init__(**kwargs)
1029
- self.project: str = project
1030
-
1031
- def do(self, event):
1032
- # Compute prediction per second
1033
-
1034
- mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
1035
- project=self.project,
1036
- endpoint_id=event[EventFieldType.ENDPOINT_ID],
1037
- latency=event[EventFieldType.LATENCY],
1038
- model_name=event[EventFieldType.MODEL],
1039
- endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
1040
- )
1041
-
1042
- return event
1043
-
1044
-
1045
- class RecordFeatures(mlrun.feature_store.steps.MapClass):
1046
- """Record a sample of features and labels in Prometheus registry"""
1047
-
1048
- def __init__(self, project: str, **kwargs):
1049
- super().__init__(**kwargs)
1050
- self.project: str = project
1051
-
1052
- def do(self, event):
1053
- # Generate a dictionary of features and predictions
1054
- features = {
1055
- **event[EventFieldType.NAMED_PREDICTIONS],
1056
- **event[EventFieldType.NAMED_FEATURES],
1057
- }
1058
-
1059
- mlrun.model_monitoring.prometheus.write_income_features(
1060
- project=self.project,
1061
- endpoint_id=event[EventFieldType.ENDPOINT_ID],
1062
- features=features,
1063
- )
1064
-
1065
- return event
1066
-
1067
-
1068
920
  def update_endpoint_record(
1069
921
  project: str,
1070
922
  endpoint_id: str,