mlrun 1.7.0rc15__py3-none-any.whl → 1.7.0rc17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic (further details were linked from the original page).

Files changed (77)
  1. mlrun/__init__.py +10 -1
  2. mlrun/__main__.py +18 -4
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/artifacts/__init__.py +7 -1
  6. mlrun/artifacts/base.py +28 -3
  7. mlrun/artifacts/dataset.py +8 -0
  8. mlrun/artifacts/manager.py +18 -0
  9. mlrun/artifacts/model.py +8 -1
  10. mlrun/artifacts/plots.py +13 -0
  11. mlrun/common/schemas/__init__.py +10 -2
  12. mlrun/common/schemas/alert.py +64 -5
  13. mlrun/common/schemas/api_gateway.py +4 -0
  14. mlrun/common/schemas/artifact.py +15 -0
  15. mlrun/common/schemas/auth.py +2 -0
  16. mlrun/common/schemas/model_monitoring/__init__.py +4 -1
  17. mlrun/common/schemas/model_monitoring/constants.py +17 -1
  18. mlrun/common/schemas/model_monitoring/model_endpoints.py +60 -1
  19. mlrun/common/schemas/project.py +5 -1
  20. mlrun/config.py +11 -4
  21. mlrun/datastore/datastore_profile.py +10 -7
  22. mlrun/db/base.py +24 -4
  23. mlrun/db/httpdb.py +97 -43
  24. mlrun/db/nopdb.py +25 -4
  25. mlrun/errors.py +5 -0
  26. mlrun/launcher/base.py +3 -2
  27. mlrun/lists.py +4 -0
  28. mlrun/model.py +15 -8
  29. mlrun/model_monitoring/__init__.py +1 -1
  30. mlrun/model_monitoring/applications/_application_steps.py +1 -2
  31. mlrun/model_monitoring/applications/context.py +1 -1
  32. mlrun/model_monitoring/applications/histogram_data_drift.py +64 -38
  33. mlrun/model_monitoring/db/__init__.py +2 -0
  34. mlrun/model_monitoring/db/stores/base/store.py +9 -36
  35. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +63 -110
  36. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +56 -202
  37. mlrun/model_monitoring/db/tsdb/__init__.py +71 -0
  38. mlrun/model_monitoring/db/tsdb/base.py +135 -0
  39. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  40. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  41. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +442 -0
  42. mlrun/model_monitoring/db/v3io_tsdb_reader.py +134 -0
  43. mlrun/model_monitoring/stream_processing.py +46 -210
  44. mlrun/model_monitoring/writer.py +50 -100
  45. mlrun/platforms/__init__.py +10 -9
  46. mlrun/platforms/iguazio.py +19 -200
  47. mlrun/projects/operations.py +11 -7
  48. mlrun/projects/pipelines.py +13 -76
  49. mlrun/projects/project.py +62 -17
  50. mlrun/render.py +9 -3
  51. mlrun/run.py +5 -38
  52. mlrun/runtimes/__init__.py +1 -0
  53. mlrun/runtimes/base.py +3 -3
  54. mlrun/runtimes/kubejob.py +2 -1
  55. mlrun/runtimes/nuclio/api_gateway.py +163 -77
  56. mlrun/runtimes/nuclio/application/application.py +160 -7
  57. mlrun/runtimes/nuclio/function.py +25 -45
  58. mlrun/runtimes/pod.py +16 -36
  59. mlrun/runtimes/remotesparkjob.py +1 -1
  60. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  61. mlrun/runtimes/utils.py +0 -38
  62. mlrun/track/tracker.py +2 -1
  63. mlrun/utils/helpers.py +51 -31
  64. mlrun/utils/logger.py +11 -6
  65. mlrun/utils/notifications/notification/base.py +1 -1
  66. mlrun/utils/notifications/notification/slack.py +9 -4
  67. mlrun/utils/notifications/notification/webhook.py +1 -1
  68. mlrun/utils/notifications/notification_pusher.py +21 -14
  69. mlrun/utils/version/version.json +2 -2
  70. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/METADATA +4 -3
  71. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/RECORD +75 -69
  72. mlrun/kfpops.py +0 -860
  73. mlrun/platforms/other.py +0 -305
  74. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/LICENSE +0 -0
  75. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/WHEEL +0 -0
  76. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/entry_points.txt +0 -0
  77. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/top_level.txt +0 -0
@@ -19,18 +19,17 @@ from http import HTTPStatus
19
19
 
20
20
  import v3io.dataplane
21
21
  import v3io.dataplane.response
22
- import v3io_frames
23
22
 
24
23
  import mlrun.common.model_monitoring.helpers
25
- import mlrun.common.schemas.model_monitoring
24
+ import mlrun.common.schemas.model_monitoring as mm_constants
26
25
  import mlrun.model_monitoring.db
27
26
  import mlrun.utils.v3io_clients
28
27
  from mlrun.utils import logger
29
28
 
30
29
  # Fields to encode before storing in the KV table or to decode after retrieving
31
30
  fields_to_encode_decode = [
32
- mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS,
33
- mlrun.common.schemas.model_monitoring.EventFieldType.CURRENT_STATS,
31
+ mm_constants.EventFieldType.FEATURE_STATS,
32
+ mm_constants.EventFieldType.CURRENT_STATS,
34
33
  ]
35
34
 
36
35
 
@@ -65,7 +64,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
65
64
  self.client.kv.put(
66
65
  container=self.container,
67
66
  table_path=self.path,
68
- key=endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID],
67
+ key=endpoint[mm_constants.EventFieldType.UID],
69
68
  attributes=endpoint,
70
69
  )
71
70
 
@@ -218,17 +217,11 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
218
217
  if uids is None:
219
218
  uids = []
220
219
  for item in items:
221
- if mlrun.common.schemas.model_monitoring.EventFieldType.UID not in item:
220
+ if mm_constants.EventFieldType.UID not in item:
222
221
  # This is kept for backwards compatibility - in old versions the key column named endpoint_id
223
- uids.append(
224
- item[
225
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
226
- ]
227
- )
222
+ uids.append(item[mm_constants.EventFieldType.ENDPOINT_ID])
228
223
  else:
229
- uids.append(
230
- item[mlrun.common.schemas.model_monitoring.EventFieldType.UID]
231
- )
224
+ uids.append(item[mm_constants.EventFieldType.UID])
232
225
 
233
226
  # Add each relevant model endpoint to the model endpoints list
234
227
  for endpoint_id in uids:
@@ -239,27 +232,20 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
239
232
 
240
233
  return endpoint_list
241
234
 
242
- def delete_model_endpoints_resources(self, endpoints: list[dict[str, typing.Any]]):
235
+ def delete_model_endpoints_resources(self):
243
236
  """
244
- Delete all model endpoints resources in both KV and the time series DB.
245
-
246
- :param endpoints: A list of model endpoints flattened dictionaries.
237
+ Delete all model endpoints resources in V3IO KV.
247
238
  """
248
239
 
240
+ endpoints = self.list_model_endpoints()
241
+
249
242
  # Delete model endpoint record from KV table
250
243
  for endpoint_dict in endpoints:
251
- if (
252
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
253
- not in endpoint_dict
254
- ):
244
+ if mm_constants.EventFieldType.UID not in endpoint_dict:
255
245
  # This is kept for backwards compatibility - in old versions the key column named endpoint_id
256
- endpoint_id = endpoint_dict[
257
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
258
- ]
246
+ endpoint_id = endpoint_dict[mm_constants.EventFieldType.ENDPOINT_ID]
259
247
  else:
260
- endpoint_id = endpoint_dict[
261
- mlrun.common.schemas.model_monitoring.EventFieldType.UID
262
- ]
248
+ endpoint_id = endpoint_dict[mm_constants.EventFieldType.UID]
263
249
  self.delete_model_endpoint(
264
250
  endpoint_id,
265
251
  )
@@ -282,135 +268,26 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
282
268
  raise_for_status=v3io.dataplane.RaiseForStatus.never,
283
269
  )
284
270
 
285
- # Cleanup TSDB
286
- frames = self._get_frames_client()
287
-
288
- # Generate the required tsdb paths
289
- tsdb_path, filtered_path = self._generate_tsdb_paths()
290
-
291
- # Delete time series DB resources
292
- try:
293
- frames.delete(
294
- backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
295
- table=filtered_path,
296
- )
297
- except v3io_frames.errors.DeleteError as e:
298
- if "No TSDB schema file found" not in str(e):
299
- logger.warning(
300
- f"Failed to delete TSDB table '{filtered_path}'",
301
- err=mlrun.errors.err_to_str(e),
302
- )
303
- # Final cleanup of tsdb path
304
- tsdb_path.replace("://u", ":///u")
305
- store, _, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
306
- store.rm(tsdb_path, recursive=True)
307
-
308
- def get_endpoint_real_time_metrics(
271
+ def write_application_event(
309
272
  self,
310
- endpoint_id: str,
311
- metrics: list[str],
312
- start: str = "now-1h",
313
- end: str = "now",
314
- access_key: str = None,
315
- ) -> dict[str, list[tuple[str, float]]]:
316
- """
317
- Getting metrics from the time series DB. There are pre-defined metrics for model endpoints such as
318
- `predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user.
319
-
320
- :param endpoint_id: The unique id of the model endpoint.
321
- :param metrics: A list of real-time metrics to return for the model endpoint.
322
- :param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
323
- time, a Unix timestamp in milliseconds, a relative time (`'now'` or
324
- `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
325
- earliest time.
326
- :param end: The end time of the metrics. Can be represented by a string containing an RFC 3339
327
- time, a Unix timestamp in milliseconds, a relative time (`'now'` or
328
- `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
329
- earliest time.
330
- :param access_key: V3IO access key that will be used for generating Frames client object. If not
331
- provided, the access key will be retrieved from the environment variables.
332
-
333
- :return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
334
- includes timestamps and the values.
335
- """
336
-
337
- # Initialize access key
338
- access_key = access_key or mlrun.mlconf.get_v3io_access_key()
339
-
340
- if not metrics:
341
- raise mlrun.errors.MLRunInvalidArgumentError(
342
- "Metric names must be provided"
343
- )
344
-
345
- # Initialize metrics mapping dictionary
346
- metrics_mapping = {}
347
-
348
- # Getting the path for the time series DB
349
- events_path = (
350
- mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
351
- project=self.project,
352
- kind=mlrun.common.schemas.ModelMonitoringStoreKinds.EVENTS,
353
- )
354
- )
355
- (
356
- _,
357
- container,
358
- events_path,
359
- ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
360
- events_path
361
- )
362
-
363
- # Retrieve the raw data from the time series DB based on the provided metrics and time ranges
364
- frames_client = mlrun.utils.v3io_clients.get_frames_client(
365
- token=access_key,
366
- address=mlrun.mlconf.v3io_framesd,
367
- container=container,
368
- )
369
-
370
- try:
371
- data = frames_client.read(
372
- backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
373
- table=events_path,
374
- columns=["endpoint_id", *metrics],
375
- filter=f"endpoint_id=='{endpoint_id}'",
376
- start=start,
377
- end=end,
378
- )
379
-
380
- # Fill the metrics mapping dictionary with the metric name and values
381
- data_dict = data.to_dict()
382
- for metric in metrics:
383
- metric_data = data_dict.get(metric)
384
- if metric_data is None:
385
- continue
386
-
387
- values = [
388
- (str(timestamp), value) for timestamp, value in metric_data.items()
389
- ]
390
- metrics_mapping[metric] = values
391
-
392
- except v3io_frames.errors.ReadError:
393
- logger.warn("Failed to read tsdb", endpoint=endpoint_id)
394
-
395
- return metrics_mapping
396
-
397
- def write_application_result(self, event: dict[str, typing.Any]):
273
+ event: dict[str, typing.Any],
274
+ kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
275
+ ):
398
276
  """
399
- Write a new application result event in the target table.
277
+ Write a new application event in the target table.
400
278
 
401
279
  :param event: An event dictionary that represents the application result, should be corresponded to the
402
280
  schema defined in the :py:class:`~mlrun.common.schemas.model_monitoring.constants.WriterEvent`
403
281
  object.
282
+ :param kind: The type of the event, can be either "result" or "metric".
404
283
  """
405
- endpoint_id = event.pop(
406
- mlrun.common.schemas.model_monitoring.WriterEvent.ENDPOINT_ID
407
- )
408
- app_name = event.pop(
409
- mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME
410
- )
411
- metric_name = event.pop(
412
- mlrun.common.schemas.model_monitoring.ResultData.RESULT_NAME
413
- )
284
+ if kind == mm_constants.WriterEventKind.METRIC:
285
+ # TODO : Implement the logic for writing metrics to KV
286
+ return
287
+
288
+ endpoint_id = event.pop(mm_constants.WriterEvent.ENDPOINT_ID)
289
+ app_name = event.pop(mm_constants.WriterEvent.APPLICATION_NAME)
290
+ metric_name = event.pop(mm_constants.ResultData.RESULT_NAME)
414
291
  attributes = {metric_name: json.dumps(event)}
415
292
 
416
293
  v3io_monitoring_apps_container = self.get_v3io_monitoring_apps_container(
@@ -445,7 +322,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
445
322
  """Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
446
323
  fields = [
447
324
  {
448
- "name": mlrun.common.schemas.model_monitoring.ResultData.RESULT_NAME,
325
+ "name": mm_constants.ResultData.RESULT_NAME,
449
326
  "type": "string",
450
327
  "nullable": False,
451
328
  }
@@ -453,7 +330,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
453
330
  res = self.client.kv.create_schema(
454
331
  container=v3io_monitoring_apps_container,
455
332
  table_path=endpoint_id,
456
- key=mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME,
333
+ key=mm_constants.WriterEvent.APPLICATION_NAME,
457
334
  fields=fields,
458
335
  )
459
336
  if res.status_code != HTTPStatus.OK:
@@ -484,9 +361,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
484
361
  table_path=endpoint_id,
485
362
  key=application_name,
486
363
  )
487
- return data.output.item[
488
- mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED
489
- ]
364
+ return data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
490
365
  except v3io.dataplane.response.HttpResponseError as err:
491
366
  logger.debug("Error while getting last analyzed time", err=err)
492
367
  raise mlrun.errors.MLRunNotFoundError(
@@ -511,9 +386,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
511
386
  ),
512
387
  table_path=endpoint_id,
513
388
  key=application_name,
514
- attributes={
515
- mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED: last_analyzed
516
- },
389
+ attributes={mm_constants.SchedulingKeys.LAST_ANALYZED: last_analyzed},
517
390
  )
518
391
 
519
392
  def _generate_tsdb_paths(self) -> tuple[str, str]:
@@ -622,8 +495,8 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
622
495
  # Apply top_level filter (remove endpoints that considered a child of a router)
623
496
  if top_level:
624
497
  filter_expression.append(
625
- f"(endpoint_type=='{str(mlrun.common.schemas.model_monitoring.EndpointType.NODE_EP.value)}' "
626
- f"OR endpoint_type=='{str(mlrun.common.schemas.model_monitoring.EndpointType.ROUTER.value)}')"
498
+ f"(endpoint_type=='{str(mm_constants.EndpointType.NODE_EP.value)}' "
499
+ f"OR endpoint_type=='{str(mm_constants.EndpointType.ROUTER.value)}')"
627
500
  )
628
501
 
629
502
  return " AND ".join(filter_expression)
@@ -643,41 +516,31 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
643
516
  # Validate default value for `error_count`
644
517
  # For backwards compatibility reasons, we validate that the model endpoint includes the `error_count` key
645
518
  if (
646
- mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT in endpoint
647
- and endpoint[
648
- mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
649
- ]
650
- == "null"
519
+ mm_constants.EventFieldType.ERROR_COUNT in endpoint
520
+ and endpoint[mm_constants.EventFieldType.ERROR_COUNT] == "null"
651
521
  ):
652
- endpoint[
653
- mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
654
- ] = "0"
522
+ endpoint[mm_constants.EventFieldType.ERROR_COUNT] = "0"
655
523
 
656
524
  # Validate default value for `metrics`
657
525
  # For backwards compatibility reasons, we validate that the model endpoint includes the `metrics` key
658
526
  if (
659
- mlrun.common.schemas.model_monitoring.EventFieldType.METRICS in endpoint
660
- and endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.METRICS]
661
- == "null"
527
+ mm_constants.EventFieldType.METRICS in endpoint
528
+ and endpoint[mm_constants.EventFieldType.METRICS] == "null"
662
529
  ):
663
- endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.METRICS] = (
664
- json.dumps(
665
- {
666
- mlrun.common.schemas.model_monitoring.EventKeyMetrics.GENERIC: {
667
- mlrun.common.schemas.model_monitoring.EventLiveStats.LATENCY_AVG_1H: 0,
668
- mlrun.common.schemas.model_monitoring.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
669
- }
530
+ endpoint[mm_constants.EventFieldType.METRICS] = json.dumps(
531
+ {
532
+ mm_constants.EventKeyMetrics.GENERIC: {
533
+ mm_constants.EventLiveStats.LATENCY_AVG_1H: 0,
534
+ mm_constants.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
670
535
  }
671
- )
536
+ }
672
537
  )
673
538
  # Validate key `uid` instead of `endpoint_id`
674
539
  # For backwards compatibility reasons, we replace the `endpoint_id` with `uid` which is the updated key name
675
- if mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID in endpoint:
676
- endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID] = (
677
- endpoint[
678
- mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
679
- ]
680
- )
540
+ if mm_constants.EventFieldType.ENDPOINT_ID in endpoint:
541
+ endpoint[mm_constants.EventFieldType.UID] = endpoint[
542
+ mm_constants.EventFieldType.ENDPOINT_ID
543
+ ]
681
544
 
682
545
  @staticmethod
683
546
  def _encode_field(field: typing.Union[str, bytes]) -> bytes:
@@ -705,10 +568,8 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
705
568
 
706
569
  def _extract_metrics_from_items(
707
570
  self, app_items: list[dict[str, str]]
708
- ) -> list[mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric]:
709
- metrics: list[
710
- mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric
711
- ] = []
571
+ ) -> list[mm_constants.ModelEndpointMonitoringMetric]:
572
+ metrics: list[mm_constants.ModelEndpointMonitoringMetric] = []
712
573
  for app_item in app_items:
713
574
  # See https://www.iguazio.com/docs/latest-release/services/data-layer/reference/system-attributes/#sys-attr-__name
714
575
  app_name = app_item.pop("__name")
@@ -716,18 +577,13 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
716
577
  continue
717
578
  for result_name in app_item:
718
579
  metrics.append(
719
- mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric(
580
+ mm_constants.ModelEndpointMonitoringMetric(
720
581
  project=self.project,
721
582
  app=app_name,
722
- type=mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetricType.RESULT,
583
+ type=mm_constants.ModelEndpointMonitoringMetricType.RESULT,
723
584
  name=result_name,
724
- full_name=".".join(
725
- [
726
- self.project,
727
- app_name,
728
- mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetricType.RESULT,
729
- result_name,
730
- ]
585
+ full_name=mlrun.common.schemas.model_monitoring.model_endpoints._compose_full_name(
586
+ project=self.project, app=app_name, name=result_name
731
587
  ),
732
588
  )
733
589
  )
@@ -735,11 +591,9 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
735
591
 
736
592
  def get_model_endpoint_metrics(
737
593
  self, endpoint_id: str
738
- ) -> list[mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric]:
594
+ ) -> list[mm_constants.ModelEndpointMonitoringMetric]:
739
595
  """Get model monitoring results and metrics on the endpoint"""
740
- metrics: list[
741
- mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric
742
- ] = []
596
+ metrics: list[mm_constants.ModelEndpointMonitoringMetric] = []
743
597
  container = self.get_v3io_monitoring_apps_container(self.project)
744
598
  try:
745
599
  response = self.client.kv.scan(container=container, table_path=endpoint_id)
@@ -0,0 +1,71 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import enum
16
+ import typing
17
+
18
+ import mlrun.common.schemas.secret
19
+ import mlrun.errors
20
+
21
+ from .base import TSDBConnector
22
+
23
+
24
+ class ObjectTSDBFactory(enum.Enum):
25
+ """Enum class to handle the different TSDB connector type values for storing real time metrics"""
26
+
27
+ v3io_tsdb = "v3io-tsdb"
28
+
29
+ def to_tsdb_connector(self, project: str, **kwargs) -> TSDBConnector:
30
+ """
31
+ Return a TSDBConnector object based on the provided enum value.
32
+ :param project: The name of the project.
33
+ :return: `TSDBConnector` object.
34
+ """
35
+
36
+ if self == self.v3io_tsdb:
37
+ if mlrun.mlconf.is_ce_mode():
38
+ raise mlrun.errors.MLRunInvalidArgumentError(
39
+ f"{self.v3io_tsdb} is not supported in CE mode."
40
+ )
41
+
42
+ from .v3io.v3io_connector import V3IOTSDBConnector
43
+
44
+ return V3IOTSDBConnector(project=project, **kwargs)
45
+
46
+ @classmethod
47
+ def _missing_(cls, value: typing.Any):
48
+ """A lookup function to handle an invalid value.
49
+ :param value: Provided enum (invalid) value.
50
+ """
51
+ valid_values = list(cls.__members__.keys())
52
+ raise mlrun.errors.MLRunInvalidArgumentError(
53
+ f"{value} is not a valid tsdb, please choose a valid value: %{valid_values}."
54
+ )
55
+
56
+
57
+ def get_tsdb_connector(project: str, **kwargs) -> TSDBConnector:
58
+ """
59
+ Get the TSDB connector type based on mlrun.config.model_endpoint_monitoring.tsdb_connector_type.
60
+ :param project: The name of the project.
61
+ :return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
62
+ TSDB connector such as updating drift metrics or write application record result.
63
+ """
64
+
65
+ # Get store type value from ObjectTSDBFactory enum class
66
+ tsdb_connector_type = ObjectTSDBFactory(
67
+ mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type
68
+ )
69
+
70
+ # Convert into TSDB connector object
71
+ return tsdb_connector_type.to_tsdb_connector(project=project, **kwargs)
@@ -0,0 +1,135 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+
16
+
17
+ from abc import ABC
18
+
19
+ import pandas as pd
20
+
21
+ import mlrun.common.schemas.model_monitoring.constants as mm_constants
22
+
23
+
24
+ class TSDBConnector(ABC):
25
+ def __init__(self, project: str):
26
+ """
27
+ Initialize a new TSDB connector. The connector is used to interact with the TSDB and store monitoring data.
28
+ At the moment we have 3 different types of monitoring data:
29
+ - real time performance metrics: real time performance metrics that are being calculated by the model
30
+ monitoring stream pod.
31
+ Among these metrics are the base metrics (average latency and predictions over time), endpoint features
32
+ (data samples), and custom metrics (user-defined metrics).
33
+ - app_results: a detailed results that include status, kind, extra data, etc. These results are being calculated
34
+ through the monitoring applications and stored in the TSDB using the model monitoring writer.
35
+ - metrics: a basic key value that represents a numeric metric. Similar to the app_results, these metrics
36
+ are being calculated through the monitoring applications and stored in the TSDB using the model monitoring
37
+ writer.
38
+
39
+ :param project: the name of the project.
40
+
41
+ """
42
+ self.project = project
43
+
44
+ def apply_monitoring_stream_steps(self, graph):
45
+ """
46
+ Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
47
+ different key metric dictionaries. This data is being used by the monitoring dashboards in
48
+ grafana.
49
+ There are 3 different key metric dictionaries that are being generated throughout these steps:
50
+ - base_metrics (average latency and predictions over time)
51
+ - endpoint_features (Prediction and feature names and values)
52
+ - custom_metrics (user-defined metrics)
53
+ """
54
+ pass
55
+
56
+ def write_application_event(
57
+ self,
58
+ event: dict,
59
+ kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
60
+ ):
61
+ """
62
+ Write a single application or metric to TSDB.
63
+
64
+ :raise mlrun.errors.MLRunRuntimeError: If an error occurred while writing the event.
65
+ """
66
+ pass
67
+
68
+ def delete_tsdb_resources(self):
69
+ """
70
+ Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
71
+ """
72
+
73
+ pass
74
+
75
+ def get_model_endpoint_real_time_metrics(
76
+ self,
77
+ endpoint_id: str,
78
+ metrics: list[str],
79
+ start: str = "now-1h",
80
+ end: str = "now",
81
+ ) -> dict[str, list[tuple[str, float]]]:
82
+ """
83
+ Getting real time metrics from the TSDB. There are pre-defined metrics for model endpoints such as
84
+ `predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user. Note that these
85
+ metrics are being calculated by the model monitoring stream pod.
86
+ :param endpoint_id: The unique id of the model endpoint.
87
+ :param metrics: A list of real-time metrics to return for the model endpoint.
88
+ :param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
89
+ time, a Unix timestamp in milliseconds, a relative time (`'now'` or
90
+ `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
91
+ = seconds), or 0 for the earliest time.
92
+ :param end: The end time of the metrics. Can be represented by a string containing an RFC 3339
93
+ time, a Unix timestamp in milliseconds, a relative time (`'now'` or
94
+ `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
95
+ = seconds), or 0 for the earliest time.
96
+ :return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
97
+ includes timestamps and the values.
98
+ """
99
+ pass
100
+
101
+ def get_records(
102
+ self,
103
+ table: str,
104
+ columns: list[str] = None,
105
+ filter_query: str = "",
106
+ start: str = "now-1h",
107
+ end: str = "now",
108
+ ) -> pd.DataFrame:
109
+ """
110
+ Getting records from TSDB data collection.
111
+ :param table: Table name, e.g. 'metrics', 'app_results'.
112
+ :param columns: Columns to include in the result.
113
+ :param filter_query: Optional filter expression as a string. The filter structure depends on the TSDB
114
+ connector type.
115
+ :param start: The start time of the metrics. Can be represented by a string containing an RFC
116
+ 3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
117
+ `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
118
+ = seconds), or 0 for the earliest time.
119
+ :param end: The end time of the metrics. Can be represented by a string containing an RFC
120
+ 3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
121
+ `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
122
+ = seconds), or 0 for the earliest time.
123
+
124
+ :return: DataFrame with the provided attributes from the data collection.
125
+ :raise: MLRunNotFoundError if the provided table wasn't found.
126
+ """
127
+ pass
128
+
129
+ def create_tsdb_application_tables(self):
130
+ """
131
+ Create the application tables using the TSDB connector. At the moment we support 2 types of application tables:
132
+ - app_results: a detailed result that includes status, kind, extra data, etc.
133
+ - metrics: a basic key value that represents a numeric metric.
134
+ """
135
+ pass
@@ -0,0 +1,15 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .v3io_connector import V3IOTSDBConnector