mlrun 1.7.0rc14__py3-none-any.whl → 1.7.0rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (107)
  1. mlrun/__init__.py +10 -1
  2. mlrun/__main__.py +18 -109
  3. mlrun/{runtimes/mpijob/v1alpha1.py → alerts/__init__.py} +2 -16
  4. mlrun/alerts/alert.py +141 -0
  5. mlrun/artifacts/__init__.py +8 -3
  6. mlrun/artifacts/base.py +36 -253
  7. mlrun/artifacts/dataset.py +9 -190
  8. mlrun/artifacts/manager.py +20 -41
  9. mlrun/artifacts/model.py +8 -140
  10. mlrun/artifacts/plots.py +14 -375
  11. mlrun/common/schemas/__init__.py +4 -2
  12. mlrun/common/schemas/alert.py +46 -4
  13. mlrun/common/schemas/api_gateway.py +4 -0
  14. mlrun/common/schemas/artifact.py +15 -0
  15. mlrun/common/schemas/auth.py +2 -0
  16. mlrun/common/schemas/model_monitoring/__init__.py +8 -1
  17. mlrun/common/schemas/model_monitoring/constants.py +40 -4
  18. mlrun/common/schemas/model_monitoring/model_endpoints.py +73 -2
  19. mlrun/common/schemas/project.py +2 -0
  20. mlrun/config.py +7 -4
  21. mlrun/data_types/to_pandas.py +4 -4
  22. mlrun/datastore/base.py +41 -9
  23. mlrun/datastore/datastore_profile.py +54 -4
  24. mlrun/datastore/inmem.py +2 -2
  25. mlrun/datastore/sources.py +43 -2
  26. mlrun/datastore/store_resources.py +2 -6
  27. mlrun/datastore/targets.py +106 -39
  28. mlrun/db/base.py +23 -3
  29. mlrun/db/httpdb.py +101 -47
  30. mlrun/db/nopdb.py +20 -2
  31. mlrun/errors.py +5 -0
  32. mlrun/feature_store/__init__.py +0 -2
  33. mlrun/feature_store/api.py +12 -47
  34. mlrun/feature_store/feature_set.py +9 -0
  35. mlrun/feature_store/retrieval/base.py +9 -4
  36. mlrun/feature_store/retrieval/conversion.py +4 -4
  37. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  38. mlrun/feature_store/retrieval/job.py +2 -0
  39. mlrun/feature_store/retrieval/local_merger.py +2 -0
  40. mlrun/feature_store/retrieval/spark_merger.py +5 -0
  41. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +5 -10
  42. mlrun/launcher/base.py +4 -3
  43. mlrun/launcher/client.py +1 -1
  44. mlrun/lists.py +4 -2
  45. mlrun/model.py +25 -11
  46. mlrun/model_monitoring/__init__.py +1 -1
  47. mlrun/model_monitoring/api.py +41 -18
  48. mlrun/model_monitoring/application.py +5 -305
  49. mlrun/model_monitoring/applications/__init__.py +11 -0
  50. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  51. mlrun/model_monitoring/applications/base.py +282 -0
  52. mlrun/model_monitoring/applications/context.py +214 -0
  53. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  54. mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
  55. mlrun/model_monitoring/applications/results.py +99 -0
  56. mlrun/model_monitoring/controller.py +3 -1
  57. mlrun/model_monitoring/db/__init__.py +2 -0
  58. mlrun/model_monitoring/db/stores/base/store.py +9 -36
  59. mlrun/model_monitoring/db/stores/sqldb/models/base.py +7 -6
  60. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +63 -110
  61. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +104 -187
  62. mlrun/model_monitoring/db/tsdb/__init__.py +71 -0
  63. mlrun/model_monitoring/db/tsdb/base.py +135 -0
  64. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  65. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  66. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +404 -0
  67. mlrun/model_monitoring/db/v3io_tsdb_reader.py +134 -0
  68. mlrun/model_monitoring/evidently_application.py +6 -118
  69. mlrun/model_monitoring/helpers.py +1 -1
  70. mlrun/model_monitoring/model_endpoint.py +3 -2
  71. mlrun/model_monitoring/stream_processing.py +48 -213
  72. mlrun/model_monitoring/writer.py +101 -121
  73. mlrun/platforms/__init__.py +10 -9
  74. mlrun/platforms/iguazio.py +21 -202
  75. mlrun/projects/operations.py +11 -7
  76. mlrun/projects/pipelines.py +13 -76
  77. mlrun/projects/project.py +73 -45
  78. mlrun/render.py +11 -13
  79. mlrun/run.py +6 -41
  80. mlrun/runtimes/__init__.py +3 -3
  81. mlrun/runtimes/base.py +6 -6
  82. mlrun/runtimes/funcdoc.py +0 -28
  83. mlrun/runtimes/kubejob.py +2 -1
  84. mlrun/runtimes/local.py +1 -1
  85. mlrun/runtimes/mpijob/__init__.py +0 -20
  86. mlrun/runtimes/mpijob/v1.py +1 -1
  87. mlrun/runtimes/nuclio/api_gateway.py +75 -9
  88. mlrun/runtimes/nuclio/function.py +9 -35
  89. mlrun/runtimes/pod.py +16 -36
  90. mlrun/runtimes/remotesparkjob.py +1 -1
  91. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  92. mlrun/runtimes/utils.py +1 -39
  93. mlrun/utils/helpers.py +72 -71
  94. mlrun/utils/notifications/notification/base.py +1 -1
  95. mlrun/utils/notifications/notification/slack.py +12 -5
  96. mlrun/utils/notifications/notification/webhook.py +1 -1
  97. mlrun/utils/notifications/notification_pusher.py +134 -14
  98. mlrun/utils/version/version.json +2 -2
  99. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/METADATA +4 -3
  100. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/RECORD +105 -95
  101. mlrun/kfpops.py +0 -865
  102. mlrun/platforms/other.py +0 -305
  103. /mlrun/{runtimes → common/runtimes}/constants.py +0 -0
  104. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/LICENSE +0 -0
  105. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/WHEEL +0 -0
  106. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/entry_points.txt +0 -0
  107. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py

@@ -11,7 +11,6 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- #

  import json
  import os
@@ -20,18 +19,17 @@ from http import HTTPStatus

  import v3io.dataplane
  import v3io.dataplane.response
- import v3io_frames

  import mlrun.common.model_monitoring.helpers
- import mlrun.common.schemas.model_monitoring
+ import mlrun.common.schemas.model_monitoring as mm_constants
  import mlrun.model_monitoring.db
  import mlrun.utils.v3io_clients
  from mlrun.utils import logger

  # Fields to encode before storing in the KV table or to decode after retrieving
  fields_to_encode_decode = [
-     mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS,
-     mlrun.common.schemas.model_monitoring.EventFieldType.CURRENT_STATS,
+     mm_constants.EventFieldType.FEATURE_STATS,
+     mm_constants.EventFieldType.CURRENT_STATS,
  ]


@@ -41,7 +39,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
      client and usually the KV table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/.
      """

-     def __init__(self, project: str, access_key: str):
+     def __init__(self, project: str, access_key: typing.Optional[str] = None) -> None:
          super().__init__(project=project)
          # Initialize a V3IO client instance
          self.access_key = access_key or os.environ.get("V3IO_ACCESS_KEY")
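
The constructor change above makes `access_key` optional, falling back to the `V3IO_ACCESS_KEY` environment variable. A minimal usage sketch (the project name and key are illustrative; class and import path are taken from this diff):

    import os

    from mlrun.model_monitoring.db.stores.v3io_kv.kv_store import KVStoreBase

    # Explicit key, as in rc14 and earlier:
    store = KVStoreBase(project="my-project", access_key="<v3io-access-key>")

    # New in rc16: omit the argument and resolve the key from the environment.
    os.environ["V3IO_ACCESS_KEY"] = "<v3io-access-key>"
    store = KVStoreBase(project="my-project")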
@@ -66,7 +64,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
          self.client.kv.put(
              container=self.container,
              table_path=self.path,
-             key=endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID],
+             key=endpoint[mm_constants.EventFieldType.UID],
              attributes=endpoint,
          )

@@ -219,17 +217,11 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
          if uids is None:
              uids = []
              for item in items:
-                 if mlrun.common.schemas.model_monitoring.EventFieldType.UID not in item:
+                 if mm_constants.EventFieldType.UID not in item:
                      # This is kept for backwards compatibility - in old versions the key column named endpoint_id
-                     uids.append(
-                         item[
-                             mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
-                         ]
-                     )
+                     uids.append(item[mm_constants.EventFieldType.ENDPOINT_ID])
                  else:
-                     uids.append(
-                         item[mlrun.common.schemas.model_monitoring.EventFieldType.UID]
-                     )
+                     uids.append(item[mm_constants.EventFieldType.UID])

          # Add each relevant model endpoint to the model endpoints list
          for endpoint_id in uids:
@@ -240,27 +232,20 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):

          return endpoint_list

-     def delete_model_endpoints_resources(self, endpoints: list[dict[str, typing.Any]]):
+     def delete_model_endpoints_resources(self):
          """
-         Delete all model endpoints resources in both KV and the time series DB.
-
-         :param endpoints: A list of model endpoints flattened dictionaries.
+         Delete all model endpoints resources in V3IO KV.
          """

+         endpoints = self.list_model_endpoints()
+
          # Delete model endpoint record from KV table
          for endpoint_dict in endpoints:
-             if (
-                 mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                 not in endpoint_dict
-             ):
+             if mm_constants.EventFieldType.UID not in endpoint_dict:
                  # This is kept for backwards compatibility - in old versions the key column named endpoint_id
-                 endpoint_id = endpoint_dict[
-                     mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
-                 ]
+                 endpoint_id = endpoint_dict[mm_constants.EventFieldType.ENDPOINT_ID]
              else:
-                 endpoint_id = endpoint_dict[
-                     mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                 ]
+                 endpoint_id = endpoint_dict[mm_constants.EventFieldType.UID]
              self.delete_model_endpoint(
                  endpoint_id,
              )
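
`delete_model_endpoints_resources` also changes shape: it no longer takes the list of endpoint dictionaries (it now calls `list_model_endpoints` itself) and it only cleans the KV records, since TSDB cleanup moves to the new TSDB connector layer (see the `mlrun/model_monitoring/db/tsdb` files below). A sketch of the call-site impact, reusing the `store` from the sketch above:

    # rc14: the caller supplied the endpoint dicts, and the method also wiped TSDB tables.
    # store.delete_model_endpoints_resources(store.list_model_endpoints())

    # rc16: no arguments; KV records only.
    store.delete_model_endpoints_resources()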
@@ -283,135 +268,26 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
              raise_for_status=v3io.dataplane.RaiseForStatus.never,
          )

-         # Cleanup TSDB
-         frames = self._get_frames_client()
-
-         # Generate the required tsdb paths
-         tsdb_path, filtered_path = self._generate_tsdb_paths()
-
-         # Delete time series DB resources
-         try:
-             frames.delete(
-                 backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
-                 table=filtered_path,
-             )
-         except v3io_frames.errors.DeleteError as e:
-             if "No TSDB schema file found" not in str(e):
-                 logger.warning(
-                     f"Failed to delete TSDB table '{filtered_path}'",
-                     err=mlrun.errors.err_to_str(e),
-                 )
-         # Final cleanup of tsdb path
-         tsdb_path.replace("://u", ":///u")
-         store, _, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
-         store.rm(tsdb_path, recursive=True)
-
-     def get_endpoint_real_time_metrics(
+     def write_application_event(
          self,
-         endpoint_id: str,
-         metrics: list[str],
-         start: str = "now-1h",
-         end: str = "now",
-         access_key: str = None,
-     ) -> dict[str, list[tuple[str, float]]]:
-         """
-         Getting metrics from the time series DB. There are pre-defined metrics for model endpoints such as
-         `predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user.
-
-         :param endpoint_id: The unique id of the model endpoint.
-         :param metrics: A list of real-time metrics to return for the model endpoint.
-         :param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
-                       time, a Unix timestamp in milliseconds, a relative time (`'now'` or
-                       `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
-                       earliest time.
-         :param end: The end time of the metrics. Can be represented by a string containing an RFC 3339
-                     time, a Unix timestamp in milliseconds, a relative time (`'now'` or
-                     `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
-                     earliest time.
-         :param access_key: V3IO access key that will be used for generating Frames client object. If not
-                            provided, the access key will be retrieved from the environment variables.
-
-         :return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
-                  includes timestamps and the values.
-         """
-
-         # Initialize access key
-         access_key = access_key or mlrun.mlconf.get_v3io_access_key()
-
-         if not metrics:
-             raise mlrun.errors.MLRunInvalidArgumentError(
-                 "Metric names must be provided"
-             )
-
-         # Initialize metrics mapping dictionary
-         metrics_mapping = {}
-
-         # Getting the path for the time series DB
-         events_path = (
-             mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-                 project=self.project,
-                 kind=mlrun.common.schemas.ModelMonitoringStoreKinds.EVENTS,
-             )
-         )
-         (
-             _,
-             container,
-             events_path,
-         ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
-             events_path
-         )
-
-         # Retrieve the raw data from the time series DB based on the provided metrics and time ranges
-         frames_client = mlrun.utils.v3io_clients.get_frames_client(
-             token=access_key,
-             address=mlrun.mlconf.v3io_framesd,
-             container=container,
-         )
-
-         try:
-             data = frames_client.read(
-                 backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
-                 table=events_path,
-                 columns=["endpoint_id", *metrics],
-                 filter=f"endpoint_id=='{endpoint_id}'",
-                 start=start,
-                 end=end,
-             )
-
-             # Fill the metrics mapping dictionary with the metric name and values
-             data_dict = data.to_dict()
-             for metric in metrics:
-                 metric_data = data_dict.get(metric)
-                 if metric_data is None:
-                     continue
-
-                 values = [
-                     (str(timestamp), value) for timestamp, value in metric_data.items()
-                 ]
-                 metrics_mapping[metric] = values
-
-         except v3io_frames.errors.ReadError:
-             logger.warn("Failed to read tsdb", endpoint=endpoint_id)
-
-         return metrics_mapping
-
-     def write_application_result(self, event: dict[str, typing.Any]):
+         event: dict[str, typing.Any],
+         kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
+     ):
          """
-         Write a new application result event in the target table.
+         Write a new application event in the target table.

          :param event: An event dictionary that represents the application result, should be corresponded to the
                        schema defined in the :py:class:`~mlrun.common.schemas.model_monitoring.constants.WriterEvent`
                        object.
+         :param kind: The type of the event, can be either "result" or "metric".
          """
-         endpoint_id = event.pop(
-             mlrun.common.schemas.model_monitoring.WriterEvent.ENDPOINT_ID
-         )
-         app_name = event.pop(
-             mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME
-         )
-         metric_name = event.pop(
-             mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME
-         )
+         if kind == mm_constants.WriterEventKind.METRIC:
+             # TODO : Implement the logic for writing metrics to KV
+             return
+
+         endpoint_id = event.pop(mm_constants.WriterEvent.ENDPOINT_ID)
+         app_name = event.pop(mm_constants.WriterEvent.APPLICATION_NAME)
+         metric_name = event.pop(mm_constants.ResultData.RESULT_NAME)
          attributes = {metric_name: json.dumps(event)}

          v3io_monitoring_apps_container = self.get_v3io_monitoring_apps_container(
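
Here `write_application_result` is generalized into `write_application_event`, which takes a `kind` and currently no-ops for metric events in the KV store (per the TODO above). A hedged caller sketch, reusing the `store` from earlier: the constants come from this diff, the payload values are made up, and the remaining `WriterEvent` schema fields are elided:

    import mlrun.common.schemas.model_monitoring as mm_constants

    event = {
        mm_constants.WriterEvent.ENDPOINT_ID: "ep-uid-123",        # illustrative uid
        mm_constants.WriterEvent.APPLICATION_NAME: "my-drift-app", # illustrative app
        mm_constants.ResultData.RESULT_NAME: "data_drift",
        # ...remaining WriterEvent/ResultData schema fields...
    }

    # kind defaults to WriterEventKind.RESULT; the popped fields become the KV key
    # and attribute name, and the rest of the event is JSON-encoded as the value.
    store.write_application_event(event)

    # Metric events are accepted but not yet persisted by the KV store (returns early):
    store.write_application_event({}, kind=mm_constants.WriterEventKind.METRIC)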
@@ -446,7 +322,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
          """Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
          fields = [
              {
-                 "name": mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME,
+                 "name": mm_constants.ResultData.RESULT_NAME,
                  "type": "string",
                  "nullable": False,
              }
@@ -454,7 +330,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
          res = self.client.kv.create_schema(
              container=v3io_monitoring_apps_container,
              table_path=endpoint_id,
-             key=mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME,
+             key=mm_constants.WriterEvent.APPLICATION_NAME,
              fields=fields,
          )
          if res.status_code != HTTPStatus.OK:
@@ -485,9 +361,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
                  table_path=endpoint_id,
                  key=application_name,
              )
-             return data.output.item[
-                 mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED
-             ]
+             return data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
          except v3io.dataplane.response.HttpResponseError as err:
              logger.debug("Error while getting last analyzed time", err=err)
              raise mlrun.errors.MLRunNotFoundError(
@@ -512,9 +386,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
              ),
              table_path=endpoint_id,
              key=application_name,
-             attributes={
-                 mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED: last_analyzed
-             },
+             attributes={mm_constants.SchedulingKeys.LAST_ANALYZED: last_analyzed},
          )

      def _generate_tsdb_paths(self) -> tuple[str, str]:
@@ -623,8 +495,8 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
          # Apply top_level filter (remove endpoints that considered a child of a router)
          if top_level:
              filter_expression.append(
-                 f"(endpoint_type=='{str(mlrun.common.schemas.model_monitoring.EndpointType.NODE_EP.value)}' "
-                 f"OR endpoint_type=='{str(mlrun.common.schemas.model_monitoring.EndpointType.ROUTER.value)}')"
+                 f"(endpoint_type=='{str(mm_constants.EndpointType.NODE_EP.value)}' "
+                 f"OR endpoint_type=='{str(mm_constants.EndpointType.ROUTER.value)}')"
              )

          return " AND ".join(filter_expression)
@@ -644,41 +516,31 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
          # Validate default value for `error_count`
          # For backwards compatibility reasons, we validate that the model endpoint includes the `error_count` key
          if (
-             mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT in endpoint
-             and endpoint[
-                 mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
-             ]
-             == "null"
+             mm_constants.EventFieldType.ERROR_COUNT in endpoint
+             and endpoint[mm_constants.EventFieldType.ERROR_COUNT] == "null"
          ):
-             endpoint[
-                 mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
-             ] = "0"
+             endpoint[mm_constants.EventFieldType.ERROR_COUNT] = "0"

          # Validate default value for `metrics`
          # For backwards compatibility reasons, we validate that the model endpoint includes the `metrics` key
          if (
-             mlrun.common.schemas.model_monitoring.EventFieldType.METRICS in endpoint
-             and endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.METRICS]
-             == "null"
+             mm_constants.EventFieldType.METRICS in endpoint
+             and endpoint[mm_constants.EventFieldType.METRICS] == "null"
          ):
-             endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.METRICS] = (
-                 json.dumps(
-                     {
-                         mlrun.common.schemas.model_monitoring.EventKeyMetrics.GENERIC: {
-                             mlrun.common.schemas.model_monitoring.EventLiveStats.LATENCY_AVG_1H: 0,
-                             mlrun.common.schemas.model_monitoring.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
-                         }
+             endpoint[mm_constants.EventFieldType.METRICS] = json.dumps(
+                 {
+                     mm_constants.EventKeyMetrics.GENERIC: {
+                         mm_constants.EventLiveStats.LATENCY_AVG_1H: 0,
+                         mm_constants.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
                      }
-                 )
+                 }
              )
          # Validate key `uid` instead of `endpoint_id`
          # For backwards compatibility reasons, we replace the `endpoint_id` with `uid` which is the updated key name
-         if mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID in endpoint:
-             endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID] = (
-                 endpoint[
-                     mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
-                 ]
-             )
+         if mm_constants.EventFieldType.ENDPOINT_ID in endpoint:
+             endpoint[mm_constants.EventFieldType.UID] = endpoint[
+                 mm_constants.EventFieldType.ENDPOINT_ID
+             ]

      @staticmethod
      def _encode_field(field: typing.Union[str, bytes]) -> bytes:
@@ -703,3 +565,58 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
      @staticmethod
      def _get_monitoring_schedules_container(project_name: str) -> str:
          return f"users/pipelines/{project_name}/monitoring-schedules/functions"
+
+     def _extract_metrics_from_items(
+         self, app_items: list[dict[str, str]]
+     ) -> list[mm_constants.ModelEndpointMonitoringMetric]:
+         metrics: list[mm_constants.ModelEndpointMonitoringMetric] = []
+         for app_item in app_items:
+             # See https://www.iguazio.com/docs/latest-release/services/data-layer/reference/system-attributes/#sys-attr-__name
+             app_name = app_item.pop("__name")
+             if app_name == ".#schema":
+                 continue
+             for result_name in app_item:
+                 metrics.append(
+                     mm_constants.ModelEndpointMonitoringMetric(
+                         project=self.project,
+                         app=app_name,
+                         type=mm_constants.ModelEndpointMonitoringMetricType.RESULT,
+                         name=result_name,
+                         full_name=mlrun.common.schemas.model_monitoring.model_endpoints._compose_full_name(
+                             project=self.project, app=app_name, name=result_name
+                         ),
+                     )
+                 )
+         return metrics
+
+     def get_model_endpoint_metrics(
+         self, endpoint_id: str
+     ) -> list[mm_constants.ModelEndpointMonitoringMetric]:
+         """Get model monitoring results and metrics on the endpoint"""
+         metrics: list[mm_constants.ModelEndpointMonitoringMetric] = []
+         container = self.get_v3io_monitoring_apps_container(self.project)
+         try:
+             response = self.client.kv.scan(container=container, table_path=endpoint_id)
+         except v3io.dataplane.response.HttpResponseError as err:
+             if err.status_code == HTTPStatus.NOT_FOUND:
+                 logger.warning(
+                     "Attempt getting metrics and results - no data. Check the "
+                     "project name, endpoint, or wait for the applications to start.",
+                     container=container,
+                     table_path=endpoint_id,
+                 )
+                 return []
+             raise
+
+         while True:
+             metrics.extend(self._extract_metrics_from_items(response.output.items))
+             if response.output.last:
+                 break
+             # TODO: Use AIO client: `v3io.aio.dataplane.client.Client`
+             response = self.client.kv.scan(
+                 container=container,
+                 table_path=endpoint_id,
+                 marker=response.output.next_marker,
+             )
+
+         return metrics
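
The new `get_model_endpoint_metrics` pages through the per-endpoint KV table via `scan`/`marker` and returns the discovered application results. A brief usage sketch, again with the `store` from above (the endpoint id is illustrative):

    for metric in store.get_model_endpoint_metrics(endpoint_id="ep-uid-123"):
        # Each item is a ModelEndpointMonitoringMetric carrying project, app,
        # type (RESULT), name, and the composed full_name.
        print(metric.full_name)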
mlrun/model_monitoring/db/tsdb/__init__.py (new file)

@@ -0,0 +1,71 @@
+ # Copyright 2024 Iguazio
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import enum
+ import typing
+
+ import mlrun.common.schemas.secret
+ import mlrun.errors
+
+ from .base import TSDBConnector
+
+
+ class ObjectTSDBFactory(enum.Enum):
+     """Enum class to handle the different TSDB connector type values for storing real time metrics"""
+
+     v3io_tsdb = "v3io-tsdb"
+
+     def to_tsdb_connector(self, project: str, **kwargs) -> TSDBConnector:
+         """
+         Return a TSDBConnector object based on the provided enum value.
+         :param project: The name of the project.
+         :return: `TSDBConnector` object.
+         """
+
+         if self == self.v3io_tsdb:
+             if mlrun.mlconf.is_ce_mode():
+                 raise mlrun.errors.MLRunInvalidArgumentError(
+                     f"{self.v3io_tsdb} is not supported in CE mode."
+                 )
+
+             from .v3io.v3io_connector import V3IOTSDBConnector
+
+             return V3IOTSDBConnector(project=project, **kwargs)
+
+     @classmethod
+     def _missing_(cls, value: typing.Any):
+         """A lookup function to handle an invalid value.
+         :param value: Provided enum (invalid) value.
+         """
+         valid_values = list(cls.__members__.keys())
+         raise mlrun.errors.MLRunInvalidArgumentError(
+             f"{value} is not a valid tsdb, please choose a valid value: %{valid_values}."
+         )
+
+
+ def get_tsdb_connector(project: str, **kwargs) -> TSDBConnector:
+     """
+     Get the TSDB connector type based on mlrun.config.model_endpoint_monitoring.tsdb_connector_type.
+     :param project: The name of the project.
+     :return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
+              TSDB connector such as updating drift metrics or write application record result.
+     """
+
+     # Get store type value from ObjectTSDBFactory enum class
+     tsdb_connector_type = ObjectTSDBFactory(
+         mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type
+     )
+
+     # Convert into TSDB connector object
+     return tsdb_connector_type.to_tsdb_connector(project=project, **kwargs)
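
The factory mirrors the existing store-factory pattern: configuration selects the connector, and invalid values fail through `_missing_`. A usage sketch (the project name is illustrative; `v3io-tsdb` is the only enum member at this point):

    from mlrun.model_monitoring.db.tsdb import ObjectTSDBFactory, get_tsdb_connector

    # Resolves mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type and
    # returns a V3IOTSDBConnector (unless running in CE mode, which raises).
    connector = get_tsdb_connector(project="my-project")

    # An unknown connector type raises MLRunInvalidArgumentError via _missing_:
    ObjectTSDBFactory("not-a-tsdb")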
mlrun/model_monitoring/db/tsdb/base.py (new file)

@@ -0,0 +1,135 @@
+ # Copyright 2024 Iguazio
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+
+ from abc import ABC
+
+ import pandas as pd
+
+ import mlrun.common.schemas.model_monitoring.constants as mm_constants
+
+
+ class TSDBConnector(ABC):
+     def __init__(self, project: str):
+         """
+         Initialize a new TSDB connector. The connector is used to interact with the TSDB and store monitoring data.
+         At the moment we have 3 different types of monitoring data:
+         - real time performance metrics: real time performance metrics that are being calculated by the model
+           monitoring stream pod.
+           Among these metrics are the base metrics (average latency and predictions over time), endpoint features
+           (data samples), and custom metrics (user-defined metrics).
+         - app_results: a detailed results that include status, kind, extra data, etc. These results are being calculated
+           through the monitoring applications and stored in the TSDB using the model monitoring writer.
+         - metrics: a basic key value that represents a numeric metric. Similar to the app_results, these metrics
+           are being calculated through the monitoring applications and stored in the TSDB using the model monitoring
+           writer.
+
+         :param project: the name of the project.
+
+         """
+         self.project = project
+
+     def apply_monitoring_stream_steps(self, graph):
+         """
+         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
+         different key metric dictionaries. This data is being used by the monitoring dashboards in
+         grafana.
+         There are 3 different key metric dictionaries that are being generated throughout these steps:
+         - base_metrics (average latency and predictions over time)
+         - endpoint_features (Prediction and feature names and values)
+         - custom_metrics (user-defined metrics)
+         """
+         pass
+
+     def write_application_event(
+         self,
+         event: dict,
+         kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
+     ):
+         """
+         Write a single application or metric to TSDB.
+
+         :raise mlrun.errors.MLRunRuntimeError: If an error occurred while writing the event.
+         """
+         pass
+
+     def delete_tsdb_resources(self):
+         """
+         Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
+         """
+
+         pass
+
+     def get_model_endpoint_real_time_metrics(
+         self,
+         endpoint_id: str,
+         metrics: list[str],
+         start: str = "now-1h",
+         end: str = "now",
+     ) -> dict[str, list[tuple[str, float]]]:
+         """
+         Getting real time metrics from the TSDB. There are pre-defined metrics for model endpoints such as
+         `predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user. Note that these
+         metrics are being calculated by the model monitoring stream pod.
+         :param endpoint_id: The unique id of the model endpoint.
+         :param metrics: A list of real-time metrics to return for the model endpoint.
+         :param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
+                       time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                       `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
+                       = seconds), or 0 for the earliest time.
+         :param end: The end time of the metrics. Can be represented by a string containing an RFC 3339
+                     time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                     `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
+                     = seconds), or 0 for the earliest time.
+         :return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
+                  includes timestamps and the values.
+         """
+         pass
+
+     def get_records(
+         self,
+         table: str,
+         columns: list[str] = None,
+         filter_query: str = "",
+         start: str = "now-1h",
+         end: str = "now",
+     ) -> pd.DataFrame:
+         """
+         Getting records from TSDB data collection.
+         :param table: Table name, e.g. 'metrics', 'app_results'.
+         :param columns: Columns to include in the result.
+         :param filter_query: Optional filter expression as a string. The filter structure depends on the TSDB
+                              connector type.
+         :param start: The start time of the metrics. Can be represented by a string containing an RFC
+                       3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                       `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
+                       = seconds), or 0 for the earliest time.
+         :param end: The end time of the metrics. Can be represented by a string containing an RFC
+                     3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                     `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
+                     = seconds), or 0 for the earliest time.
+
+         :return: DataFrame with the provided attributes from the data collection.
+         :raise: MLRunNotFoundError if the provided table wasn't found.
+         """
+         pass
+
+     def create_tsdb_application_tables(self):
+         """
+         Create the application tables using the TSDB connector. At the moment we support 2 types of application tables:
+         - app_results: a detailed result that includes status, kind, extra data, etc.
+         - metrics: a basic key value that represents a numeric metric.
+         """
+         pass
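
`TSDBConnector` is an abstract base whose methods are all default no-ops, so a backend only needs to override what it supports. A hypothetical minimal subclass, just to show the surface (an in-memory stand-in for illustration, not a real TSDB backend):

    import pandas as pd

    import mlrun.common.schemas.model_monitoring.constants as mm_constants
    from mlrun.model_monitoring.db.tsdb.base import TSDBConnector


    class InMemoryTSDBConnector(TSDBConnector):
        """Toy connector; a real one would target an actual time series DB."""

        def __init__(self, project: str):
            super().__init__(project=project)
            self._events: list[dict] = []

        def write_application_event(
            self,
            event: dict,
            kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
        ):
            # Store the event with its kind; a real connector writes to a TSDB table.
            self._events.append({"kind": kind, **event})

        def get_records(
            self,
            table: str,
            columns: list[str] = None,
            filter_query: str = "",
            start: str = "now-1h",
            end: str = "now",
        ) -> pd.DataFrame:
            # Ignores the time range; a real connector translates it to a TSDB query.
            return pd.DataFrame(self._events, columns=columns)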
mlrun/model_monitoring/db/tsdb/v3io/__init__.py (new file)

@@ -0,0 +1,15 @@
+ # Copyright 2024 Iguazio
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from .v3io_connector import V3IOTSDBConnector
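
The package `__init__` re-export enables the shorter import path:

    from mlrun.model_monitoring.db.tsdb.v3io import V3IOTSDBConnector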