mlrun 1.7.0rc15__py3-none-any.whl → 1.7.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +10 -1
- mlrun/__main__.py +18 -4
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +141 -0
- mlrun/artifacts/__init__.py +7 -1
- mlrun/artifacts/base.py +28 -3
- mlrun/artifacts/dataset.py +8 -0
- mlrun/artifacts/manager.py +18 -0
- mlrun/artifacts/model.py +7 -0
- mlrun/artifacts/plots.py +13 -0
- mlrun/common/schemas/__init__.py +4 -2
- mlrun/common/schemas/alert.py +46 -4
- mlrun/common/schemas/api_gateway.py +4 -0
- mlrun/common/schemas/artifact.py +15 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -1
- mlrun/common/schemas/model_monitoring/constants.py +16 -1
- mlrun/common/schemas/model_monitoring/model_endpoints.py +60 -1
- mlrun/common/schemas/project.py +2 -0
- mlrun/config.py +4 -1
- mlrun/datastore/datastore_profile.py +10 -7
- mlrun/db/base.py +23 -3
- mlrun/db/httpdb.py +97 -43
- mlrun/db/nopdb.py +20 -2
- mlrun/errors.py +5 -0
- mlrun/launcher/base.py +3 -2
- mlrun/lists.py +2 -0
- mlrun/model.py +7 -2
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/applications/_application_steps.py +1 -2
- mlrun/model_monitoring/applications/context.py +1 -1
- mlrun/model_monitoring/applications/histogram_data_drift.py +64 -38
- mlrun/model_monitoring/db/__init__.py +2 -0
- mlrun/model_monitoring/db/stores/base/store.py +9 -36
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +63 -110
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +56 -202
- mlrun/model_monitoring/db/tsdb/__init__.py +71 -0
- mlrun/model_monitoring/db/tsdb/base.py +135 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +404 -0
- mlrun/model_monitoring/db/v3io_tsdb_reader.py +134 -0
- mlrun/model_monitoring/stream_processing.py +46 -210
- mlrun/model_monitoring/writer.py +49 -99
- mlrun/platforms/__init__.py +10 -9
- mlrun/platforms/iguazio.py +19 -200
- mlrun/projects/operations.py +11 -7
- mlrun/projects/pipelines.py +13 -76
- mlrun/projects/project.py +55 -14
- mlrun/render.py +9 -3
- mlrun/run.py +5 -38
- mlrun/runtimes/base.py +3 -3
- mlrun/runtimes/kubejob.py +2 -1
- mlrun/runtimes/nuclio/api_gateway.py +75 -9
- mlrun/runtimes/nuclio/function.py +8 -34
- mlrun/runtimes/pod.py +16 -36
- mlrun/runtimes/remotesparkjob.py +1 -1
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +0 -38
- mlrun/utils/helpers.py +45 -31
- mlrun/utils/notifications/notification/base.py +1 -1
- mlrun/utils/notifications/notification/slack.py +9 -4
- mlrun/utils/notifications/notification/webhook.py +1 -1
- mlrun/utils/notifications/notification_pusher.py +15 -14
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc16.dist-info}/METADATA +3 -2
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc16.dist-info}/RECORD +71 -65
- mlrun/kfpops.py +0 -860
- mlrun/platforms/other.py +0 -305
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc16.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc16.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py
@@ -19,18 +19,17 @@ from http import HTTPStatus

 import v3io.dataplane
 import v3io.dataplane.response
-import v3io_frames

 import mlrun.common.model_monitoring.helpers
-import mlrun.common.schemas.model_monitoring
+import mlrun.common.schemas.model_monitoring as mm_constants
 import mlrun.model_monitoring.db
 import mlrun.utils.v3io_clients
 from mlrun.utils import logger

 # Fields to encode before storing in the KV table or to decode after retrieving
 fields_to_encode_decode = [
-
-
+    mm_constants.EventFieldType.FEATURE_STATS,
+    mm_constants.EventFieldType.CURRENT_STATS,
 ]


@@ -65,7 +64,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
         self.client.kv.put(
             container=self.container,
             table_path=self.path,
-            key=endpoint[
+            key=endpoint[mm_constants.EventFieldType.UID],
             attributes=endpoint,
         )

@@ -218,17 +217,11 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
         if uids is None:
             uids = []
             for item in items:
-                if
+                if mm_constants.EventFieldType.UID not in item:
                     # This is kept for backwards compatibility - in old versions the key column named endpoint_id
-                    uids.append(
-                        item[
-                            mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
-                        ]
-                    )
+                    uids.append(item[mm_constants.EventFieldType.ENDPOINT_ID])
                 else:
-                    uids.append(
-                        item[mlrun.common.schemas.model_monitoring.EventFieldType.UID]
-                    )
+                    uids.append(item[mm_constants.EventFieldType.UID])

         # Add each relevant model endpoint to the model endpoints list
         for endpoint_id in uids:
@@ -239,27 +232,20 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):

         return endpoint_list

-    def delete_model_endpoints_resources(self
+    def delete_model_endpoints_resources(self):
         """
-        Delete all model endpoints resources in
-
-        :param endpoints: A list of model endpoints flattened dictionaries.
+        Delete all model endpoints resources in V3IO KV.
         """

+        endpoints = self.list_model_endpoints()
+
         # Delete model endpoint record from KV table
         for endpoint_dict in endpoints:
-            if
-                mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                not in endpoint_dict
-            ):
+            if mm_constants.EventFieldType.UID not in endpoint_dict:
                 # This is kept for backwards compatibility - in old versions the key column named endpoint_id
-                endpoint_id = endpoint_dict[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
-                ]
+                endpoint_id = endpoint_dict[mm_constants.EventFieldType.ENDPOINT_ID]
             else:
-                endpoint_id = endpoint_dict[
-                    mlrun.common.schemas.model_monitoring.EventFieldType.UID
-                ]
+                endpoint_id = endpoint_dict[mm_constants.EventFieldType.UID]
             self.delete_model_endpoint(
                 endpoint_id,
             )
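For context, a minimal usage sketch of the reworked cleanup; the `KVStoreBase` constructor arguments shown here are assumptions and not taken from the diff:

# Hypothetical usage sketch: delete_model_endpoints_resources() no longer accepts an
# `endpoints` argument; the store enumerates its own endpoints via list_model_endpoints().
from mlrun.model_monitoring.db.stores.v3io_kv.kv_store import KVStoreBase

store = KVStoreBase(project="my-project")  # constructor arguments are an assumption
store.delete_model_endpoints_resources()   # deletes every endpoint record from the KV table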
@@ -282,135 +268,26 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
             raise_for_status=v3io.dataplane.RaiseForStatus.never,
         )

-
-        frames = self._get_frames_client()
-
-        # Generate the required tsdb paths
-        tsdb_path, filtered_path = self._generate_tsdb_paths()
-
-        # Delete time series DB resources
-        try:
-            frames.delete(
-                backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
-                table=filtered_path,
-            )
-        except v3io_frames.errors.DeleteError as e:
-            if "No TSDB schema file found" not in str(e):
-                logger.warning(
-                    f"Failed to delete TSDB table '{filtered_path}'",
-                    err=mlrun.errors.err_to_str(e),
-                )
-        # Final cleanup of tsdb path
-        tsdb_path.replace("://u", ":///u")
-        store, _, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
-        store.rm(tsdb_path, recursive=True)
-
-    def get_endpoint_real_time_metrics(
+    def write_application_event(
         self,
-
-
-
-        end: str = "now",
-        access_key: str = None,
-    ) -> dict[str, list[tuple[str, float]]]:
-        """
-        Getting metrics from the time series DB. There are pre-defined metrics for model endpoints such as
-        `predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user.
-
-        :param endpoint_id: The unique id of the model endpoint.
-        :param metrics:     A list of real-time metrics to return for the model endpoint.
-        :param start:       The start time of the metrics. Can be represented by a string containing an RFC 3339
-                            time, a Unix timestamp in milliseconds, a relative time (`'now'` or
-                            `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
-                            earliest time.
-        :param end:         The end time of the metrics. Can be represented by a string containing an RFC 3339
-                            time, a Unix timestamp in milliseconds, a relative time (`'now'` or
-                            `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
-                            earliest time.
-        :param access_key:  V3IO access key that will be used for generating Frames client object. If not
-                            provided, the access key will be retrieved from the environment variables.
-
-        :return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
-                 includes timestamps and the values.
-        """
-
-        # Initialize access key
-        access_key = access_key or mlrun.mlconf.get_v3io_access_key()
-
-        if not metrics:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Metric names must be provided"
-            )
-
-        # Initialize metrics mapping dictionary
-        metrics_mapping = {}
-
-        # Getting the path for the time series DB
-        events_path = (
-            mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-                project=self.project,
-                kind=mlrun.common.schemas.ModelMonitoringStoreKinds.EVENTS,
-            )
-        )
-        (
-            _,
-            container,
-            events_path,
-        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
-            events_path
-        )
-
-        # Retrieve the raw data from the time series DB based on the provided metrics and time ranges
-        frames_client = mlrun.utils.v3io_clients.get_frames_client(
-            token=access_key,
-            address=mlrun.mlconf.v3io_framesd,
-            container=container,
-        )
-
-        try:
-            data = frames_client.read(
-                backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
-                table=events_path,
-                columns=["endpoint_id", *metrics],
-                filter=f"endpoint_id=='{endpoint_id}'",
-                start=start,
-                end=end,
-            )
-
-            # Fill the metrics mapping dictionary with the metric name and values
-            data_dict = data.to_dict()
-            for metric in metrics:
-                metric_data = data_dict.get(metric)
-                if metric_data is None:
-                    continue
-
-                values = [
-                    (str(timestamp), value) for timestamp, value in metric_data.items()
-                ]
-                metrics_mapping[metric] = values
-
-        except v3io_frames.errors.ReadError:
-            logger.warn("Failed to read tsdb", endpoint=endpoint_id)
-
-        return metrics_mapping
-
-    def write_application_result(self, event: dict[str, typing.Any]):
+        event: dict[str, typing.Any],
+        kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
+    ):
         """
-        Write a new application
+        Write a new application event in the target table.

         :param event: An event dictionary that represents the application result, should be corresponded to the
                       schema defined in the :py:class:`~mlrun.common.schemas.model_monitoring.constants.WriterEvent`
                       object.
+        :param kind: The type of the event, can be either "result" or "metric".
         """
-
-
-
-
-
-        )
-        metric_name = event.pop(
-            mlrun.common.schemas.model_monitoring.ResultData.RESULT_NAME
-        )
+        if kind == mm_constants.WriterEventKind.METRIC:
+            # TODO : Implement the logic for writing metrics to KV
+            return
+
+        endpoint_id = event.pop(mm_constants.WriterEvent.ENDPOINT_ID)
+        app_name = event.pop(mm_constants.WriterEvent.APPLICATION_NAME)
+        metric_name = event.pop(mm_constants.ResultData.RESULT_NAME)
         attributes = {metric_name: json.dumps(event)}

         v3io_monitoring_apps_container = self.get_v3io_monitoring_apps_container(
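For illustration, a hedged sketch of how a single "result" event might be passed to the new method. The constant names come from the diff above; the `KVStoreBase` constructor arguments and the `RESULT_VALUE` field name are assumptions:

# Hypothetical usage sketch for the new write_application_event() signature.
import mlrun.common.schemas.model_monitoring as mm_constants
from mlrun.model_monitoring.db.stores.v3io_kv.kv_store import KVStoreBase

store = KVStoreBase(project="my-project")  # constructor arguments are an assumption
event = {
    mm_constants.WriterEvent.ENDPOINT_ID: "0123abcd",      # model endpoint uid
    mm_constants.WriterEvent.APPLICATION_NAME: "my-app",    # monitoring application name
    mm_constants.ResultData.RESULT_NAME: "general_drift",   # key under which the result is stored
    mm_constants.ResultData.RESULT_VALUE: 0.04,             # assumed field name, for illustration only
}
# RESULT events are written to the KV table; METRIC events currently return early (see the TODO above).
store.write_application_event(event, kind=mm_constants.WriterEventKind.RESULT)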
@@ -445,7 +322,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
         """Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
         fields = [
             {
-                "name":
+                "name": mm_constants.ResultData.RESULT_NAME,
                 "type": "string",
                 "nullable": False,
             }
@@ -453,7 +330,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
         res = self.client.kv.create_schema(
             container=v3io_monitoring_apps_container,
             table_path=endpoint_id,
-            key=
+            key=mm_constants.WriterEvent.APPLICATION_NAME,
             fields=fields,
         )
         if res.status_code != HTTPStatus.OK:
@@ -484,9 +361,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
                 table_path=endpoint_id,
                 key=application_name,
             )
-            return data.output.item[
-                mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED
-            ]
+            return data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
         except v3io.dataplane.response.HttpResponseError as err:
             logger.debug("Error while getting last analyzed time", err=err)
             raise mlrun.errors.MLRunNotFoundError(
@@ -511,9 +386,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
             ),
             table_path=endpoint_id,
             key=application_name,
-            attributes={
-                mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED: last_analyzed
-            },
+            attributes={mm_constants.SchedulingKeys.LAST_ANALYZED: last_analyzed},
         )

     def _generate_tsdb_paths(self) -> tuple[str, str]:
@@ -622,8 +495,8 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
         # Apply top_level filter (remove endpoints that considered a child of a router)
         if top_level:
             filter_expression.append(
-                f"(endpoint_type=='{str(
-                f"OR endpoint_type=='{str(
+                f"(endpoint_type=='{str(mm_constants.EndpointType.NODE_EP.value)}' "
+                f"OR endpoint_type=='{str(mm_constants.EndpointType.ROUTER.value)}')"
             )

         return " AND ".join(filter_expression)
@@ -643,41 +516,31 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
         # Validate default value for `error_count`
         # For backwards compatibility reasons, we validate that the model endpoint includes the `error_count` key
         if (
-
-            and endpoint[
-                mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
-            ]
-            == "null"
+            mm_constants.EventFieldType.ERROR_COUNT in endpoint
+            and endpoint[mm_constants.EventFieldType.ERROR_COUNT] == "null"
         ):
-            endpoint[
-                mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
-            ] = "0"
+            endpoint[mm_constants.EventFieldType.ERROR_COUNT] = "0"

         # Validate default value for `metrics`
         # For backwards compatibility reasons, we validate that the model endpoint includes the `metrics` key
         if (
-
-            and endpoint[
-            == "null"
+            mm_constants.EventFieldType.METRICS in endpoint
+            and endpoint[mm_constants.EventFieldType.METRICS] == "null"
         ):
-            endpoint[
-
-                {
-
-
-                    mlrun.common.schemas.model_monitoring.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
-                }
+            endpoint[mm_constants.EventFieldType.METRICS] = json.dumps(
+                {
+                    mm_constants.EventKeyMetrics.GENERIC: {
+                        mm_constants.EventLiveStats.LATENCY_AVG_1H: 0,
+                        mm_constants.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
                     }
-
+                }
             )
         # Validate key `uid` instead of `endpoint_id`
         # For backwards compatibility reasons, we replace the `endpoint_id` with `uid` which is the updated key name
-        if
-            endpoint[
-
-
-            ]
-        )
+        if mm_constants.EventFieldType.ENDPOINT_ID in endpoint:
+            endpoint[mm_constants.EventFieldType.UID] = endpoint[
+                mm_constants.EventFieldType.ENDPOINT_ID
+            ]

     @staticmethod
     def _encode_field(field: typing.Union[str, bytes]) -> bytes:
@@ -705,10 +568,8 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):

     def _extract_metrics_from_items(
         self, app_items: list[dict[str, str]]
-    ) -> list[
-        metrics: list[
-            mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric
-        ] = []
+    ) -> list[mm_constants.ModelEndpointMonitoringMetric]:
+        metrics: list[mm_constants.ModelEndpointMonitoringMetric] = []
         for app_item in app_items:
             # See https://www.iguazio.com/docs/latest-release/services/data-layer/reference/system-attributes/#sys-attr-__name
             app_name = app_item.pop("__name")
@@ -716,18 +577,13 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
                 continue
             for result_name in app_item:
                 metrics.append(
-
+                    mm_constants.ModelEndpointMonitoringMetric(
                         project=self.project,
                         app=app_name,
-                        type=
+                        type=mm_constants.ModelEndpointMonitoringMetricType.RESULT,
                         name=result_name,
-                        full_name=
-
-                            self.project,
-                            app_name,
-                            mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetricType.RESULT,
-                            result_name,
-                        ]
+                        full_name=mlrun.common.schemas.model_monitoring.model_endpoints._compose_full_name(
+                            project=self.project, app=app_name, name=result_name
                         ),
                     )
                 )
@@ -735,11 +591,9 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):

     def get_model_endpoint_metrics(
         self, endpoint_id: str
-    ) -> list[
+    ) -> list[mm_constants.ModelEndpointMonitoringMetric]:
         """Get model monitoring results and metrics on the endpoint"""
-        metrics: list[
-            mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetric
-        ] = []
+        metrics: list[mm_constants.ModelEndpointMonitoringMetric] = []
         container = self.get_v3io_monitoring_apps_container(self.project)
         try:
             response = self.client.kv.scan(container=container, table_path=endpoint_id)
mlrun/model_monitoring/db/tsdb/__init__.py (new file)
@@ -0,0 +1,71 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import enum
+import typing
+
+import mlrun.common.schemas.secret
+import mlrun.errors
+
+from .base import TSDBConnector
+
+
+class ObjectTSDBFactory(enum.Enum):
+    """Enum class to handle the different TSDB connector type values for storing real time metrics"""
+
+    v3io_tsdb = "v3io-tsdb"
+
+    def to_tsdb_connector(self, project: str, **kwargs) -> TSDBConnector:
+        """
+        Return a TSDBConnector object based on the provided enum value.
+        :param project: The name of the project.
+        :return:        `TSDBConnector` object.
+        """
+
+        if self == self.v3io_tsdb:
+            if mlrun.mlconf.is_ce_mode():
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"{self.v3io_tsdb} is not supported in CE mode."
+                )
+
+            from .v3io.v3io_connector import V3IOTSDBConnector
+
+            return V3IOTSDBConnector(project=project, **kwargs)
+
+    @classmethod
+    def _missing_(cls, value: typing.Any):
+        """A lookup function to handle an invalid value.
+        :param value: Provided enum (invalid) value.
+        """
+        valid_values = list(cls.__members__.keys())
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"{value} is not a valid tsdb, please choose a valid value: %{valid_values}."
+        )
+
+
+def get_tsdb_connector(project: str, **kwargs) -> TSDBConnector:
+    """
+    Get the TSDB connector type based on mlrun.config.model_endpoint_monitoring.tsdb_connector_type.
+    :param project: The name of the project.
+    :return:        `TSDBConnector` object. The main goal of this object is to handle different operations on the
+                    TSDB connector such as updating drift metrics or write application record result.
+    """
+
+    # Get store type value from ObjectTSDBFactory enum class
+    tsdb_connector_type = ObjectTSDBFactory(
+        mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type
+    )
+
+    # Convert into TSDB connector object
+    return tsdb_connector_type.to_tsdb_connector(project=project, **kwargs)
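A short usage sketch of the new factory; the project name is a placeholder, and any extra keyword arguments the concrete connector might need (for example credentials) are not shown:

# Hypothetical usage sketch for the new TSDB connector factory.
from mlrun.model_monitoring.db.tsdb import get_tsdb_connector

# The factory resolves mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type;
# "v3io-tsdb" is the only member of ObjectTSDBFactory in this release.
tsdb_connector = get_tsdb_connector(project="my-project")

# The returned object implements the TSDBConnector interface defined in base.py, e.g.:
tsdb_connector.create_tsdb_application_tables()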
mlrun/model_monitoring/db/tsdb/base.py (new file)
@@ -0,0 +1,135 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from abc import ABC
+
+import pandas as pd
+
+import mlrun.common.schemas.model_monitoring.constants as mm_constants
+
+
+class TSDBConnector(ABC):
+    def __init__(self, project: str):
+        """
+        Initialize a new TSDB connector. The connector is used to interact with the TSDB and store monitoring data.
+        At the moment we have 3 different types of monitoring data:
+        - real time performance metrics: real time performance metrics that are being calculated by the model
+          monitoring stream pod.
+          Among these metrics are the base metrics (average latency and predictions over time), endpoint features
+          (data samples), and custom metrics (user-defined metrics).
+        - app_results: a detailed results that include status, kind, extra data, etc. These results are being calculated
+          through the monitoring applications and stored in the TSDB using the model monitoring writer.
+        - metrics: a basic key value that represents a numeric metric. Similar to the app_results, these metrics
+          are being calculated through the monitoring applications and stored in the TSDB using the model monitoring
+          writer.
+
+        :param project: the name of the project.
+
+        """
+        self.project = project
+
+    def apply_monitoring_stream_steps(self, graph):
+        """
+        Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
+        different key metric dictionaries. This data is being used by the monitoring dashboards in
+        grafana.
+        There are 3 different key metric dictionaries that are being generated throughout these steps:
+        - base_metrics (average latency and predictions over time)
+        - endpoint_features (Prediction and feature names and values)
+        - custom_metrics (user-defined metrics)
+        """
+        pass
+
+    def write_application_event(
+        self,
+        event: dict,
+        kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
+    ):
+        """
+        Write a single application or metric to TSDB.
+
+        :raise mlrun.errors.MLRunRuntimeError: If an error occurred while writing the event.
+        """
+        pass
+
+    def delete_tsdb_resources(self):
+        """
+        Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
+        """
+
+        pass
+
+    def get_model_endpoint_real_time_metrics(
+        self,
+        endpoint_id: str,
+        metrics: list[str],
+        start: str = "now-1h",
+        end: str = "now",
+    ) -> dict[str, list[tuple[str, float]]]:
+        """
+        Getting real time metrics from the TSDB. There are pre-defined metrics for model endpoints such as
+        `predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user. Note that these
+        metrics are being calculated by the model monitoring stream pod.
+        :param endpoint_id: The unique id of the model endpoint.
+        :param metrics:     A list of real-time metrics to return for the model endpoint.
+        :param start:       The start time of the metrics. Can be represented by a string containing an RFC 3339
+                            time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                            `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
+                            = seconds), or 0 for the earliest time.
+        :param end:         The end time of the metrics. Can be represented by a string containing an RFC 3339
+                            time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                            `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
+                            = seconds), or 0 for the earliest time.
+        :return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
+                 includes timestamps and the values.
+        """
+        pass
+
+    def get_records(
+        self,
+        table: str,
+        columns: list[str] = None,
+        filter_query: str = "",
+        start: str = "now-1h",
+        end: str = "now",
+    ) -> pd.DataFrame:
+        """
+        Getting records from TSDB data collection.
+        :param table:        Table name, e.g. 'metrics', 'app_results'.
+        :param columns:      Columns to include in the result.
+        :param filter_query: Optional filter expression as a string. The filter structure depends on the TSDB
+                             connector type.
+        :param start:        The start time of the metrics. Can be represented by a string containing an RFC
+                             3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                             `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
+                             = seconds), or 0 for the earliest time.
+        :param end:          The end time of the metrics. Can be represented by a string containing an RFC
+                             3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                             `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
+                             = seconds), or 0 for the earliest time.
+
+        :return: DataFrame with the provided attributes from the data collection.
+        :raise:  MLRunNotFoundError if the provided table wasn't found.
+        """
+        pass
+
+    def create_tsdb_application_tables(self):
+        """
+        Create the application tables using the TSDB connector. At the moment we support 2 types of application tables:
+        - app_results: a detailed result that includes status, kind, extra data, etc.
+        - metrics: a basic key value that represents a numeric metric.
+        """
+        pass
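To make the interface above concrete, here is a hypothetical in-memory connector; `InMemoryTSDBConnector` and its storage layout are illustrative only and not part of mlrun (the shipped implementation is `V3IOTSDBConnector` in v3io/v3io_connector.py):

# Hypothetical sketch of a custom connector built on the new TSDBConnector base class.
import pandas as pd

import mlrun.common.schemas.model_monitoring.constants as mm_constants
from mlrun.model_monitoring.db.tsdb.base import TSDBConnector


class InMemoryTSDBConnector(TSDBConnector):
    """Toy connector that keeps application events in memory instead of a real TSDB."""

    def __init__(self, project: str):
        super().__init__(project=project)
        self._events: list[dict] = []

    def write_application_event(
        self,
        event: dict,
        kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
    ):
        # Keep the event together with its kind ("result" or "metric").
        self._events.append({"kind": kind, **event})

    def get_records(
        self,
        table: str,
        columns: list[str] = None,
        filter_query: str = "",
        start: str = "now-1h",
        end: str = "now",
    ) -> pd.DataFrame:
        # The toy example ignores the table name, time range and filter.
        return pd.DataFrame(self._events, columns=columns)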
mlrun/model_monitoring/db/tsdb/v3io/__init__.py (new file)
@@ -0,0 +1,15 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .v3io_connector import V3IOTSDBConnector