mlrun 1.7.0rc17__py3-none-any.whl → 1.7.0rc18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/alerts/alert.py +1 -1
- mlrun/artifacts/manager.py +5 -1
- mlrun/common/runtimes/constants.py +3 -0
- mlrun/common/schemas/__init__.py +1 -1
- mlrun/common/schemas/alert.py +31 -9
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/function.py +4 -0
- mlrun/common/schemas/model_monitoring/__init__.py +3 -1
- mlrun/common/schemas/model_monitoring/constants.py +20 -1
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +17 -6
- mlrun/config.py +2 -0
- mlrun/data_types/to_pandas.py +5 -5
- mlrun/datastore/datastore.py +6 -2
- mlrun/datastore/redis.py +2 -2
- mlrun/datastore/s3.py +5 -0
- mlrun/datastore/sources.py +111 -6
- mlrun/datastore/targets.py +2 -2
- mlrun/db/base.py +5 -1
- mlrun/db/httpdb.py +22 -3
- mlrun/db/nopdb.py +5 -1
- mlrun/errors.py +6 -0
- mlrun/feature_store/retrieval/conversion.py +5 -5
- mlrun/feature_store/retrieval/job.py +3 -2
- mlrun/feature_store/retrieval/spark_merger.py +2 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -2
- mlrun/model_monitoring/db/stores/base/store.py +16 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +44 -43
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +190 -91
- mlrun/model_monitoring/db/tsdb/__init__.py +35 -6
- mlrun/model_monitoring/db/tsdb/base.py +25 -18
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +207 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +231 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +73 -72
- mlrun/model_monitoring/db/v3io_tsdb_reader.py +217 -16
- mlrun/model_monitoring/helpers.py +32 -0
- mlrun/model_monitoring/stream_processing.py +7 -4
- mlrun/model_monitoring/writer.py +18 -13
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/projects/project.py +33 -8
- mlrun/render.py +8 -5
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/utils/async_http.py +25 -5
- mlrun/utils/helpers.py +20 -1
- mlrun/utils/notifications/notification/slack.py +27 -7
- mlrun/utils/notifications/notification_pusher.py +38 -40
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/METADATA +7 -2
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/RECORD +55 -51
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/top_level.txt +0 -0
|
@@ -15,23 +15,79 @@
|
|
|
15
15
|
import json
|
|
16
16
|
import os
|
|
17
17
|
import typing
|
|
18
|
+
from dataclasses import dataclass
|
|
18
19
|
from http import HTTPStatus
|
|
19
20
|
|
|
20
21
|
import v3io.dataplane
|
|
22
|
+
import v3io.dataplane.output
|
|
21
23
|
import v3io.dataplane.response
|
|
22
24
|
|
|
23
25
|
import mlrun.common.model_monitoring.helpers
|
|
24
|
-
import mlrun.common.schemas.model_monitoring as
|
|
26
|
+
import mlrun.common.schemas.model_monitoring as mm_schemas
|
|
25
27
|
import mlrun.model_monitoring.db
|
|
26
28
|
import mlrun.utils.v3io_clients
|
|
27
29
|
from mlrun.utils import logger
|
|
28
30
|
|
|
29
31
|
# Fields to encode before storing in the KV table or to decode after retrieving
|
|
30
32
|
fields_to_encode_decode = [
|
|
31
|
-
|
|
32
|
-
|
|
33
|
+
mm_schemas.EventFieldType.FEATURE_STATS,
|
|
34
|
+
mm_schemas.EventFieldType.CURRENT_STATS,
|
|
33
35
|
]
|
|
34
36
|
|
|
37
|
+
_METRIC_FIELDS: list[str] = [
|
|
38
|
+
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
39
|
+
mm_schemas.MetricData.METRIC_NAME,
|
|
40
|
+
mm_schemas.MetricData.METRIC_VALUE,
|
|
41
|
+
mm_schemas.WriterEvent.START_INFER_TIME,
|
|
42
|
+
mm_schemas.WriterEvent.END_INFER_TIME,
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SchemaField(typing.TypedDict):
|
|
47
|
+
name: str
|
|
48
|
+
type: str
|
|
49
|
+
nullable: bool
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
|
|
53
|
+
class SchemaParams:
|
|
54
|
+
key: str
|
|
55
|
+
fields: list[SchemaField]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
_RESULT_SCHEMA: list[SchemaField] = [
|
|
59
|
+
SchemaField(
|
|
60
|
+
name=mm_schemas.ResultData.RESULT_NAME,
|
|
61
|
+
type=mm_schemas.GrafanaColumnType.STRING,
|
|
62
|
+
nullable=False,
|
|
63
|
+
)
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
_METRIC_SCHEMA: list[SchemaField] = [
|
|
67
|
+
SchemaField(
|
|
68
|
+
name=mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
69
|
+
type=mm_schemas.GrafanaColumnType.STRING,
|
|
70
|
+
nullable=False,
|
|
71
|
+
),
|
|
72
|
+
SchemaField(
|
|
73
|
+
name=mm_schemas.MetricData.METRIC_NAME,
|
|
74
|
+
type=mm_schemas.GrafanaColumnType.STRING,
|
|
75
|
+
nullable=False,
|
|
76
|
+
),
|
|
77
|
+
]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
_KIND_TO_SCHEMA_PARAMS: dict[mm_schemas.WriterEventKind, SchemaParams] = {
|
|
81
|
+
mm_schemas.WriterEventKind.RESULT: SchemaParams(
|
|
82
|
+
key=mm_schemas.WriterEvent.APPLICATION_NAME, fields=_RESULT_SCHEMA
|
|
83
|
+
),
|
|
84
|
+
mm_schemas.WriterEventKind.METRIC: SchemaParams(
|
|
85
|
+
key="metric_id", fields=_METRIC_SCHEMA
|
|
86
|
+
),
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
_EXCLUDE_SCHEMA_FILTER_EXPRESSION = '__name!=".#schema"'
|
|
90
|
+
|
|
35
91
|
|
|
36
92
|
class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
37
93
|
"""
|
|
@@ -64,7 +120,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
64
120
|
self.client.kv.put(
|
|
65
121
|
container=self.container,
|
|
66
122
|
table_path=self.path,
|
|
67
|
-
key=endpoint[
|
|
123
|
+
key=endpoint[mm_schemas.EventFieldType.UID],
|
|
68
124
|
attributes=endpoint,
|
|
69
125
|
)
|
|
70
126
|
|
|
@@ -151,7 +207,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
151
207
|
"""Getting path and container based on the model monitoring configurations"""
|
|
152
208
|
path = mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
|
|
153
209
|
project=self.project,
|
|
154
|
-
kind=
|
|
210
|
+
kind=mm_schemas.ModelMonitoringStoreKinds.ENDPOINTS,
|
|
155
211
|
)
|
|
156
212
|
(
|
|
157
213
|
_,
|
|
@@ -217,11 +273,11 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
217
273
|
if uids is None:
|
|
218
274
|
uids = []
|
|
219
275
|
for item in items:
|
|
220
|
-
if
|
|
276
|
+
if mm_schemas.EventFieldType.UID not in item:
|
|
221
277
|
# This is kept for backwards compatibility - in old versions the key column named endpoint_id
|
|
222
|
-
uids.append(item[
|
|
278
|
+
uids.append(item[mm_schemas.EventFieldType.ENDPOINT_ID])
|
|
223
279
|
else:
|
|
224
|
-
uids.append(item[
|
|
280
|
+
uids.append(item[mm_schemas.EventFieldType.UID])
|
|
225
281
|
|
|
226
282
|
# Add each relevant model endpoint to the model endpoints list
|
|
227
283
|
for endpoint_id in uids:
|
|
@@ -241,11 +297,11 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
241
297
|
|
|
242
298
|
# Delete model endpoint record from KV table
|
|
243
299
|
for endpoint_dict in endpoints:
|
|
244
|
-
if
|
|
300
|
+
if mm_schemas.EventFieldType.UID not in endpoint_dict:
|
|
245
301
|
# This is kept for backwards compatibility - in old versions the key column named endpoint_id
|
|
246
|
-
endpoint_id = endpoint_dict[
|
|
302
|
+
endpoint_id = endpoint_dict[mm_schemas.EventFieldType.ENDPOINT_ID]
|
|
247
303
|
else:
|
|
248
|
-
endpoint_id = endpoint_dict[
|
|
304
|
+
endpoint_id = endpoint_dict[mm_schemas.EventFieldType.UID]
|
|
249
305
|
self.delete_model_endpoint(
|
|
250
306
|
endpoint_id,
|
|
251
307
|
)
|
|
@@ -268,11 +324,19 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
268
324
|
raise_for_status=v3io.dataplane.RaiseForStatus.never,
|
|
269
325
|
)
|
|
270
326
|
|
|
327
|
+
@staticmethod
|
|
328
|
+
def _get_results_table_path(endpoint_id: str) -> str:
|
|
329
|
+
return endpoint_id
|
|
330
|
+
|
|
331
|
+
@staticmethod
|
|
332
|
+
def _get_metrics_table_path(endpoint_id: str) -> str:
|
|
333
|
+
return f"{endpoint_id}_metrics"
|
|
334
|
+
|
|
271
335
|
def write_application_event(
|
|
272
336
|
self,
|
|
273
337
|
event: dict[str, typing.Any],
|
|
274
|
-
kind:
|
|
275
|
-
):
|
|
338
|
+
kind: mm_schemas.WriterEventKind = mm_schemas.WriterEventKind.RESULT,
|
|
339
|
+
) -> None:
|
|
276
340
|
"""
|
|
277
341
|
Write a new application event in the target table.
|
|
278
342
|
|
|
@@ -281,66 +345,63 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
281
345
|
object.
|
|
282
346
|
:param kind: The type of the event, can be either "result" or "metric".
|
|
283
347
|
"""
|
|
284
|
-
if kind == mm_constants.WriterEventKind.METRIC:
|
|
285
|
-
# TODO : Implement the logic for writing metrics to KV
|
|
286
|
-
return
|
|
287
|
-
|
|
288
|
-
endpoint_id = event.pop(mm_constants.WriterEvent.ENDPOINT_ID)
|
|
289
|
-
app_name = event.pop(mm_constants.WriterEvent.APPLICATION_NAME)
|
|
290
|
-
metric_name = event.pop(mm_constants.ResultData.RESULT_NAME)
|
|
291
|
-
attributes = {metric_name: json.dumps(event)}
|
|
292
348
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
349
|
+
container = self.get_v3io_monitoring_apps_container(project_name=self.project)
|
|
350
|
+
endpoint_id = event.pop(mm_schemas.WriterEvent.ENDPOINT_ID)
|
|
351
|
+
|
|
352
|
+
if kind == mm_schemas.WriterEventKind.METRIC:
|
|
353
|
+
table_path = self._get_metrics_table_path(endpoint_id)
|
|
354
|
+
key = f"{event[mm_schemas.WriterEvent.APPLICATION_NAME]}.{event[mm_schemas.MetricData.METRIC_NAME]}"
|
|
355
|
+
attributes = {event_key: event[event_key] for event_key in _METRIC_FIELDS}
|
|
356
|
+
elif kind == mm_schemas.WriterEventKind.RESULT:
|
|
357
|
+
table_path = self._get_results_table_path(endpoint_id)
|
|
358
|
+
key = event.pop(mm_schemas.WriterEvent.APPLICATION_NAME)
|
|
359
|
+
metric_name = event.pop(mm_schemas.ResultData.RESULT_NAME)
|
|
360
|
+
attributes = {metric_name: json.dumps(event)}
|
|
361
|
+
else:
|
|
362
|
+
raise ValueError(f"Invalid {kind = }")
|
|
296
363
|
|
|
297
364
|
self.client.kv.update(
|
|
298
|
-
container=
|
|
299
|
-
table_path=
|
|
300
|
-
key=
|
|
365
|
+
container=container,
|
|
366
|
+
table_path=table_path,
|
|
367
|
+
key=key,
|
|
301
368
|
attributes=attributes,
|
|
302
369
|
)
|
|
303
370
|
|
|
304
371
|
schema_file = self.client.kv.new_cursor(
|
|
305
|
-
container=
|
|
306
|
-
table_path=
|
|
372
|
+
container=container,
|
|
373
|
+
table_path=table_path,
|
|
307
374
|
filter_expression='__name==".#schema"',
|
|
308
375
|
)
|
|
309
376
|
|
|
310
377
|
if not schema_file.all():
|
|
311
378
|
logger.info(
|
|
312
|
-
"
|
|
313
|
-
container=
|
|
314
|
-
|
|
379
|
+
"Generating a new V3IO KV schema file",
|
|
380
|
+
container=container,
|
|
381
|
+
table_path=table_path,
|
|
382
|
+
)
|
|
383
|
+
self._generate_kv_schema(
|
|
384
|
+
container=container, table_path=table_path, kind=kind
|
|
315
385
|
)
|
|
316
|
-
|
|
317
|
-
logger.info("Updated V3IO KV successfully", key=app_name)
|
|
386
|
+
logger.info("Updated V3IO KV successfully", key=key)
|
|
318
387
|
|
|
319
388
|
def _generate_kv_schema(
|
|
320
|
-
self,
|
|
321
|
-
):
|
|
389
|
+
self, *, container: str, table_path: str, kind: mm_schemas.WriterEventKind
|
|
390
|
+
) -> None:
|
|
322
391
|
"""Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
|
|
323
|
-
|
|
324
|
-
{
|
|
325
|
-
"name": mm_constants.ResultData.RESULT_NAME,
|
|
326
|
-
"type": "string",
|
|
327
|
-
"nullable": False,
|
|
328
|
-
}
|
|
329
|
-
]
|
|
392
|
+
schema_params = _KIND_TO_SCHEMA_PARAMS[kind]
|
|
330
393
|
res = self.client.kv.create_schema(
|
|
331
|
-
container=
|
|
332
|
-
table_path=
|
|
333
|
-
key=
|
|
334
|
-
fields=fields,
|
|
394
|
+
container=container,
|
|
395
|
+
table_path=table_path,
|
|
396
|
+
key=schema_params.key,
|
|
397
|
+
fields=schema_params.fields,
|
|
335
398
|
)
|
|
336
399
|
if res.status_code != HTTPStatus.OK:
|
|
337
400
|
raise mlrun.errors.MLRunBadRequestError(
|
|
338
|
-
f"Couldn't infer schema for endpoint {
|
|
401
|
+
f"Couldn't infer schema for endpoint {table_path} which is required for Grafana dashboards"
|
|
339
402
|
)
|
|
340
403
|
else:
|
|
341
|
-
logger.info(
|
|
342
|
-
"Generated V3IO KV schema successfully", endpoint_id=endpoint_id
|
|
343
|
-
)
|
|
404
|
+
logger.info("Generated V3IO KV schema successfully", table_path=table_path)
|
|
344
405
|
|
|
345
406
|
def get_last_analyzed(self, endpoint_id: str, application_name: str) -> int:
|
|
346
407
|
"""
|
|
@@ -361,7 +422,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
361
422
|
table_path=endpoint_id,
|
|
362
423
|
key=application_name,
|
|
363
424
|
)
|
|
364
|
-
return data.output.item[
|
|
425
|
+
return data.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
|
|
365
426
|
except v3io.dataplane.response.HttpResponseError as err:
|
|
366
427
|
logger.debug("Error while getting last analyzed time", err=err)
|
|
367
428
|
raise mlrun.errors.MLRunNotFoundError(
|
|
@@ -386,7 +447,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
386
447
|
),
|
|
387
448
|
table_path=endpoint_id,
|
|
388
449
|
key=application_name,
|
|
389
|
-
attributes={
|
|
450
|
+
attributes={mm_schemas.SchedulingKeys.LAST_ANALYZED: last_analyzed},
|
|
390
451
|
)
|
|
391
452
|
|
|
392
453
|
def _generate_tsdb_paths(self) -> tuple[str, str]:
|
|
@@ -399,7 +460,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
399
460
|
full_path = (
|
|
400
461
|
mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
|
|
401
462
|
project=self.project,
|
|
402
|
-
kind=
|
|
463
|
+
kind=mm_schemas.ModelMonitoringStoreKinds.EVENTS,
|
|
403
464
|
)
|
|
404
465
|
)
|
|
405
466
|
|
|
@@ -495,8 +556,8 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
495
556
|
# Apply top_level filter (remove endpoints that considered a child of a router)
|
|
496
557
|
if top_level:
|
|
497
558
|
filter_expression.append(
|
|
498
|
-
f"(endpoint_type=='{str(
|
|
499
|
-
f"OR endpoint_type=='{str(
|
|
559
|
+
f"(endpoint_type=='{str(mm_schemas.EndpointType.NODE_EP.value)}' "
|
|
560
|
+
f"OR endpoint_type=='{str(mm_schemas.EndpointType.ROUTER.value)}')"
|
|
500
561
|
)
|
|
501
562
|
|
|
502
563
|
return " AND ".join(filter_expression)
|
|
@@ -516,30 +577,30 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
516
577
|
# Validate default value for `error_count`
|
|
517
578
|
# For backwards compatibility reasons, we validate that the model endpoint includes the `error_count` key
|
|
518
579
|
if (
|
|
519
|
-
|
|
520
|
-
and endpoint[
|
|
580
|
+
mm_schemas.EventFieldType.ERROR_COUNT in endpoint
|
|
581
|
+
and endpoint[mm_schemas.EventFieldType.ERROR_COUNT] == "null"
|
|
521
582
|
):
|
|
522
|
-
endpoint[
|
|
583
|
+
endpoint[mm_schemas.EventFieldType.ERROR_COUNT] = "0"
|
|
523
584
|
|
|
524
585
|
# Validate default value for `metrics`
|
|
525
586
|
# For backwards compatibility reasons, we validate that the model endpoint includes the `metrics` key
|
|
526
587
|
if (
|
|
527
|
-
|
|
528
|
-
and endpoint[
|
|
588
|
+
mm_schemas.EventFieldType.METRICS in endpoint
|
|
589
|
+
and endpoint[mm_schemas.EventFieldType.METRICS] == "null"
|
|
529
590
|
):
|
|
530
|
-
endpoint[
|
|
591
|
+
endpoint[mm_schemas.EventFieldType.METRICS] = json.dumps(
|
|
531
592
|
{
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
593
|
+
mm_schemas.EventKeyMetrics.GENERIC: {
|
|
594
|
+
mm_schemas.EventLiveStats.LATENCY_AVG_1H: 0,
|
|
595
|
+
mm_schemas.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
|
|
535
596
|
}
|
|
536
597
|
}
|
|
537
598
|
)
|
|
538
599
|
# Validate key `uid` instead of `endpoint_id`
|
|
539
600
|
# For backwards compatibility reasons, we replace the `endpoint_id` with `uid` which is the updated key name
|
|
540
|
-
if
|
|
541
|
-
endpoint[
|
|
542
|
-
|
|
601
|
+
if mm_schemas.EventFieldType.ENDPOINT_ID in endpoint:
|
|
602
|
+
endpoint[mm_schemas.EventFieldType.UID] = endpoint[
|
|
603
|
+
mm_schemas.EventFieldType.ENDPOINT_ID
|
|
543
604
|
]
|
|
544
605
|
|
|
545
606
|
@staticmethod
|
|
@@ -566,57 +627,95 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
566
627
|
def _get_monitoring_schedules_container(project_name: str) -> str:
|
|
567
628
|
return f"users/pipelines/{project_name}/monitoring-schedules/functions"
|
|
568
629
|
|
|
569
|
-
def
|
|
630
|
+
def _extract_results_from_items(
|
|
570
631
|
self, app_items: list[dict[str, str]]
|
|
571
|
-
) -> list[
|
|
572
|
-
|
|
632
|
+
) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
633
|
+
"""Assuming .#schema items are filtered out"""
|
|
634
|
+
metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
|
|
573
635
|
for app_item in app_items:
|
|
574
|
-
# See https://www.iguazio.com/docs/latest-release/services/data-layer/reference/system-attributes/#sys-attr-__name
|
|
575
636
|
app_name = app_item.pop("__name")
|
|
576
|
-
if app_name == ".#schema":
|
|
577
|
-
continue
|
|
578
637
|
for result_name in app_item:
|
|
579
638
|
metrics.append(
|
|
580
|
-
|
|
639
|
+
mm_schemas.ModelEndpointMonitoringMetric(
|
|
581
640
|
project=self.project,
|
|
582
641
|
app=app_name,
|
|
583
|
-
type=
|
|
642
|
+
type=mm_schemas.ModelEndpointMonitoringMetricType.RESULT,
|
|
584
643
|
name=result_name,
|
|
585
|
-
full_name=
|
|
644
|
+
full_name=mm_schemas.model_endpoints._compose_full_name(
|
|
586
645
|
project=self.project, app=app_name, name=result_name
|
|
587
646
|
),
|
|
588
647
|
)
|
|
589
648
|
)
|
|
590
649
|
return metrics
|
|
591
650
|
|
|
651
|
+
def _extract_metrics_from_items(
|
|
652
|
+
self, result_items: list[dict[str, str]]
|
|
653
|
+
) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
654
|
+
metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
|
|
655
|
+
logger.debug("Result items", result_items=result_items)
|
|
656
|
+
for result_item in result_items:
|
|
657
|
+
app = result_item[mm_schemas.WriterEvent.APPLICATION_NAME]
|
|
658
|
+
name = result_item[mm_schemas.MetricData.METRIC_NAME]
|
|
659
|
+
metrics.append(
|
|
660
|
+
mm_schemas.ModelEndpointMonitoringMetric(
|
|
661
|
+
project=self.project,
|
|
662
|
+
app=app,
|
|
663
|
+
type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
|
|
664
|
+
name=name,
|
|
665
|
+
full_name=mm_schemas.model_endpoints._compose_full_name(
|
|
666
|
+
project=self.project,
|
|
667
|
+
app=app,
|
|
668
|
+
name=name,
|
|
669
|
+
type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
|
|
670
|
+
),
|
|
671
|
+
)
|
|
672
|
+
)
|
|
673
|
+
return metrics
|
|
674
|
+
|
|
592
675
|
def get_model_endpoint_metrics(
|
|
593
|
-
self, endpoint_id: str
|
|
594
|
-
) -> list[
|
|
676
|
+
self, endpoint_id: str, type: mm_schemas.ModelEndpointMonitoringMetricType
|
|
677
|
+
) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
595
678
|
"""Get model monitoring results and metrics on the endpoint"""
|
|
596
|
-
metrics: list[
|
|
679
|
+
metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
|
|
597
680
|
container = self.get_v3io_monitoring_apps_container(self.project)
|
|
681
|
+
if type == mm_schemas.ModelEndpointMonitoringMetricType.METRIC:
|
|
682
|
+
table_path = self._get_metrics_table_path(endpoint_id)
|
|
683
|
+
items_extractor = self._extract_metrics_from_items
|
|
684
|
+
elif type == mm_schemas.ModelEndpointMonitoringMetricType.RESULT:
|
|
685
|
+
table_path = self._get_results_table_path(endpoint_id)
|
|
686
|
+
items_extractor = self._extract_results_from_items
|
|
687
|
+
else:
|
|
688
|
+
raise ValueError(f"Invalid metric {type = }")
|
|
689
|
+
|
|
690
|
+
def scan(
|
|
691
|
+
marker: typing.Optional[str] = None,
|
|
692
|
+
) -> v3io.dataplane.response.Response:
|
|
693
|
+
# TODO: Use AIO client: `v3io.aio.dataplane.client.Client`
|
|
694
|
+
return self.client.kv.scan(
|
|
695
|
+
container=container,
|
|
696
|
+
table_path=table_path,
|
|
697
|
+
marker=marker,
|
|
698
|
+
filter_expression=_EXCLUDE_SCHEMA_FILTER_EXPRESSION,
|
|
699
|
+
)
|
|
700
|
+
|
|
598
701
|
try:
|
|
599
|
-
response =
|
|
702
|
+
response = scan()
|
|
600
703
|
except v3io.dataplane.response.HttpResponseError as err:
|
|
601
704
|
if err.status_code == HTTPStatus.NOT_FOUND:
|
|
602
705
|
logger.warning(
|
|
603
|
-
"Attempt getting
|
|
706
|
+
f"Attempt getting {type}s - no data. Check the "
|
|
604
707
|
"project name, endpoint, or wait for the applications to start.",
|
|
605
708
|
container=container,
|
|
606
|
-
table_path=
|
|
709
|
+
table_path=table_path,
|
|
607
710
|
)
|
|
608
711
|
return []
|
|
609
712
|
raise
|
|
610
713
|
|
|
611
714
|
while True:
|
|
612
|
-
|
|
613
|
-
|
|
715
|
+
output = typing.cast(v3io.dataplane.output.GetItemsOutput, response.output)
|
|
716
|
+
metrics.extend(items_extractor(output.items))
|
|
717
|
+
if output.last:
|
|
614
718
|
break
|
|
615
|
-
|
|
616
|
-
response = self.client.kv.scan(
|
|
617
|
-
container=container,
|
|
618
|
-
table_path=endpoint_id,
|
|
619
|
-
marker=response.output.next_marker,
|
|
620
|
-
)
|
|
719
|
+
response = scan(marker=output.next_marker)
|
|
621
720
|
|
|
622
721
|
return metrics
|
|
@@ -25,6 +25,7 @@ class ObjectTSDBFactory(enum.Enum):
|
|
|
25
25
|
"""Enum class to handle the different TSDB connector type values for storing real time metrics"""
|
|
26
26
|
|
|
27
27
|
v3io_tsdb = "v3io-tsdb"
|
|
28
|
+
tdengine = "tdengine"
|
|
28
29
|
|
|
29
30
|
def to_tsdb_connector(self, project: str, **kwargs) -> TSDBConnector:
|
|
30
31
|
"""
|
|
@@ -43,6 +44,13 @@ class ObjectTSDBFactory(enum.Enum):
|
|
|
43
44
|
|
|
44
45
|
return V3IOTSDBConnector(project=project, **kwargs)
|
|
45
46
|
|
|
47
|
+
# Assuming TDEngine connector if connector type is not V3IO TSDB.
|
|
48
|
+
# Update these lines once there are more than two connector types.
|
|
49
|
+
|
|
50
|
+
from .tdengine.tdengine_connector import TDEngineConnector
|
|
51
|
+
|
|
52
|
+
return TDEngineConnector(project=project, **kwargs)
|
|
53
|
+
|
|
46
54
|
@classmethod
|
|
47
55
|
def _missing_(cls, value: typing.Any):
|
|
48
56
|
"""A lookup function to handle an invalid value.
|
|
@@ -54,18 +62,39 @@ class ObjectTSDBFactory(enum.Enum):
|
|
|
54
62
|
)
|
|
55
63
|
|
|
56
64
|
|
|
57
|
-
def get_tsdb_connector(
|
|
65
|
+
def get_tsdb_connector(
|
|
66
|
+
project: str,
|
|
67
|
+
tsdb_connector_type: str = "",
|
|
68
|
+
secret_provider: typing.Callable = None,
|
|
69
|
+
**kwargs,
|
|
70
|
+
) -> TSDBConnector:
|
|
58
71
|
"""
|
|
59
|
-
Get
|
|
72
|
+
Get TSDB connector object.
|
|
60
73
|
:param project: The name of the project.
|
|
74
|
+
:param tsdb_connector_type: The type of the TSDB connector. See mlrun.model_monitoring.db.tsdb.ObjectTSDBFactory
|
|
75
|
+
for available options.
|
|
76
|
+
:param secret_provider: An optional secret provider to get the connection string secret.
|
|
77
|
+
|
|
61
78
|
:return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
|
|
62
79
|
TSDB connector such as updating drift metrics or write application record result.
|
|
63
80
|
"""
|
|
64
81
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type
|
|
82
|
+
tsdb_connection_string = mlrun.model_monitoring.helpers.get_tsdb_connection_string(
|
|
83
|
+
secret_provider=secret_provider
|
|
68
84
|
)
|
|
69
85
|
|
|
86
|
+
if tsdb_connection_string and tsdb_connection_string.startswith("taosws"):
|
|
87
|
+
tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
|
|
88
|
+
kwargs["connection_string"] = tsdb_connection_string
|
|
89
|
+
|
|
90
|
+
# Set the default TSDB connector type if no connection has been set
|
|
91
|
+
tsdb_connector_type = (
|
|
92
|
+
tsdb_connector_type
|
|
93
|
+
or mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# Get connector type value from ObjectTSDBFactory enum class
|
|
97
|
+
tsdb_connector_factory = ObjectTSDBFactory(tsdb_connector_type)
|
|
98
|
+
|
|
70
99
|
# Convert into TSDB connector object
|
|
71
|
-
return
|
|
100
|
+
return tsdb_connector_factory.to_tsdb_connector(project=project, **kwargs)
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
#
|
|
15
15
|
|
|
16
16
|
|
|
17
|
+
import typing
|
|
17
18
|
from abc import ABC
|
|
18
19
|
|
|
19
20
|
import pandas as pd
|
|
@@ -22,6 +23,8 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
class TSDBConnector(ABC):
|
|
26
|
+
type: str = ""
|
|
27
|
+
|
|
25
28
|
def __init__(self, project: str):
|
|
26
29
|
"""
|
|
27
30
|
Initialize a new TSDB connector. The connector is used to interact with the TSDB and store monitoring data.
|
|
@@ -57,13 +60,12 @@ class TSDBConnector(ABC):
|
|
|
57
60
|
self,
|
|
58
61
|
event: dict,
|
|
59
62
|
kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
|
|
60
|
-
):
|
|
63
|
+
) -> None:
|
|
61
64
|
"""
|
|
62
65
|
Write a single application or metric to TSDB.
|
|
63
66
|
|
|
64
67
|
:raise mlrun.errors.MLRunRuntimeError: If an error occurred while writing the event.
|
|
65
68
|
"""
|
|
66
|
-
pass
|
|
67
69
|
|
|
68
70
|
def delete_tsdb_resources(self):
|
|
69
71
|
"""
|
|
@@ -76,8 +78,8 @@ class TSDBConnector(ABC):
|
|
|
76
78
|
self,
|
|
77
79
|
endpoint_id: str,
|
|
78
80
|
metrics: list[str],
|
|
79
|
-
start: str
|
|
80
|
-
end: str
|
|
81
|
+
start: str,
|
|
82
|
+
end: str,
|
|
81
83
|
) -> dict[str, list[tuple[str, float]]]:
|
|
82
84
|
"""
|
|
83
85
|
Getting real time metrics from the TSDB. There are pre-defined metrics for model endpoints such as
|
|
@@ -101,35 +103,40 @@ class TSDBConnector(ABC):
|
|
|
101
103
|
def get_records(
|
|
102
104
|
self,
|
|
103
105
|
table: str,
|
|
104
|
-
|
|
106
|
+
start: str,
|
|
107
|
+
end: str,
|
|
108
|
+
columns: typing.Optional[list[str]] = None,
|
|
105
109
|
filter_query: str = "",
|
|
106
|
-
start: str = "now-1h",
|
|
107
|
-
end: str = "now",
|
|
108
110
|
) -> pd.DataFrame:
|
|
109
111
|
"""
|
|
110
112
|
Getting records from TSDB data collection.
|
|
111
113
|
:param table: Table name, e.g. 'metrics', 'app_results'.
|
|
114
|
+
:param start: The start time of the metrics.
|
|
115
|
+
If using V3IO, can be represented by a string containing an RFC 3339 time, a Unix
|
|
116
|
+
timestamp in milliseconds, a relative time (`'now'` or `'now-[0-9]+[mhd]'`, where
|
|
117
|
+
`m` = minutes, `h` = hours, `'d'` = days, and `'s'` = seconds), or 0 for the earliest
|
|
118
|
+
time.
|
|
119
|
+
If using TDEngine, can be represented by datetime.
|
|
120
|
+
:param end: The end time of the metrics.
|
|
121
|
+
If using V3IO, can be represented by a string containing an RFC 3339 time, a Unix
|
|
122
|
+
timestamp in milliseconds, a relative time (`'now'` or `'now-[0-9]+[mhd]'`, where
|
|
123
|
+
`m` = minutes, `h` = hours, `'d'` = days, and `'s'` = seconds), or 0 for the earliest
|
|
124
|
+
time.
|
|
125
|
+
If using TDEngine, can be represented by datetime.
|
|
112
126
|
:param columns: Columns to include in the result.
|
|
113
127
|
:param filter_query: Optional filter expression as a string. The filter structure depends on the TSDB
|
|
114
128
|
connector type.
|
|
115
|
-
|
|
116
|
-
3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
|
|
117
|
-
`'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
|
|
118
|
-
= seconds), or 0 for the earliest time.
|
|
119
|
-
:param end: The end time of the metrics. Can be represented by a string containing an RFC
|
|
120
|
-
3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
|
|
121
|
-
`'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and `'s'`
|
|
122
|
-
= seconds), or 0 for the earliest time.
|
|
129
|
+
|
|
123
130
|
|
|
124
131
|
:return: DataFrame with the provided attributes from the data collection.
|
|
125
132
|
:raise: MLRunNotFoundError if the provided table wasn't found.
|
|
126
133
|
"""
|
|
127
134
|
pass
|
|
128
135
|
|
|
129
|
-
def
|
|
136
|
+
def create_tables(self) -> None:
|
|
130
137
|
"""
|
|
131
|
-
Create the
|
|
138
|
+
Create the TSDB tables using the TSDB connector. At the moment we support 3 types of tables:
|
|
132
139
|
- app_results: a detailed result that includes status, kind, extra data, etc.
|
|
133
140
|
- metrics: a basic key value that represents a numeric metric.
|
|
141
|
+
- predictions: latency of each prediction.
|
|
134
142
|
"""
|
|
135
|
-
pass
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Copyright 2024 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from .tdengine_connector import TDEngineConnector
|