mlrun 1.7.0rc18__py3-none-any.whl → 1.7.0rc19__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__main__.py +5 -2
- mlrun/common/constants.py +64 -3
- mlrun/common/formatters/__init__.py +16 -0
- mlrun/common/formatters/base.py +59 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/runtimes/constants.py +29 -4
- mlrun/common/schemas/__init__.py +0 -1
- mlrun/common/schemas/api_gateway.py +52 -0
- mlrun/common/schemas/frontend_spec.py +1 -0
- mlrun/common/schemas/model_monitoring/__init__.py +6 -3
- mlrun/common/schemas/model_monitoring/constants.py +2 -7
- mlrun/config.py +7 -2
- mlrun/datastore/sources.py +16 -22
- mlrun/datastore/store_resources.py +5 -1
- mlrun/datastore/targets.py +3 -2
- mlrun/datastore/utils.py +42 -0
- mlrun/execution.py +16 -6
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/frameworks/parallel_coordinates.py +2 -1
- mlrun/frameworks/tf_keras/__init__.py +4 -1
- mlrun/launcher/client.py +4 -2
- mlrun/launcher/local.py +8 -2
- mlrun/launcher/remote.py +8 -2
- mlrun/model.py +5 -1
- mlrun/model_monitoring/db/stores/__init__.py +0 -2
- mlrun/model_monitoring/db/stores/base/store.py +1 -2
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +43 -21
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +32 -2
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +25 -5
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +5 -0
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +207 -139
- mlrun/model_monitoring/db/tsdb/__init__.py +1 -1
- mlrun/model_monitoring/db/tsdb/base.py +225 -38
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +48 -15
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +182 -16
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +229 -42
- mlrun/model_monitoring/helpers.py +13 -0
- mlrun/model_monitoring/writer.py +36 -11
- mlrun/projects/operations.py +8 -5
- mlrun/projects/pipelines.py +42 -15
- mlrun/projects/project.py +22 -6
- mlrun/runtimes/base.py +2 -1
- mlrun/runtimes/local.py +4 -1
- mlrun/runtimes/nuclio/api_gateway.py +32 -8
- mlrun/runtimes/nuclio/application/application.py +3 -3
- mlrun/runtimes/nuclio/function.py +1 -4
- mlrun/runtimes/utils.py +5 -6
- mlrun/serving/server.py +2 -1
- mlrun/utils/helpers.py +8 -6
- mlrun/utils/logger.py +28 -1
- mlrun/utils/notifications/notification/__init__.py +14 -9
- mlrun/utils/notifications/notification_pusher.py +10 -3
- mlrun/utils/v3io_clients.py +0 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc18.dist-info → mlrun-1.7.0rc19.dist-info}/METADATA +3 -3
- {mlrun-1.7.0rc18.dist-info → mlrun-1.7.0rc19.dist-info}/RECORD +62 -59
- mlrun/model_monitoring/db/v3io_tsdb_reader.py +0 -335
- {mlrun-1.7.0rc18.dist-info → mlrun-1.7.0rc19.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc18.dist-info → mlrun-1.7.0rc19.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc18.dist-info → mlrun-1.7.0rc19.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc18.dist-info → mlrun-1.7.0rc19.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py
CHANGED
@@ -11,13 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+
 import typing
+from datetime import datetime
+from io import StringIO
+from typing import Literal, Optional, Union
 
 import pandas as pd
 import v3io_frames.client
 import v3io_frames.errors
-from v3io.dataplane import Client as V3IOClient
 from v3io_frames.frames_pb2 import IGNORE
 
 import mlrun.common.model_monitoring
@@ -25,10 +27,12 @@ import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.feature_store.steps
 import mlrun.utils.v3io_clients
 from mlrun.model_monitoring.db import TSDBConnector
+from mlrun.model_monitoring.helpers import get_invocations_fqn
 from mlrun.utils import logger
 
 _TSDB_BE = "tsdb"
 _TSDB_RATE = "1/s"
+_CONTAINER = "users"
 
 
 class V3IOTSDBConnector(TSDBConnector):
@@ -42,13 +46,11 @@ class V3IOTSDBConnector(TSDBConnector):
     def __init__(
         self,
         project: str,
-
-        container: str = "users",
+        container: str = _CONTAINER,
         v3io_framesd: typing.Optional[str] = None,
         create_table: bool = False,
-    ):
+    ) -> None:
         super().__init__(project=project)
-        self.access_key = access_key or mlrun.mlconf.get_v3io_access_key()
 
         self.container = container
 
@@ -56,9 +58,6 @@ class V3IOTSDBConnector(TSDBConnector):
         self._frames_client: v3io_frames.client.ClientBase = (
             self._get_v3io_frames_client(self.container)
         )
-        self._v3io_client: V3IOClient = mlrun.utils.v3io_clients.get_v3io_client(
-            endpoint=mlrun.mlconf.v3io_api,
-        )
 
         self._init_tables_path()
 
@@ -254,7 +253,7 @@ class V3IOTSDBConnector(TSDBConnector):
     ) -> None:
         """Write a single result or metric to TSDB"""
 
-        event[mm_schemas.WriterEvent.END_INFER_TIME] = datetime.
+        event[mm_schemas.WriterEvent.END_INFER_TIME] = datetime.fromisoformat(
             event[mm_schemas.WriterEvent.END_INFER_TIME]
         )
         index_cols_base = [
@@ -299,12 +298,9 @@ class V3IOTSDBConnector(TSDBConnector):
         else:
             # Delete all tables
             tables = mm_schemas.V3IOTSDBTables.list()
-            for
+            for table_to_delete in tables:
                 try:
-                    self._frames_client.delete(
-                        backend=mlrun.common.schemas.model_monitoring.TimeSeriesConnector.TSDB,
-                        table=table,
-                    )
+                    self._frames_client.delete(backend=_TSDB_BE, table=table_to_delete)
                 except v3io_frames.errors.DeleteError as e:
                     logger.warning(
                         f"Failed to delete TSDB table '{table}'",
@@ -318,11 +314,7 @@ class V3IOTSDBConnector(TSDBConnector):
         store.rm(tsdb_path, recursive=True)
 
     def get_model_endpoint_real_time_metrics(
-        self,
-        endpoint_id: str,
-        metrics: list[str],
-        start: str,
-        end: str,
+        self, endpoint_id: str, metrics: list[str], start: str, end: str
     ) -> dict[str, list[tuple[str, float]]]:
         """
         Getting real time metrics from the TSDB. There are pre-defined metrics for model endpoints such as
@@ -350,7 +342,7 @@ class V3IOTSDBConnector(TSDBConnector):
         metrics_mapping = {}
 
         try:
-            data = self.
+            data = self._get_records(
                 table=mm_schemas.V3IOTSDBTables.EVENTS,
                 columns=["endpoint_id", *metrics],
                 filter_query=f"endpoint_id=='{endpoint_id}'",
@@ -375,45 +367,74 @@ class V3IOTSDBConnector(TSDBConnector):
 
         return metrics_mapping
 
-    def
+    def _get_records(
         self,
         table: str,
-        start: str,
-        end: str,
+        start: Union[datetime, str],
+        end: Union[datetime, str],
         columns: typing.Optional[list[str]] = None,
         filter_query: str = "",
+        interval: typing.Optional[str] = None,
+        agg_funcs: typing.Optional[list] = None,
+        limit: typing.Optional[int] = None,
+        sliding_window_step: typing.Optional[str] = None,
+        **kwargs,
     ) -> pd.DataFrame:
         """
         Getting records from V3IO TSDB data collection.
-        :param table:
-        :param start:
-
-
-
-        :param end:
-
-
-
-        :param columns:
-        :param filter_query:
-
+        :param table:               Path to the collection to query.
+        :param start:               The start time of the metrics. Can be represented by a string containing an RFC
+                                    3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                                    `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and
+                                    `'s'` = seconds), or 0 for the earliest time.
+        :param end:                 The end time of the metrics. Can be represented by a string containing an RFC
+                                    3339 time, a Unix timestamp in milliseconds, a relative time (`'now'` or
+                                    `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and
+                                    `'s'` = seconds), or 0 for the earliest time.
+        :param columns:             Columns to include in the result.
+        :param filter_query:        V3IO filter expression. The expected filter expression includes different
+                                    conditions, divided by ' AND '.
+        :param interval:            The interval to aggregate the data by. Note that if interval is provided,
+                                    agg_funcs must bg provided as well. Provided as a string in the format of '1m',
+                                    '1h', etc.
+        :param agg_funcs:           The aggregation functions to apply on the columns. Note that if `agg_funcs` is
+                                    provided, `interval` must bg provided as well. Provided as a list of strings in
+                                    the format of ['sum', 'avg', 'count', ...].
+        :param limit:               The maximum number of records to return.
+        :param sliding_window_step: The time step for which the time window moves forward. Note that if
+                                    `sliding_window_step` is provided, interval must be provided as well. Provided
+                                    as a string in the format of '1m', '1h', etc.
+        :param kwargs:              Additional keyword arguments passed to the read method of frames client.
         :return: DataFrame with the provided attributes from the data collection.
         :raise:  MLRunNotFoundError if the provided table wasn't found.
         """
         if table not in self.tables:
             raise mlrun.errors.MLRunNotFoundError(
-                f"Table '{table}' does not exist in the tables list of the TSDB connector."
+                f"Table '{table}' does not exist in the tables list of the TSDB connector. "
                 f"Available tables: {list(self.tables.keys())}"
             )
-
-
-
-
-
+
+        if agg_funcs:
+            # Frames client expects the aggregators to be a comma-separated string
+            agg_funcs = ",".join(agg_funcs)
+        table_path = self.tables[table]
+        df = self._frames_client.read(
+            backend=_TSDB_BE,
+            table=table_path,
            start=start,
            end=end,
+            columns=columns,
+            filter=filter_query,
+            aggregation_window=interval,
+            aggregators=agg_funcs,
+            step=sliding_window_step,
+            **kwargs,
        )
 
+        if limit:
+            df = df.head(limit)
+        return df
+
     def _get_v3io_source_directory(self) -> str:
         """
         Get the V3IO source directory for the current project. Usually the source directory will
@@ -441,3 +462,169 @@ class V3IOTSDBConnector(TSDBConnector):
             address=mlrun.mlconf.v3io_framesd,
             container=v3io_container,
         )
+
+    def read_metrics_data(
+        self,
+        *,
+        endpoint_id: str,
+        start: datetime,
+        end: datetime,
+        metrics: list[mm_schemas.ModelEndpointMonitoringMetric],
+        type: Literal["metrics", "results"] = "results",
+    ) -> Union[
+        list[
+            Union[
+                mm_schemas.ModelEndpointMonitoringResultValues,
+                mm_schemas.ModelEndpointMonitoringMetricNoData,
+            ],
+        ],
+        list[
+            Union[
+                mm_schemas.ModelEndpointMonitoringMetricValues,
+                mm_schemas.ModelEndpointMonitoringMetricNoData,
+            ],
+        ],
+    ]:
+        """
+        Read metrics OR results from the TSDB and return as a list.
+        Note: the type must match the actual metrics in the `metrics` parameter.
+        If the type is "results", pass only results in the `metrics` parameter.
+        """
+
+        if type == "metrics":
+            table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
+            name = mm_schemas.MetricData.METRIC_NAME
+            df_handler = self.df_to_metrics_values
+        elif type == "results":
+            table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
+            name = mm_schemas.ResultData.RESULT_NAME
+            df_handler = self.df_to_results_values
+        else:
+            raise ValueError(f"Invalid {type = }")
+
+        query = self._get_sql_query(
+            endpoint_id,
+            [(metric.app, metric.name) for metric in metrics],
+            table_path=table_path,
+            name=name,
+        )
+
+        logger.debug("Querying V3IO TSDB", query=query)
+
+        df: pd.DataFrame = self._frames_client.read(
+            backend=_TSDB_BE,
+            start=start,
+            end=end,
+            query=query,  # the filter argument does not work for this complex condition
+        )
+
+        logger.debug(
+            "Converting a DataFrame to a list of metrics or results values",
+            table=table_path,
+            project=self.project,
+            endpoint_id=endpoint_id,
+            is_empty=df.empty,
+        )
+
+        return df_handler(df=df, metrics=metrics, project=self.project)
+
+    @staticmethod
+    def _get_sql_query(
+        endpoint_id: str,
+        names: list[tuple[str, str]],
+        table_path: str,
+        name: str = mm_schemas.ResultData.RESULT_NAME,
+    ) -> str:
+        """Get the SQL query for the results/metrics table"""
+        with StringIO() as query:
+            query.write(
+                f"SELECT * FROM '{table_path}' "
+                f"WHERE {mm_schemas.WriterEvent.ENDPOINT_ID}='{endpoint_id}'"
+            )
+            if names:
+                query.write(" AND (")
+
+                for i, (app_name, result_name) in enumerate(names):
+                    sub_cond = (
+                        f"({mm_schemas.WriterEvent.APPLICATION_NAME}='{app_name}' "
+                        f"AND {name}='{result_name}')"
+                    )
+                    if i != 0:  # not first sub condition
+                        query.write(" OR ")
+                    query.write(sub_cond)
+
+                query.write(")")
+
+            query.write(";")
+            return query.getvalue()
+
+    def read_predictions(
+        self,
+        *,
+        endpoint_id: str,
+        start: Union[datetime, str],
+        end: Union[datetime, str],
+        aggregation_window: Optional[str] = None,
+        agg_funcs: Optional[list[str]] = None,
+        limit: Optional[int] = None,
+    ) -> Union[
+        mm_schemas.ModelEndpointMonitoringMetricNoData,
+        mm_schemas.ModelEndpointMonitoringMetricValues,
+    ]:
+        if (agg_funcs and not aggregation_window) or (
+            aggregation_window and not agg_funcs
+        ):
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "both or neither of `aggregation_window` and `agg_funcs` must be provided"
+            )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.EventFieldType.LATENCY],
+            filter_query=f"endpoint_id=='{endpoint_id}'",
+            interval=aggregation_window,
+            agg_funcs=agg_funcs,
+            limit=limit,
+            sliding_window_step=aggregation_window,
+        )
+
+        full_name = get_invocations_fqn(self.project)
+
+        if df.empty:
+            return mm_schemas.ModelEndpointMonitoringMetricNoData(
+                full_name=full_name,
+                type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
+            )
+
+        latency_column = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
+            if agg_funcs
+            else mm_schemas.EventFieldType.LATENCY
+        )
+
+        return mm_schemas.ModelEndpointMonitoringMetricValues(
+            full_name=full_name,
+            values=list(
+                zip(
+                    df.index,
+                    df[latency_column],
+                )
+            ),  # pyright: ignore[reportArgumentType]
+        )
+
+    def read_prediction_metric_for_endpoint_if_exists(
+        self, endpoint_id: str
+    ) -> Optional[mm_schemas.ModelEndpointMonitoringMetric]:
+        # Read just one record, because we just want to check if there is any data for this endpoint_id
+        predictions = self.read_predictions(
+            endpoint_id=endpoint_id, start="0", end="now", limit=1
+        )
+        if predictions:
+            return mm_schemas.ModelEndpointMonitoringMetric(
+                project=self.project,
+                app=mm_schemas.SpecialApps.MLRUN_INFRA,
+                type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
+                name=mm_schemas.PredictionsQueryConstants.INVOCATIONS,
+                full_name=get_invocations_fqn(self.project),
+            )
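
For context on the query surface added above, here is a minimal usage sketch of the new read_predictions method, assuming a project with model monitoring enabled on a reachable V3IO backend. The project name and endpoint id are hypothetical placeholders; the argument names come from the diff above.

from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import V3IOTSDBConnector

connector = V3IOTSDBConnector(project="my-project")  # hypothetical project name

# `aggregation_window` and `agg_funcs` must be passed together (both or neither);
# otherwise read_predictions raises MLRunInvalidArgumentError.
invocations = connector.read_predictions(
    endpoint_id="0123456789abcdef",  # hypothetical model endpoint id
    start="now-1h",
    end="now",
    aggregation_window="10m",
    agg_funcs=["count"],
)

# With aggregation the values come from a "count(latency)"-style column;
# without it, the raw latency column is used.
print(invocations)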
mlrun/model_monitoring/helpers.py
CHANGED
@@ -24,6 +24,10 @@ import mlrun.common.schemas
 from mlrun.common.schemas.model_monitoring import (
     EventFieldType,
 )
+from mlrun.common.schemas.model_monitoring.model_endpoints import (
+    ModelEndpointMonitoringMetricType,
+    _compose_full_name,
+)
 from mlrun.model_monitoring.model_endpoint import ModelEndpoint
 from mlrun.utils import logger
 
@@ -292,3 +296,12 @@ def get_default_result_instance_fqn(model_endpoint_id: str) -> str:
         mm_constants.HistogramDataDriftApplicationConstants.NAME,
         mm_constants.HistogramDataDriftApplicationConstants.GENERAL_RESULT_NAME,
     )
+
+
+def get_invocations_fqn(project: str) -> str:
+    return _compose_full_name(
+        project=project,
+        app=mm_constants.SpecialApps.MLRUN_INFRA,
+        name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
+        type=ModelEndpointMonitoringMetricType.METRIC,
+    )
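
The new get_invocations_fqn helper only delegates to _compose_full_name. A sketch of the equivalent composition is below; the "." separator and the "mlrun-infra"/"metric"/"invocations" component values are assumptions that are not visible in this diff.

def get_invocations_fqn_sketch(project: str) -> str:
    # assumed layout: <project>.<app>.<type>.<name>
    return f"{project}.mlrun-infra.metric.invocations"

print(get_invocations_fqn_sketch("my-project"))  # my-project.mlrun-infra.metric.invocations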
mlrun/model_monitoring/writer.py
CHANGED
@@ -24,6 +24,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
     HistogramDataDriftApplicationConstants,
     MetricData,
     ResultData,
+    ResultKindApp,
     ResultStatusApp,
     WriterEvent,
     WriterEventKind,
@@ -117,9 +118,13 @@
         )
         self._endpoints_records = {}
 
-    @staticmethod
     def _generate_event_on_drift(
-
+        self,
+        entity_id: str,
+        result_status: int,
+        event_value: dict,
+        project_name: str,
+        result_kind: int,
     ) -> None:
         logger.info("Sending an event")
         entity = mlrun.common.schemas.alert.EventEntities(
@@ -127,16 +132,35 @@
             project=project_name,
             ids=[entity_id],
         )
-
-
-
-            else alert_objects.EventKind.DATA_DRIFT_SUSPECTED
+
+        event_kind = self._generate_alert_event_kind(
+            result_status=result_status, result_kind=result_kind
         )
+
         event_data = mlrun.common.schemas.Event(
-            kind=event_kind,
+            kind=alert_objects.EventKind(value=event_kind),
+            entity=entity,
+            value_dict=event_value,
         )
         mlrun.get_run_db().generate_event(event_kind, event_data)
 
+    @staticmethod
+    def _generate_alert_event_kind(
+        result_kind: int, result_status: int
+    ) -> alert_objects.EventKind:
+        """Generate the required Event Kind format for the alerting system"""
+        if result_kind == ResultKindApp.custom.value:
+            # Custom kind is represented as an anomaly detection
+            event_kind = "mm_app_anomaly"
+        else:
+            event_kind = ResultKindApp(value=result_kind).name
+
+        if result_status == ResultStatusApp.detected.value:
+            event_kind = f"{event_kind}_detected"
+        else:
+            event_kind = f"{event_kind}_suspected"
+        return alert_objects.EventKind(value=event_kind)
+
     @staticmethod
     def _reconstruct_event(event: _RawEvent) -> tuple[_AppResultEvent, WriterEventKind]:
         """
@@ -209,14 +233,15 @@
                 "result_value": event[ResultData.RESULT_VALUE],
             }
             self._generate_event_on_drift(
-                get_result_instance_fqn(
+                entity_id=get_result_instance_fqn(
                     event[WriterEvent.ENDPOINT_ID],
                     event[WriterEvent.APPLICATION_NAME],
                     event[ResultData.RESULT_NAME],
                 ),
-                event[ResultData.RESULT_STATUS],
-                event_value,
-                self.project,
+                result_status=event[ResultData.RESULT_STATUS],
+                event_value=event_value,
+                project_name=self.project,
+                result_kind=event[ResultData.RESULT_KIND],
             )
 
             if (
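
The new _generate_alert_event_kind builds the alert event kind from the result kind and status. Below is a standalone sketch of that composition, with plain strings standing in for the ResultKindApp/ResultStatusApp enums and alert_objects.EventKind used by the real code.

def compose_alert_event_kind(result_kind_name: str, is_custom: bool, detected: bool) -> str:
    # custom result kinds are mapped to a generic anomaly event
    kind = "mm_app_anomaly" if is_custom else result_kind_name
    # the result status picks the "detected" vs. "suspected" suffix
    return f"{kind}_detected" if detected else f"{kind}_suspected"

print(compose_alert_event_kind("data_drift", is_custom=False, detected=True))
# data_drift_detected  ("data_drift" is an illustrative kind name)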
mlrun/projects/operations.py
CHANGED
@@ -18,6 +18,7 @@ from typing import Optional, Union
 from mlrun_pipelines.models import PipelineNodeWrapper
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.utils import hub_prefix
 
 from .pipelines import enrich_function_object, pipeline_context
@@ -190,7 +191,9 @@ def run_function(
     local = pipeline_context.is_run_local(local)
     task.metadata.labels = task.metadata.labels or labels or {}
     if pipeline_context.workflow_id:
-        task.metadata.labels[
+        task.metadata.labels[mlrun_constants.MLRunInternalLabels.workflow] = (
+            pipeline_context.workflow_id
+        )
     if function.kind == "local":
         command, function = mlrun.run.load_func_code(function)
         function.spec.command = command
@@ -225,9 +228,9 @@
 class BuildStatus:
     """returned status from build operation"""
 
-    def __init__(self, ready, outputs=
+    def __init__(self, ready, outputs=None, function=None):
         self.ready = ready
-        self.outputs = outputs
+        self.outputs = outputs or {}
         self.function = function
 
     def after(self, step):
@@ -340,9 +343,9 @@ def build_function(
 class DeployStatus:
     """returned status from deploy operation"""
 
-    def __init__(self, state, outputs=
+    def __init__(self, state, outputs=None, function=None):
         self.state = state
-        self.outputs = outputs
+        self.outputs = outputs or {}
         self.function = function
 
     def after(self, step):
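
BuildStatus and DeployStatus now default outputs to None and fall back to a fresh dict per instance. The previous default value is truncated in the diff above, so the sketch below only illustrates the general mutable-default pitfall this pattern avoids.

class Status:
    def __init__(self, outputs={}):  # a single dict shared by every instance
        self.outputs = outputs

a, b = Status(), Status()
a.outputs["key"] = "value"
print(b.outputs)  # {'key': 'value'} -- state leaks between instances

class SafeStatus:
    def __init__(self, outputs=None):
        self.outputs = outputs or {}  # a fresh dict per instance

c, d = SafeStatus(), SafeStatus()
c.outputs["key"] = "value"
print(d.outputs)  # {}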
mlrun/projects/pipelines.py
CHANGED
@@ -26,6 +26,7 @@ from kfp.compiler import compiler
 from mlrun_pipelines.helpers import new_pipe_metadata
 
 import mlrun
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.utils.notifications
 from mlrun.errors import err_to_str
@@ -371,7 +372,7 @@ class _PipelineRunStatus:
         engine: type["_PipelineRunner"],
         project: "mlrun.projects.MlrunProject",
         workflow: WorkflowSpec = None,
-        state:
+        state: mlrun_pipelines.common.models.RunStatuses = "",
         exc: Exception = None,
     ):
         """
@@ -479,6 +480,7 @@ class _PipelineRunner(abc.ABC):
         timeout=None,
         expected_statuses=None,
         notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
+        **kwargs,
     ):
         pass
 
@@ -610,6 +612,7 @@ class _KFPRunner(_PipelineRunner):
         timeout=None,
         expected_statuses=None,
         notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
+        **kwargs,
     ):
         if timeout is None:
             timeout = 60 * 60
@@ -733,6 +736,7 @@ class _LocalRunner(_PipelineRunner):
         timeout=None,
         expected_statuses=None,
         notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
+        **kwargs,
     ):
         pass
 
@@ -860,7 +864,7 @@ class _RemoteRunner(_PipelineRunner):
             )
             state = mlrun_pipelines.common.models.RunStatuses.failed
         else:
-            state = mlrun_pipelines.common.models.RunStatuses.
+            state = mlrun_pipelines.common.models.RunStatuses.running
             project.notifiers.push_pipeline_start_message(
                 project.metadata.name,
            )
@@ -877,24 +881,47 @@
     @staticmethod
     def get_run_status(
         project,
-        run,
+        run: _PipelineRunStatus,
         timeout=None,
         expected_statuses=None,
         notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
+        inner_engine: type[_PipelineRunner] = None,
     ):
-
-
-
-
-
-
-
-
-
+        inner_engine = inner_engine or _KFPRunner
+        if inner_engine.engine == _KFPRunner.engine:
+            # ignore notifiers for remote notifications, as they are handled by the remote pipeline notifications,
+            # so overriding with CustomNotificationPusher with empty list of notifiers or only local notifiers
+            local_project_notifiers = list(
+                set(mlrun.utils.notifications.NotificationTypes.local()).intersection(
+                    set(project.notifiers.notifications.keys())
+                )
+            )
+            notifiers = mlrun.utils.notifications.CustomNotificationPusher(
+                local_project_notifiers
+            )
+            return _KFPRunner.get_run_status(
+                project,
+                run,
+                timeout,
+                expected_statuses,
+                notifiers=notifiers,
+            )
 
-
-
-
+        elif inner_engine.engine == _LocalRunner.engine:
+            mldb = mlrun.db.get_run_db(secrets=project._secrets)
+            pipeline_runner_run = mldb.read_run(run.run_id, project=project.name)
+            pipeline_runner_run = mlrun.run.RunObject.from_dict(pipeline_runner_run)
+            pipeline_runner_run.logs(db=mldb)
+            pipeline_runner_run.refresh()
+            run._state = mlrun.common.runtimes.constants.RunStates.run_state_to_pipeline_run_status(
+                pipeline_runner_run.status.state
+            )
+            run._exc = pipeline_runner_run.status.error
+
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"Unsupported inner runner engine: {inner_engine.engine}"
+            )
 
 
 def create_pipeline(project, pipeline, functions, secrets=None, handler=None):
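
The new inner_engine argument lets _RemoteRunner.get_run_status resolve the status of a remote workflow according to the engine that actually executed it. Below is a rough sketch based only on the signatures above: the project and workflow names are placeholders, a reachable MLRun API is assumed, and the composite "remote:local" engine string is assumed to be how such a workflow is submitted.

import mlrun
from mlrun.projects.pipelines import _LocalRunner, _RemoteRunner

project = mlrun.get_or_create_project("my-project")  # hypothetical project

# submit a remote workflow whose inner engine is the local runner
run_status = project.run("main", engine="remote:local", watch=False)

# the status is resolved by reading the pipeline-runner run from the DB and
# mapping its run state onto a pipeline run status; a KFP inner engine would
# instead delegate to _KFPRunner.get_run_status with only local notifiers
_RemoteRunner.get_run_status(project, run_status, inner_engine=_LocalRunner)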