mlrun 1.7.0rc21__py3-none-any.whl → 1.7.0rc23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/alerts/alert.py +42 -17
- mlrun/common/schemas/__init__.py +2 -0
- mlrun/common/schemas/feature_store.py +78 -28
- mlrun/config.py +3 -0
- mlrun/db/base.py +1 -0
- mlrun/db/httpdb.py +9 -6
- mlrun/db/nopdb.py +1 -0
- mlrun/errors.py +1 -3
- mlrun/execution.py +2 -0
- mlrun/launcher/local.py +4 -0
- mlrun/launcher/remote.py +1 -0
- mlrun/model.py +2 -0
- mlrun/model_monitoring/api.py +1 -0
- mlrun/model_monitoring/applications/base.py +3 -3
- mlrun/model_monitoring/db/stores/__init__.py +27 -21
- mlrun/model_monitoring/db/stores/base/store.py +1 -0
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +8 -8
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +8 -8
- mlrun/model_monitoring/db/tsdb/__init__.py +1 -1
- mlrun/model_monitoring/db/tsdb/base.py +1 -14
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +22 -18
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +67 -46
- mlrun/model_monitoring/helpers.py +25 -4
- mlrun/model_monitoring/stream_processing.py +9 -11
- mlrun/model_monitoring/writer.py +10 -6
- mlrun/projects/operations.py +5 -0
- mlrun/projects/project.py +11 -1
- mlrun/runtimes/base.py +6 -0
- mlrun/runtimes/daskjob.py +1 -0
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/local.py +7 -1
- mlrun/runtimes/nuclio/application/application.py +0 -2
- mlrun/runtimes/nuclio/serving.py +9 -6
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/states.py +51 -8
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +54 -38
- mlrun/utils/helpers.py +51 -9
- mlrun/utils/notifications/notification/base.py +39 -7
- mlrun/utils/notifications/notification/slack.py +1 -14
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc21.dist-info → mlrun-1.7.0rc23.dist-info}/METADATA +1 -1
- {mlrun-1.7.0rc21.dist-info → mlrun-1.7.0rc23.dist-info}/RECORD +47 -47
- {mlrun-1.7.0rc21.dist-info → mlrun-1.7.0rc23.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc21.dist-info → mlrun-1.7.0rc23.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc21.dist-info → mlrun-1.7.0rc23.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc21.dist-info → mlrun-1.7.0rc23.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,6 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import json
|
|
16
|
-
import os
|
|
17
16
|
import typing
|
|
18
17
|
from dataclasses import dataclass
|
|
19
18
|
from http import HTTPStatus
|
|
@@ -24,8 +23,8 @@ import v3io.dataplane.response
|
|
|
24
23
|
|
|
25
24
|
import mlrun.common.model_monitoring.helpers
|
|
26
25
|
import mlrun.common.schemas.model_monitoring as mm_schemas
|
|
27
|
-
import mlrun.model_monitoring.db
|
|
28
26
|
import mlrun.utils.v3io_clients
|
|
27
|
+
from mlrun.model_monitoring.db import StoreBase
|
|
29
28
|
from mlrun.utils import logger
|
|
30
29
|
|
|
31
30
|
# Fields to encode before storing in the KV table or to decode after retrieving
|
|
@@ -89,18 +88,21 @@ _KIND_TO_SCHEMA_PARAMS: dict[mm_schemas.WriterEventKind, SchemaParams] = {
|
|
|
89
88
|
_EXCLUDE_SCHEMA_FILTER_EXPRESSION = '__name!=".#schema"'
|
|
90
89
|
|
|
91
90
|
|
|
92
|
-
class KVStoreBase(
|
|
91
|
+
class KVStoreBase(StoreBase):
|
|
92
|
+
type: typing.ClassVar[str] = "v3io-nosql"
|
|
93
93
|
"""
|
|
94
94
|
Handles the DB operations when the DB target is from type KV. For the KV operations, we use an instance of V3IO
|
|
95
95
|
client and usually the KV table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/.
|
|
96
96
|
"""
|
|
97
97
|
|
|
98
|
-
def __init__(
|
|
98
|
+
def __init__(
|
|
99
|
+
self,
|
|
100
|
+
project: str,
|
|
101
|
+
) -> None:
|
|
99
102
|
super().__init__(project=project)
|
|
100
103
|
# Initialize a V3IO client instance
|
|
101
|
-
self.access_key = access_key or os.environ.get("V3IO_ACCESS_KEY")
|
|
102
104
|
self.client = mlrun.utils.v3io_clients.get_v3io_client(
|
|
103
|
-
endpoint=mlrun.mlconf.v3io_api,
|
|
105
|
+
endpoint=mlrun.mlconf.v3io_api,
|
|
104
106
|
)
|
|
105
107
|
# Get the KV table path and container
|
|
106
108
|
self.path, self.container = self._get_path_and_container()
|
|
@@ -186,7 +188,6 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
186
188
|
table_path=self.path,
|
|
187
189
|
key=endpoint_id,
|
|
188
190
|
raise_for_status=v3io.dataplane.RaiseForStatus.never,
|
|
189
|
-
access_key=self.access_key,
|
|
190
191
|
)
|
|
191
192
|
endpoint = endpoint.output.item
|
|
192
193
|
|
|
@@ -499,7 +500,6 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
499
500
|
|
|
500
501
|
def _get_frames_client(self):
|
|
501
502
|
return mlrun.utils.v3io_clients.get_frames_client(
|
|
502
|
-
token=self.access_key,
|
|
503
503
|
address=mlrun.mlconf.v3io_framesd,
|
|
504
504
|
container=self.container,
|
|
505
505
|
)
|
|
@@ -65,7 +65,7 @@ class ObjectTSDBFactory(enum.Enum):
|
|
|
65
65
|
def get_tsdb_connector(
|
|
66
66
|
project: str,
|
|
67
67
|
tsdb_connector_type: str = "",
|
|
68
|
-
secret_provider: typing.Optional[typing.Callable] = None,
|
|
68
|
+
secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
|
|
69
69
|
**kwargs,
|
|
70
70
|
) -> TSDBConnector:
|
|
71
71
|
"""
|
|
@@ -25,7 +25,7 @@ from mlrun.utils import logger
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class TSDBConnector(ABC):
|
|
28
|
-
type: str
|
|
28
|
+
type: typing.ClassVar[str]
|
|
29
29
|
|
|
30
30
|
def __init__(self, project: str):
|
|
31
31
|
"""
|
|
@@ -177,19 +177,6 @@ class TSDBConnector(ABC):
|
|
|
177
177
|
:return: Metric values object or no data object.
|
|
178
178
|
"""
|
|
179
179
|
|
|
180
|
-
@abstractmethod
|
|
181
|
-
def read_prediction_metric_for_endpoint_if_exists(
|
|
182
|
-
self, endpoint_id: str
|
|
183
|
-
) -> typing.Optional[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
184
|
-
"""
|
|
185
|
-
Read the "invocations" metric for the provided model endpoint, and return the metric object
|
|
186
|
-
if it exists.
|
|
187
|
-
|
|
188
|
-
:param endpoint_id: The model endpoint identifier.
|
|
189
|
-
:return: `None` if the invocations metric does not exist, otherwise return the
|
|
190
|
-
corresponding metric object.
|
|
191
|
-
"""
|
|
192
|
-
|
|
193
180
|
@staticmethod
|
|
194
181
|
def df_to_metrics_values(
|
|
195
182
|
*,
|
|
@@ -377,21 +377,25 @@ class TDEngineConnector(TSDBConnector):
|
|
|
377
377
|
), # pyright: ignore[reportArgumentType]
|
|
378
378
|
)
|
|
379
379
|
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
380
|
+
# Note: this function serves as a reference for checking the TSDB for the existence of a metric.
|
|
381
|
+
#
|
|
382
|
+
# def read_prediction_metric_for_endpoint_if_exists(
|
|
383
|
+
# self, endpoint_id: str
|
|
384
|
+
# ) -> typing.Optional[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
385
|
+
# """
|
|
386
|
+
# Read the "invocations" metric for the provided model endpoint, and return the metric object
|
|
387
|
+
# if it exists.
|
|
388
|
+
#
|
|
389
|
+
# :param endpoint_id: The model endpoint identifier.
|
|
390
|
+
# :return: `None` if the invocations metric does not exist, otherwise return the
|
|
391
|
+
# corresponding metric object.
|
|
392
|
+
# """
|
|
393
|
+
# # Read just one record, because we just want to check if there is any data for this endpoint_id
|
|
394
|
+
# predictions = self.read_predictions(
|
|
395
|
+
# endpoint_id=endpoint_id,
|
|
396
|
+
# start=datetime.min,
|
|
397
|
+
# end=mlrun.utils.now_date(),
|
|
398
|
+
# limit=1,
|
|
399
|
+
# )
|
|
400
|
+
# if predictions:
|
|
401
|
+
# return get_invocations_metric(self.project)
|
|
@@ -12,15 +12,13 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
import typing
|
|
16
15
|
from datetime import datetime
|
|
17
16
|
from io import StringIO
|
|
18
17
|
from typing import Literal, Optional, Union
|
|
19
18
|
|
|
20
19
|
import pandas as pd
|
|
20
|
+
import v3io_frames
|
|
21
21
|
import v3io_frames.client
|
|
22
|
-
import v3io_frames.errors
|
|
23
|
-
from v3io_frames.frames_pb2 import IGNORE
|
|
24
22
|
|
|
25
23
|
import mlrun.common.model_monitoring
|
|
26
24
|
import mlrun.common.schemas.model_monitoring as mm_schemas
|
|
@@ -35,6 +33,14 @@ _TSDB_RATE = "1/s"
|
|
|
35
33
|
_CONTAINER = "users"
|
|
36
34
|
|
|
37
35
|
|
|
36
|
+
def _is_no_schema_error(exc: v3io_frames.ReadError) -> bool:
|
|
37
|
+
"""
|
|
38
|
+
In case of a nonexistent TSDB table - a `v3io_frames.ReadError` error is raised.
|
|
39
|
+
Check if the error message contains the relevant string to verify the cause.
|
|
40
|
+
"""
|
|
41
|
+
return "No TSDB schema file found" in str(exc)
|
|
42
|
+
|
|
43
|
+
|
|
38
44
|
class V3IOTSDBConnector(TSDBConnector):
|
|
39
45
|
"""
|
|
40
46
|
Handles the TSDB operations when the TSDB connector is of type V3IO. To manage these operations we use V3IO Frames
|
|
@@ -47,7 +53,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
47
53
|
self,
|
|
48
54
|
project: str,
|
|
49
55
|
container: str = _CONTAINER,
|
|
50
|
-
v3io_framesd:
|
|
56
|
+
v3io_framesd: Optional[str] = None,
|
|
51
57
|
create_table: bool = False,
|
|
52
58
|
) -> None:
|
|
53
59
|
super().__init__(project=project)
|
|
@@ -132,7 +138,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
132
138
|
self._frames_client.create(
|
|
133
139
|
backend=_TSDB_BE,
|
|
134
140
|
table=table,
|
|
135
|
-
if_exists=IGNORE,
|
|
141
|
+
if_exists=v3io_frames.IGNORE,
|
|
136
142
|
rate=_TSDB_RATE,
|
|
137
143
|
)
|
|
138
144
|
|
|
@@ -162,7 +168,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
162
168
|
time_col=mm_schemas.EventFieldType.TIMESTAMP,
|
|
163
169
|
container=self.container,
|
|
164
170
|
v3io_frames=self.v3io_framesd,
|
|
165
|
-
columns=[
|
|
171
|
+
columns=[mm_schemas.EventFieldType.LATENCY],
|
|
166
172
|
index_cols=[
|
|
167
173
|
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
168
174
|
],
|
|
@@ -280,7 +286,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
280
286
|
index_cols=index_cols,
|
|
281
287
|
)
|
|
282
288
|
logger.info("Updated V3IO TSDB successfully", table=table)
|
|
283
|
-
except v3io_frames.
|
|
289
|
+
except v3io_frames.Error as err:
|
|
284
290
|
logger.exception(
|
|
285
291
|
"Could not write drift measures to TSDB",
|
|
286
292
|
err=err,
|
|
@@ -291,7 +297,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
291
297
|
f"Failed to write application result to TSDB: {err}"
|
|
292
298
|
)
|
|
293
299
|
|
|
294
|
-
def delete_tsdb_resources(self, table:
|
|
300
|
+
def delete_tsdb_resources(self, table: Optional[str] = None):
|
|
295
301
|
if table:
|
|
296
302
|
# Delete a specific table
|
|
297
303
|
tables = [table]
|
|
@@ -301,7 +307,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
301
307
|
for table_to_delete in tables:
|
|
302
308
|
try:
|
|
303
309
|
self._frames_client.delete(backend=_TSDB_BE, table=table_to_delete)
|
|
304
|
-
except v3io_frames.
|
|
310
|
+
except v3io_frames.DeleteError as e:
|
|
305
311
|
logger.warning(
|
|
306
312
|
f"Failed to delete TSDB table '{table}'",
|
|
307
313
|
err=mlrun.errors.err_to_str(e),
|
|
@@ -362,7 +368,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
362
368
|
]
|
|
363
369
|
metrics_mapping[metric] = values
|
|
364
370
|
|
|
365
|
-
except v3io_frames.
|
|
371
|
+
except v3io_frames.Error as err:
|
|
366
372
|
logger.warn("Failed to read tsdb", err=err, endpoint=endpoint_id)
|
|
367
373
|
|
|
368
374
|
return metrics_mapping
|
|
@@ -372,12 +378,11 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
372
378
|
table: str,
|
|
373
379
|
start: Union[datetime, str],
|
|
374
380
|
end: Union[datetime, str],
|
|
375
|
-
columns:
|
|
381
|
+
columns: Optional[list[str]] = None,
|
|
376
382
|
filter_query: str = "",
|
|
377
|
-
interval:
|
|
378
|
-
agg_funcs:
|
|
379
|
-
|
|
380
|
-
sliding_window_step: typing.Optional[str] = None,
|
|
383
|
+
interval: Optional[str] = None,
|
|
384
|
+
agg_funcs: Optional[list[str]] = None,
|
|
385
|
+
sliding_window_step: Optional[str] = None,
|
|
381
386
|
**kwargs,
|
|
382
387
|
) -> pd.DataFrame:
|
|
383
388
|
"""
|
|
@@ -400,7 +405,6 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
400
405
|
:param agg_funcs: The aggregation functions to apply on the columns. Note that if `agg_funcs` is
|
|
401
406
|
provided, `interval` must be provided as well. Provided as a list of strings in
|
|
402
407
|
the format of ['sum', 'avg', 'count', ...].
|
|
403
|
-
:param limit: The maximum number of records to return.
|
|
404
408
|
:param sliding_window_step: The time step for which the time window moves forward. Note that if
|
|
405
409
|
`sliding_window_step` is provided, interval must be provided as well. Provided
|
|
406
410
|
as a string in the format of '1m', '1h', etc.
|
|
@@ -414,9 +418,8 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
414
418
|
f"Available tables: {list(self.tables.keys())}"
|
|
415
419
|
)
|
|
416
420
|
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
agg_funcs = ",".join(agg_funcs)
|
|
421
|
+
# Frames client expects the aggregators to be a comma-separated string
|
|
422
|
+
aggregators = ",".join(agg_funcs) if agg_funcs else None
|
|
420
423
|
table_path = self.tables[table]
|
|
421
424
|
try:
|
|
422
425
|
df = self._frames_client.read(
|
|
@@ -427,18 +430,16 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
427
430
|
columns=columns,
|
|
428
431
|
filter=filter_query,
|
|
429
432
|
aggregation_window=interval,
|
|
430
|
-
aggregators=
|
|
433
|
+
aggregators=aggregators,
|
|
431
434
|
step=sliding_window_step,
|
|
432
435
|
**kwargs,
|
|
433
436
|
)
|
|
434
437
|
except v3io_frames.ReadError as err:
|
|
435
|
-
if
|
|
438
|
+
if _is_no_schema_error(err):
|
|
436
439
|
return pd.DataFrame()
|
|
437
440
|
else:
|
|
438
441
|
raise err
|
|
439
442
|
|
|
440
|
-
if limit:
|
|
441
|
-
df = df.head(limit)
|
|
442
443
|
return df
|
|
443
444
|
|
|
444
445
|
def _get_v3io_source_directory(self) -> str:
|
|
@@ -509,8 +510,8 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
509
510
|
raise ValueError(f"Invalid {type = }")
|
|
510
511
|
|
|
511
512
|
query = self._get_sql_query(
|
|
512
|
-
endpoint_id,
|
|
513
|
-
[(metric.app, metric.name) for metric in metrics],
|
|
513
|
+
endpoint_id=endpoint_id,
|
|
514
|
+
metric_and_app_names=[(metric.app, metric.name) for metric in metrics],
|
|
514
515
|
table_path=table_path,
|
|
515
516
|
name=name,
|
|
516
517
|
)
|
|
@@ -536,21 +537,28 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
536
537
|
|
|
537
538
|
@staticmethod
|
|
538
539
|
def _get_sql_query(
|
|
540
|
+
*,
|
|
539
541
|
endpoint_id: str,
|
|
540
|
-
names: list[tuple[str, str]],
|
|
541
542
|
table_path: str,
|
|
542
543
|
name: str = mm_schemas.ResultData.RESULT_NAME,
|
|
544
|
+
metric_and_app_names: Optional[list[tuple[str, str]]] = None,
|
|
545
|
+
columns: Optional[list[str]] = None,
|
|
543
546
|
) -> str:
|
|
544
547
|
"""Get the SQL query for the results/metrics table"""
|
|
548
|
+
if columns:
|
|
549
|
+
selection = ",".join(columns)
|
|
550
|
+
else:
|
|
551
|
+
selection = "*"
|
|
552
|
+
|
|
545
553
|
with StringIO() as query:
|
|
546
554
|
query.write(
|
|
547
|
-
f"SELECT
|
|
555
|
+
f"SELECT {selection} FROM '{table_path}' "
|
|
548
556
|
f"WHERE {mm_schemas.WriterEvent.ENDPOINT_ID}='{endpoint_id}'"
|
|
549
557
|
)
|
|
550
|
-
if
|
|
558
|
+
if metric_and_app_names:
|
|
551
559
|
query.write(" AND (")
|
|
552
560
|
|
|
553
|
-
for i, (app_name, result_name) in enumerate(
|
|
561
|
+
for i, (app_name, result_name) in enumerate(metric_and_app_names):
|
|
554
562
|
sub_cond = (
|
|
555
563
|
f"({mm_schemas.WriterEvent.APPLICATION_NAME}='{app_name}' "
|
|
556
564
|
f"AND {name}='{result_name}')"
|
|
@@ -572,7 +580,6 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
572
580
|
end: Union[datetime, str],
|
|
573
581
|
aggregation_window: Optional[str] = None,
|
|
574
582
|
agg_funcs: Optional[list[str]] = None,
|
|
575
|
-
limit: Optional[int] = None,
|
|
576
583
|
) -> Union[
|
|
577
584
|
mm_schemas.ModelEndpointMonitoringMetricNoData,
|
|
578
585
|
mm_schemas.ModelEndpointMonitoringMetricValues,
|
|
@@ -591,7 +598,6 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
591
598
|
filter_query=f"endpoint_id=='{endpoint_id}'",
|
|
592
599
|
interval=aggregation_window,
|
|
593
600
|
agg_funcs=agg_funcs,
|
|
594
|
-
limit=limit,
|
|
595
601
|
sliding_window_step=aggregation_window,
|
|
596
602
|
)
|
|
597
603
|
|
|
@@ -619,18 +625,33 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
619
625
|
), # pyright: ignore[reportArgumentType]
|
|
620
626
|
)
|
|
621
627
|
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
628
|
+
# Note: this function serves as a reference for checking the TSDB for the existence of a metric.
|
|
629
|
+
#
|
|
630
|
+
# def read_prediction_metric_for_endpoint_if_exists(
|
|
631
|
+
# self, endpoint_id: str
|
|
632
|
+
# ) -> Optional[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
633
|
+
# """
|
|
634
|
+
# Read the count of the latency column in the predictions table for the given endpoint_id.
|
|
635
|
+
# We just want to check if there is any data for this endpoint_id.
|
|
636
|
+
# """
|
|
637
|
+
# query = self._get_sql_query(
|
|
638
|
+
# endpoint_id=endpoint_id,
|
|
639
|
+
# table_path=self.tables[mm_schemas.FileTargetKind.PREDICTIONS],
|
|
640
|
+
# columns=[f"count({mm_schemas.EventFieldType.LATENCY})"],
|
|
641
|
+
# )
|
|
642
|
+
# try:
|
|
643
|
+
# logger.debug("Checking TSDB", project=self.project, query=query)
|
|
644
|
+
# df: pd.DataFrame = self._frames_client.read(
|
|
645
|
+
# backend=_TSDB_BE, query=query, start="0", end="now"
|
|
646
|
+
# )
|
|
647
|
+
# except v3io_frames.ReadError as err:
|
|
648
|
+
# if _is_no_schema_error(err):
|
|
649
|
+
# logger.debug(
|
|
650
|
+
# "No predictions yet", project=self.project, endpoint_id=endpoint_id
|
|
651
|
+
# )
|
|
652
|
+
# return
|
|
653
|
+
# else:
|
|
654
|
+
# raise
|
|
655
|
+
#
|
|
656
|
+
# if not df.empty:
|
|
657
|
+
# return get_invocations_metric(self.project)
|
|
@@ -25,6 +25,7 @@ from mlrun.common.schemas.model_monitoring import (
|
|
|
25
25
|
EventFieldType,
|
|
26
26
|
)
|
|
27
27
|
from mlrun.common.schemas.model_monitoring.model_endpoints import (
|
|
28
|
+
ModelEndpointMonitoringMetric,
|
|
28
29
|
ModelEndpointMonitoringMetricType,
|
|
29
30
|
_compose_full_name,
|
|
30
31
|
)
|
|
@@ -96,7 +97,7 @@ def get_monitoring_parquet_path(
|
|
|
96
97
|
return parquet_path
|
|
97
98
|
|
|
98
99
|
|
|
99
|
-
def get_connection_string(secret_provider: typing.Callable = None) -> str:
|
|
100
|
+
def get_connection_string(secret_provider: typing.Callable[[str], str] = None) -> str:
|
|
100
101
|
"""Get endpoint store connection string from the project secret. If wasn't set, take it from the system
|
|
101
102
|
configurations.
|
|
102
103
|
|
|
@@ -116,7 +117,7 @@ def get_connection_string(secret_provider: typing.Callable = None) -> str:
|
|
|
116
117
|
|
|
117
118
|
|
|
118
119
|
def get_tsdb_connection_string(
|
|
119
|
-
secret_provider: typing.Optional[typing.Callable] = None,
|
|
120
|
+
secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
|
|
120
121
|
) -> str:
|
|
121
122
|
"""Get TSDB connection string from the project secret. If wasn't set, take it from the system
|
|
122
123
|
configurations.
|
|
@@ -277,9 +278,13 @@ def calculate_inputs_statistics(
|
|
|
277
278
|
return inputs_statistics
|
|
278
279
|
|
|
279
280
|
|
|
280
|
-
def get_endpoint_record(
|
|
281
|
+
def get_endpoint_record(
|
|
282
|
+
project: str,
|
|
283
|
+
endpoint_id: str,
|
|
284
|
+
secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
|
|
285
|
+
) -> dict[str, typing.Any]:
|
|
281
286
|
model_endpoint_store = mlrun.model_monitoring.get_store_object(
|
|
282
|
-
project=project,
|
|
287
|
+
project=project, secret_provider=secret_provider
|
|
283
288
|
)
|
|
284
289
|
return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
|
|
285
290
|
|
|
@@ -305,3 +310,19 @@ def get_invocations_fqn(project: str) -> str:
|
|
|
305
310
|
name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
|
|
306
311
|
type=ModelEndpointMonitoringMetricType.METRIC,
|
|
307
312
|
)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def get_invocations_metric(project: str) -> ModelEndpointMonitoringMetric:
|
|
316
|
+
"""
|
|
317
|
+
Return the invocations metric of any model endpoint in the given project.
|
|
318
|
+
|
|
319
|
+
:param project: The project name.
|
|
320
|
+
:returns: The model monitoring metric object.
|
|
321
|
+
"""
|
|
322
|
+
return ModelEndpointMonitoringMetric(
|
|
323
|
+
project=project,
|
|
324
|
+
app=mm_constants.SpecialApps.MLRUN_INFRA,
|
|
325
|
+
type=ModelEndpointMonitoringMetricType.METRIC,
|
|
326
|
+
name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
|
|
327
|
+
full_name=get_invocations_fqn(project),
|
|
328
|
+
)
|
|
@@ -66,10 +66,6 @@ class EventStreamProcessor:
|
|
|
66
66
|
self.parquet_batching_max_events = parquet_batching_max_events
|
|
67
67
|
self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
|
|
68
68
|
|
|
69
|
-
self.model_endpoint_store_target = (
|
|
70
|
-
mlrun.mlconf.model_endpoint_monitoring.store_type
|
|
71
|
-
)
|
|
72
|
-
|
|
73
69
|
logger.info(
|
|
74
70
|
"Initializing model monitoring event stream processor",
|
|
75
71
|
parquet_path=self.parquet_path,
|
|
@@ -139,7 +135,7 @@ class EventStreamProcessor:
|
|
|
139
135
|
def apply_monitoring_serving_graph(
|
|
140
136
|
self,
|
|
141
137
|
fn: mlrun.runtimes.ServingRuntime,
|
|
142
|
-
|
|
138
|
+
secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
|
|
143
139
|
) -> None:
|
|
144
140
|
"""
|
|
145
141
|
Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
|
|
@@ -167,7 +163,8 @@ class EventStreamProcessor:
|
|
|
167
163
|
using CE, the parquet target path is based on the defined MLRun artifact path.
|
|
168
164
|
|
|
169
165
|
:param fn: A serving function.
|
|
170
|
-
:param
|
|
166
|
+
:param secret_provider: An optional callable function that provides the connection string from the project
|
|
167
|
+
secret.
|
|
171
168
|
"""
|
|
172
169
|
|
|
173
170
|
graph = typing.cast(
|
|
@@ -293,7 +290,6 @@ class EventStreamProcessor:
|
|
|
293
290
|
name="UpdateEndpoint",
|
|
294
291
|
after="ProcessBeforeEndpointUpdate",
|
|
295
292
|
project=self.project,
|
|
296
|
-
model_endpoint_store_target=self.model_endpoint_store_target,
|
|
297
293
|
)
|
|
298
294
|
|
|
299
295
|
apply_update_endpoint()
|
|
@@ -310,7 +306,10 @@ class EventStreamProcessor:
|
|
|
310
306
|
table=self.kv_path,
|
|
311
307
|
)
|
|
312
308
|
|
|
313
|
-
|
|
309
|
+
store_object = mlrun.model_monitoring.get_store_object(
|
|
310
|
+
project=self.project, secret_provider=secret_provider
|
|
311
|
+
)
|
|
312
|
+
if store_object.type == ModelEndpointTarget.V3IO_NOSQL:
|
|
314
313
|
apply_infer_schema()
|
|
315
314
|
|
|
316
315
|
# Emits the event in window size of events based on sample_window size (10 by default)
|
|
@@ -328,7 +327,7 @@ class EventStreamProcessor:
|
|
|
328
327
|
# TSDB branch (skip to Prometheus if in CE env)
|
|
329
328
|
if not mlrun.mlconf.is_ce_mode():
|
|
330
329
|
tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
331
|
-
project=self.project, secret_provider=
|
|
330
|
+
project=self.project, secret_provider=secret_provider
|
|
332
331
|
)
|
|
333
332
|
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
334
333
|
|
|
@@ -904,7 +903,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
904
903
|
|
|
905
904
|
|
|
906
905
|
class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
|
|
907
|
-
def __init__(self, project: str,
|
|
906
|
+
def __init__(self, project: str, **kwargs):
|
|
908
907
|
"""
|
|
909
908
|
Update the model endpoint record in the DB. Note that the event at this point includes metadata and stats about
|
|
910
909
|
the average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
|
|
@@ -914,7 +913,6 @@ class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
|
|
|
914
913
|
"""
|
|
915
914
|
super().__init__(**kwargs)
|
|
916
915
|
self.project = project
|
|
917
|
-
self.model_endpoint_store_target = model_endpoint_store_target
|
|
918
916
|
|
|
919
917
|
def do(self, event: dict):
|
|
920
918
|
# Remove labels from the event
|
mlrun/model_monitoring/writer.py
CHANGED
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import json
|
|
16
|
-
from typing import Any, NewType
|
|
16
|
+
from typing import Any, Callable, NewType
|
|
17
17
|
|
|
18
18
|
import mlrun.common.model_monitoring
|
|
19
19
|
import mlrun.common.schemas
|
|
@@ -30,7 +30,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
|
|
|
30
30
|
WriterEventKind,
|
|
31
31
|
)
|
|
32
32
|
from mlrun.common.schemas.notification import NotificationKind, NotificationSeverity
|
|
33
|
-
from mlrun.model_monitoring.helpers import
|
|
33
|
+
from mlrun.model_monitoring.helpers import get_result_instance_fqn
|
|
34
34
|
from mlrun.serving.utils import StepToDict
|
|
35
35
|
from mlrun.utils import logger
|
|
36
36
|
from mlrun.utils.notifications.notification_pusher import CustomNotificationPusher
|
|
@@ -102,7 +102,11 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
102
102
|
|
|
103
103
|
kind = "monitoring_application_stream_pusher"
|
|
104
104
|
|
|
105
|
-
def __init__(
|
|
105
|
+
def __init__(
|
|
106
|
+
self,
|
|
107
|
+
project: str,
|
|
108
|
+
secret_provider: Callable = None,
|
|
109
|
+
) -> None:
|
|
106
110
|
self.project = project
|
|
107
111
|
self.name = project # required for the deployment process
|
|
108
112
|
|
|
@@ -111,10 +115,10 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
111
115
|
)
|
|
112
116
|
|
|
113
117
|
self._app_result_store = mlrun.model_monitoring.get_store_object(
|
|
114
|
-
project=self.project
|
|
118
|
+
project=self.project, secret_provider=secret_provider
|
|
115
119
|
)
|
|
116
120
|
self._tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
117
|
-
project=self.project, secret_provider=
|
|
121
|
+
project=self.project, secret_provider=secret_provider
|
|
118
122
|
)
|
|
119
123
|
self._endpoints_records = {}
|
|
120
124
|
|
|
@@ -223,7 +227,7 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
223
227
|
endpoint_id = event[WriterEvent.ENDPOINT_ID]
|
|
224
228
|
endpoint_record = self._endpoints_records.setdefault(
|
|
225
229
|
endpoint_id,
|
|
226
|
-
|
|
230
|
+
self._app_result_store.get_model_endpoint(endpoint_id=endpoint_id),
|
|
227
231
|
)
|
|
228
232
|
event_value = {
|
|
229
233
|
"app_name": event[WriterEvent.APPLICATION_NAME],
|
mlrun/projects/operations.py
CHANGED
|
@@ -77,6 +77,7 @@ def run_function(
|
|
|
77
77
|
notifications: list[mlrun.model.Notification] = None,
|
|
78
78
|
returns: Optional[list[Union[str, dict[str, str]]]] = None,
|
|
79
79
|
builder_env: Optional[list] = None,
|
|
80
|
+
reset_on_run: Optional[bool] = None,
|
|
80
81
|
) -> Union[mlrun.model.RunObject, PipelineNodeWrapper]:
|
|
81
82
|
"""Run a local or remote task as part of a local/kubeflow pipeline
|
|
82
83
|
|
|
@@ -167,6 +168,9 @@ def run_function(
|
|
|
167
168
|
artifact type can be given there. The artifact key must appear in the dictionary as
|
|
168
169
|
"key": "the_key".
|
|
169
170
|
:param builder_env: env vars dict for source archive config/credentials e.g. builder_env={"GIT_TOKEN": token}
|
|
171
|
+
:param reset_on_run: When True, function python modules would reload prior to code execution.
|
|
172
|
+
This ensures latest code changes are executed. This argument must be used in
|
|
173
|
+
conjunction with the local=True argument.
|
|
170
174
|
:return: MLRun RunObject or PipelineNodeWrapper
|
|
171
175
|
"""
|
|
172
176
|
engine, function = _get_engine_and_function(function, project_object)
|
|
@@ -215,6 +219,7 @@ def run_function(
|
|
|
215
219
|
schedule=schedule,
|
|
216
220
|
notifications=notifications,
|
|
217
221
|
builder_env=builder_env,
|
|
222
|
+
reset_on_run=reset_on_run,
|
|
218
223
|
)
|
|
219
224
|
if run_result:
|
|
220
225
|
run_result._notified = False
|