mlrun 1.7.0rc20__py3-none-any.whl → 1.7.0rc28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__main__.py +10 -8
- mlrun/alerts/alert.py +55 -18
- mlrun/api/schemas/__init__.py +3 -3
- mlrun/artifacts/manager.py +26 -0
- mlrun/common/constants.py +3 -2
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +26 -3
- mlrun/common/formatters/base.py +44 -9
- mlrun/common/formatters/function.py +12 -7
- mlrun/common/formatters/run.py +26 -0
- mlrun/common/helpers.py +11 -0
- mlrun/common/schemas/__init__.py +4 -0
- mlrun/common/schemas/alert.py +5 -9
- mlrun/common/schemas/api_gateway.py +64 -16
- mlrun/common/schemas/artifact.py +11 -0
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/model_monitoring/constants.py +21 -12
- mlrun/common/schemas/model_monitoring/model_endpoints.py +0 -12
- mlrun/common/schemas/pipeline.py +16 -0
- mlrun/common/schemas/project.py +17 -0
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/types.py +6 -0
- mlrun/config.py +17 -25
- mlrun/datastore/azure_blob.py +2 -1
- mlrun/datastore/datastore.py +3 -3
- mlrun/datastore/google_cloud_storage.py +6 -2
- mlrun/datastore/snowflake_utils.py +3 -1
- mlrun/datastore/sources.py +26 -11
- mlrun/datastore/store_resources.py +2 -0
- mlrun/datastore/targets.py +68 -16
- mlrun/db/base.py +83 -2
- mlrun/db/httpdb.py +280 -63
- mlrun/db/nopdb.py +60 -3
- mlrun/errors.py +5 -3
- mlrun/execution.py +28 -13
- mlrun/feature_store/feature_vector.py +8 -0
- mlrun/feature_store/retrieval/spark_merger.py +13 -2
- mlrun/launcher/local.py +4 -0
- mlrun/launcher/remote.py +1 -0
- mlrun/model.py +32 -3
- mlrun/model_monitoring/api.py +7 -52
- mlrun/model_monitoring/applications/base.py +5 -7
- mlrun/model_monitoring/applications/histogram_data_drift.py +1 -1
- mlrun/model_monitoring/db/stores/__init__.py +37 -24
- mlrun/model_monitoring/db/stores/base/store.py +40 -1
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +42 -87
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +27 -35
- mlrun/model_monitoring/db/tsdb/__init__.py +15 -15
- mlrun/model_monitoring/db/tsdb/base.py +1 -14
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +22 -18
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +86 -56
- mlrun/model_monitoring/helpers.py +34 -9
- mlrun/model_monitoring/stream_processing.py +12 -11
- mlrun/model_monitoring/writer.py +11 -11
- mlrun/projects/operations.py +5 -0
- mlrun/projects/pipelines.py +35 -21
- mlrun/projects/project.py +216 -107
- mlrun/render.py +10 -5
- mlrun/run.py +15 -5
- mlrun/runtimes/__init__.py +2 -0
- mlrun/runtimes/base.py +17 -4
- mlrun/runtimes/daskjob.py +8 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/local.py +23 -4
- mlrun/runtimes/nuclio/application/application.py +0 -2
- mlrun/runtimes/nuclio/function.py +31 -2
- mlrun/runtimes/nuclio/serving.py +9 -6
- mlrun/runtimes/pod.py +5 -29
- mlrun/runtimes/remotesparkjob.py +8 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/routers.py +75 -59
- mlrun/serving/server.py +11 -0
- mlrun/serving/states.py +80 -8
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +66 -39
- mlrun/utils/helpers.py +91 -11
- mlrun/utils/logger.py +36 -2
- mlrun/utils/notifications/notification/base.py +43 -7
- mlrun/utils/notifications/notification/git.py +21 -0
- mlrun/utils/notifications/notification/slack.py +9 -14
- mlrun/utils/notifications/notification/webhook.py +41 -1
- mlrun/utils/notifications/notification_pusher.py +3 -9
- mlrun/utils/regex.py +9 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc20.dist-info → mlrun-1.7.0rc28.dist-info}/METADATA +16 -9
- {mlrun-1.7.0rc20.dist-info → mlrun-1.7.0rc28.dist-info}/RECORD +92 -91
- {mlrun-1.7.0rc20.dist-info → mlrun-1.7.0rc28.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc20.dist-info → mlrun-1.7.0rc28.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc20.dist-info → mlrun-1.7.0rc28.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc20.dist-info → mlrun-1.7.0rc28.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/__init__.py
CHANGED

```diff
@@ -57,41 +57,41 @@ class ObjectTSDBFactory(enum.Enum):
         :param value: Provided enum (invalid) value.
         """
         valid_values = list(cls.__members__.keys())
-        raise mlrun.errors.
+        raise mlrun.errors.MLRunInvalidMMStoreType(
             f"{value} is not a valid tsdb, please choose a valid value: %{valid_values}."
         )


 def get_tsdb_connector(
     project: str,
-
-
+    secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
+    tsdb_connection_string: typing.Optional[str] = None,
     **kwargs,
 ) -> TSDBConnector:
     """
     Get TSDB connector object.
-    :param project:
-    :param
-
-    :param secret_provider: An optional secret provider to get the connection string secret.
+    :param project:                The name of the project.
+    :param secret_provider:        An optional secret provider to get the connection string secret.
+    :param tsdb_connection_string: An optional explicit connection string to the TSDB.

     :return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
              TSDB connector such as updating drift metrics or write application record result.
     """

-    tsdb_connection_string =
-
+    tsdb_connection_string = (
+        tsdb_connection_string
+        or mlrun.model_monitoring.helpers.get_tsdb_connection_string(
+            secret_provider=secret_provider
+        )
     )

     if tsdb_connection_string and tsdb_connection_string.startswith("taosws"):
         tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.TDEngine
         kwargs["connection_string"] = tsdb_connection_string
-
-
-
-        tsdb_connector_type
-        or mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type
-    )
+    elif tsdb_connection_string and tsdb_connection_string == "v3io":
+        tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.V3IO_TSDB
+    else:
+        tsdb_connector_type = None

     # Get connector type value from ObjectTSDBFactory enum class
     tsdb_connector_factory = ObjectTSDBFactory(tsdb_connector_type)
```
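For context, the new `get_tsdb_connector` prefers an explicitly passed `tsdb_connection_string`, falls back to the secret provider only when none is given, and then maps the resolved string onto a connector type. A minimal standalone sketch of that resolution order (the secret key name and the enum stand-ins below are illustrative, not mlrun API):

```python
from typing import Callable, Optional

# Illustrative stand-ins for mlrun's TSDBTarget enum values.
TDENGINE = "tdengine"
V3IO_TSDB = "v3io-tsdb"


def resolve_tsdb_connector_type(
    tsdb_connection_string: Optional[str] = None,
    secret_provider: Optional[Callable[[str], str]] = None,
) -> Optional[str]:
    # An explicit connection string wins; otherwise consult the secret provider.
    if not tsdb_connection_string and secret_provider is not None:
        tsdb_connection_string = secret_provider("TSDB_CONNECTION")  # hypothetical key

    if tsdb_connection_string and tsdb_connection_string.startswith("taosws"):
        return TDENGINE  # TDEngine websocket DSN, e.g. taosws://user:pass@host:6041
    elif tsdb_connection_string == "v3io":
        return V3IO_TSDB
    return None  # the ObjectTSDBFactory lookup then raises MLRunInvalidMMStoreType


print(resolve_tsdb_connector_type("taosws://root:taosdata@localhost:6041"))  # tdengine
print(resolve_tsdb_connector_type(secret_provider=lambda key: "v3io"))       # v3io-tsdb
```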
mlrun/model_monitoring/db/tsdb/base.py
CHANGED

```diff
@@ -25,7 +25,7 @@ from mlrun.utils import logger


 class TSDBConnector(ABC):
-    type: str
+    type: typing.ClassVar[str]

     def __init__(self, project: str):
         """
@@ -177,19 +177,6 @@ class TSDBConnector(ABC):
         :return: Metric values object or no data object.
         """

-    @abstractmethod
-    def read_prediction_metric_for_endpoint_if_exists(
-        self, endpoint_id: str
-    ) -> typing.Optional[mm_schemas.ModelEndpointMonitoringMetric]:
-        """
-        Read the "invocations" metric for the provided model endpoint, and return the metric object
-        if it exists.
-
-        :param endpoint_id: The model endpoint identifier.
-        :return: `None` if the invocations metric does not exist, otherwise return the
-                 corresponding metric object.
-        """
-
     @staticmethod
     def df_to_metrics_values(
         *,
```
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py
CHANGED

```diff
@@ -377,21 +377,25 @@ class TDEngineConnector(TSDBConnector):
         ),  # pyright: ignore[reportArgumentType]
     )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Note: this function serves as a reference for checking the TSDB for the existence of a metric.
+    #
+    # def read_prediction_metric_for_endpoint_if_exists(
+    #     self, endpoint_id: str
+    # ) -> typing.Optional[mm_schemas.ModelEndpointMonitoringMetric]:
+    #     """
+    #     Read the "invocations" metric for the provided model endpoint, and return the metric object
+    #     if it exists.
+    #
+    #     :param endpoint_id: The model endpoint identifier.
+    #     :return: `None` if the invocations metric does not exist, otherwise return the
+    #              corresponding metric object.
+    #     """
+    #     # Read just one record, because we just want to check if there is any data for this endpoint_id
+    #     predictions = self.read_predictions(
+    #         endpoint_id=endpoint_id,
+    #         start=datetime.min,
+    #         end=mlrun.utils.now_date(),
+    #         limit=1,
+    #     )
+    #     if predictions:
+    #         return get_invocations_metric(self.project)
```
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py
CHANGED

```diff
@@ -12,15 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import typing
 from datetime import datetime
 from io import StringIO
 from typing import Literal, Optional, Union

 import pandas as pd
+import v3io_frames
 import v3io_frames.client
-import v3io_frames.errors
-from v3io_frames.frames_pb2 import IGNORE

 import mlrun.common.model_monitoring
 import mlrun.common.schemas.model_monitoring as mm_schemas
```
```diff
@@ -35,6 +33,17 @@ _TSDB_RATE = "1/s"
 _CONTAINER = "users"


+def _is_no_schema_error(exc: v3io_frames.ReadError) -> bool:
+    """
+    In case of a nonexistent TSDB table - a `v3io_frames.ReadError` error is raised.
+    Check if the error message contains the relevant string to verify the cause.
+    """
+    msg = str(exc)
+    # https://github.com/v3io/v3io-tsdb/blob/v0.14.1/pkg/tsdb/v3iotsdb.go#L205
+    # https://github.com/v3io/v3io-tsdb/blob/v0.14.1/pkg/partmgr/partmgr.go#L238
+    return "No TSDB schema file found" in msg or "Failed to read schema at path" in msg
+
+
 class V3IOTSDBConnector(TSDBConnector):
     """
     Handles the TSDB operations when the TSDB connector is of type V3IO. To manage these operations we use V3IO Frames
```
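The `_is_no_schema_error` helper exists because the frames client raises the same `ReadError` type whether the TSDB table is simply absent or the read genuinely failed, so the cause has to be recovered from the message. The same pattern in isolation (the `ReadError` class below is a local stand-in, not the real `v3io_frames` one):

```python
class ReadError(Exception):
    """Local stand-in for v3io_frames.ReadError."""


def is_no_schema_error(exc: ReadError) -> bool:
    # v3io-tsdb embeds these strings when the table/schema does not exist.
    msg = str(exc)
    return "No TSDB schema file found" in msg or "Failed to read schema at path" in msg


try:
    raise ReadError("Failed to read schema at path 'users/pipelines/...'")
except ReadError as err:
    if is_no_schema_error(err):
        print("table not created yet -> treat as no data")
    else:
        raise
```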
```diff
@@ -47,7 +56,7 @@ class V3IOTSDBConnector(TSDBConnector):
         self,
         project: str,
         container: str = _CONTAINER,
-        v3io_framesd:
+        v3io_framesd: Optional[str] = None,
         create_table: bool = False,
     ) -> None:
         super().__init__(project=project)
@@ -132,7 +141,7 @@ class V3IOTSDBConnector(TSDBConnector):
             self._frames_client.create(
                 backend=_TSDB_BE,
                 table=table,
-                if_exists=IGNORE,
+                if_exists=v3io_frames.IGNORE,
                 rate=_TSDB_RATE,
             )

@@ -162,7 +171,7 @@ class V3IOTSDBConnector(TSDBConnector):
             time_col=mm_schemas.EventFieldType.TIMESTAMP,
             container=self.container,
             v3io_frames=self.v3io_framesd,
-            columns=[
+            columns=[mm_schemas.EventFieldType.LATENCY],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
@@ -280,7 +289,7 @@ class V3IOTSDBConnector(TSDBConnector):
                 index_cols=index_cols,
             )
             logger.info("Updated V3IO TSDB successfully", table=table)
-        except v3io_frames.
+        except v3io_frames.Error as err:
            logger.exception(
                 "Could not write drift measures to TSDB",
                 err=err,
@@ -291,7 +300,7 @@ class V3IOTSDBConnector(TSDBConnector):
                 f"Failed to write application result to TSDB: {err}"
             )

-    def delete_tsdb_resources(self, table:
+    def delete_tsdb_resources(self, table: Optional[str] = None):
         if table:
             # Delete a specific table
             tables = [table]
@@ -301,7 +310,7 @@ class V3IOTSDBConnector(TSDBConnector):
         for table_to_delete in tables:
             try:
                 self._frames_client.delete(backend=_TSDB_BE, table=table_to_delete)
-            except v3io_frames.
+            except v3io_frames.DeleteError as e:
                 logger.warning(
                     f"Failed to delete TSDB table '{table}'",
                     err=mlrun.errors.err_to_str(e),
@@ -362,7 +371,7 @@ class V3IOTSDBConnector(TSDBConnector):
                 ]
                 metrics_mapping[metric] = values

-            except v3io_frames.
+            except v3io_frames.Error as err:
                 logger.warn("Failed to read tsdb", err=err, endpoint=endpoint_id)

         return metrics_mapping
```
```diff
@@ -372,12 +381,11 @@ class V3IOTSDBConnector(TSDBConnector):
         table: str,
         start: Union[datetime, str],
         end: Union[datetime, str],
-        columns:
+        columns: Optional[list[str]] = None,
         filter_query: str = "",
-        interval:
-        agg_funcs:
-
-        sliding_window_step: typing.Optional[str] = None,
+        interval: Optional[str] = None,
+        agg_funcs: Optional[list[str]] = None,
+        sliding_window_step: Optional[str] = None,
         **kwargs,
     ) -> pd.DataFrame:
         """
@@ -400,7 +408,6 @@ class V3IOTSDBConnector(TSDBConnector):
         :param agg_funcs:           The aggregation functions to apply on the columns. Note that if `agg_funcs` is
                                     provided, `interval` must bg provided as well. Provided as a list of strings in
                                     the format of ['sum', 'avg', 'count', ...].
-        :param limit:               The maximum number of records to return.
         :param sliding_window_step: The time step for which the time window moves forward. Note that if
                                     `sliding_window_step` is provided, interval must be provided as well. Provided
                                     as a string in the format of '1m', '1h', etc.
@@ -414,25 +421,28 @@ class V3IOTSDBConnector(TSDBConnector):
                 f"Available tables: {list(self.tables.keys())}"
             )

-
-
-        agg_funcs = ",".join(agg_funcs)
+        # Frames client expects the aggregators to be a comma-separated string
+        aggregators = ",".join(agg_funcs) if agg_funcs else None
         table_path = self.tables[table]
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            df = self._frames_client.read(
+                backend=_TSDB_BE,
+                table=table_path,
+                start=start,
+                end=end,
+                columns=columns,
+                filter=filter_query,
+                aggregation_window=interval,
+                aggregators=aggregators,
+                step=sliding_window_step,
+                **kwargs,
+            )
+        except v3io_frames.ReadError as err:
+            if _is_no_schema_error(err):
+                return pd.DataFrame()
+            else:
+                raise err

-        if limit:
-            df = df.head(limit)
         return df

     def _get_v3io_source_directory(self) -> str:
```
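Two behavior changes in `_get_records` above are easy to miss: the aggregation functions are now passed to frames as a single comma-separated string (or `None`), and a missing-schema `ReadError` degrades to an empty `DataFrame` instead of propagating. A small sketch of both points:

```python
import pandas as pd

agg_funcs = ["sum", "avg", "count"]
# Frames expects one comma-separated string, or None to disable aggregation.
aggregators = ",".join(agg_funcs) if agg_funcs else None
print(aggregators)  # sum,avg,count

# Callers can now treat "no table yet" as "no data".
print(pd.DataFrame().empty)  # True
```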
```diff
@@ -503,8 +513,8 @@ class V3IOTSDBConnector(TSDBConnector):
             raise ValueError(f"Invalid {type = }")

         query = self._get_sql_query(
-            endpoint_id,
-            [(metric.app, metric.name) for metric in metrics],
+            endpoint_id=endpoint_id,
+            metric_and_app_names=[(metric.app, metric.name) for metric in metrics],
             table_path=table_path,
             name=name,
         )
@@ -530,21 +540,28 @@ class V3IOTSDBConnector(TSDBConnector):

     @staticmethod
     def _get_sql_query(
+        *,
         endpoint_id: str,
-        names: list[tuple[str, str]],
         table_path: str,
         name: str = mm_schemas.ResultData.RESULT_NAME,
+        metric_and_app_names: Optional[list[tuple[str, str]]] = None,
+        columns: Optional[list[str]] = None,
     ) -> str:
         """Get the SQL query for the results/metrics table"""
+        if columns:
+            selection = ",".join(columns)
+        else:
+            selection = "*"
+
         with StringIO() as query:
             query.write(
-                f"SELECT
+                f"SELECT {selection} FROM '{table_path}' "
                 f"WHERE {mm_schemas.WriterEvent.ENDPOINT_ID}='{endpoint_id}'"
             )
-            if
+            if metric_and_app_names:
                 query.write(" AND (")

-                for i, (app_name, result_name) in enumerate(
+                for i, (app_name, result_name) in enumerate(metric_and_app_names):
                     sub_cond = (
                         f"({mm_schemas.WriterEvent.APPLICATION_NAME}='{app_name}' "
                         f"AND {name}='{result_name}')"
@@ -566,7 +583,6 @@ class V3IOTSDBConnector(TSDBConnector):
         end: Union[datetime, str],
         aggregation_window: Optional[str] = None,
         agg_funcs: Optional[list[str]] = None,
-        limit: Optional[int] = None,
     ) -> Union[
         mm_schemas.ModelEndpointMonitoringMetricNoData,
         mm_schemas.ModelEndpointMonitoringMetricValues,
@@ -585,7 +601,6 @@ class V3IOTSDBConnector(TSDBConnector):
             filter_query=f"endpoint_id=='{endpoint_id}'",
             interval=aggregation_window,
             agg_funcs=agg_funcs,
-            limit=limit,
             sliding_window_step=aggregation_window,
         )
```
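`_get_sql_query` is now keyword-only, renames `names` to `metric_and_app_names`, and can select explicit columns (such as `count(latency)`) instead of `*`. A simplified, self-contained sketch of the query it composes; the literal field names and the `OR` separator between sub-conditions are assumptions, since the hunk does not show them:

```python
from io import StringIO
from typing import Optional


def get_sql_query(
    *,
    endpoint_id: str,
    table_path: str,
    name: str = "result_name",
    metric_and_app_names: Optional[list[tuple[str, str]]] = None,
    columns: Optional[list[str]] = None,
) -> str:
    selection = ",".join(columns) if columns else "*"
    with StringIO() as query:
        query.write(
            f"SELECT {selection} FROM '{table_path}' "
            f"WHERE endpoint_id='{endpoint_id}'"
        )
        if metric_and_app_names:
            query.write(" AND (")
            for i, (app_name, result_name) in enumerate(metric_and_app_names):
                if i:
                    query.write(" OR ")  # assumed separator
                query.write(
                    f"(application_name='{app_name}' AND {name}='{result_name}')"
                )
            query.write(")")
        return query.getvalue()


print(
    get_sql_query(
        endpoint_id="ep-1",
        table_path="pipelines/my-project/monitoring-apps",
        metric_and_app_names=[("my-app", "data_drift")],
    )
)
```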
```diff
@@ -613,18 +628,33 @@ class V3IOTSDBConnector(TSDBConnector):
         ),  # pyright: ignore[reportArgumentType]
     )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Note: this function serves as a reference for checking the TSDB for the existence of a metric.
+    #
+    # def read_prediction_metric_for_endpoint_if_exists(
+    #     self, endpoint_id: str
+    # ) -> Optional[mm_schemas.ModelEndpointMonitoringMetric]:
+    #     """
+    #     Read the count of the latency column in the predictions table for the given endpoint_id.
+    #     We just want to check if there is any data for this endpoint_id.
+    #     """
+    #     query = self._get_sql_query(
+    #         endpoint_id=endpoint_id,
+    #         table_path=self.tables[mm_schemas.FileTargetKind.PREDICTIONS],
+    #         columns=[f"count({mm_schemas.EventFieldType.LATENCY})"],
+    #     )
+    #     try:
+    #         logger.debug("Checking TSDB", project=self.project, query=query)
+    #         df: pd.DataFrame = self._frames_client.read(
+    #             backend=_TSDB_BE, query=query, start="0", end="now"
+    #         )
+    #     except v3io_frames.ReadError as err:
+    #         if _is_no_schema_error(err):
+    #             logger.debug(
+    #                 "No predictions yet", project=self.project, endpoint_id=endpoint_id
+    #             )
+    #             return
+    #         else:
+    #             raise
+    #
+    #     if not df.empty:
+    #         return get_invocations_metric(self.project)
```
mlrun/model_monitoring/helpers.py
CHANGED

```diff
@@ -25,6 +25,7 @@ from mlrun.common.schemas.model_monitoring import (
     EventFieldType,
 )
 from mlrun.common.schemas.model_monitoring.model_endpoints import (
+    ModelEndpointMonitoringMetric,
     ModelEndpointMonitoringMetricType,
     _compose_full_name,
 )
```
```diff
@@ -58,13 +59,17 @@ def get_stream_path(

     stream_uri = mlrun.get_secret_or_env(
         mlrun.common.schemas.model_monitoring.ProjectSecretKeys.STREAM_PATH
-    ) or mlrun.mlconf.get_model_monitoring_file_target_path(
-        project=project,
-        kind=mlrun.common.schemas.model_monitoring.FileTargetKind.STREAM,
-        target="online",
-        function_name=function_name,
     )

+    if not stream_uri or stream_uri == "v3io":
+        # TODO : remove the first part of this condition in 1.9.0
+        stream_uri = mlrun.mlconf.get_model_monitoring_file_target_path(
+            project=project,
+            kind=mlrun.common.schemas.model_monitoring.FileTargetKind.STREAM,
+            target="online",
+            function_name=function_name,
+        )
+
     if isinstance(stream_uri, list):  # ML-6043 - user side gets only the new stream uri
         stream_uri = stream_uri[1]  # get new stream path, under projects
     return mlrun.common.model_monitoring.helpers.parse_monitoring_stream_path(
```
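The reworked `get_stream_path` treats a stored secret value of `"v3io"` the same as a missing one: both fall through to the per-project default stream path. The control flow reduced to a skeleton (the helper below stands in for the mlrun secret and config calls):

```python
from typing import Optional


def resolve_stream_uri(secret_value: Optional[str], default_path: str) -> str:
    # Mirrors the source TODO: the "not secret_value" branch is slated for removal in 1.9.0.
    if not secret_value or secret_value == "v3io":
        return default_path
    return secret_value


default = "v3io:///projects/my-project/model-endpoints/stream"
print(resolve_stream_uri(None, default))                            # default path
print(resolve_stream_uri("v3io", default))                          # default path
print(resolve_stream_uri("kafka://broker:9092?topic=mm", default))  # explicit URI wins
```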
```diff
@@ -96,7 +101,7 @@ def get_monitoring_parquet_path(
     return parquet_path


-def get_connection_string(secret_provider: typing.Callable = None) -> str:
+def get_connection_string(secret_provider: typing.Callable[[str], str] = None) -> str:
     """Get endpoint store connection string from the project secret. If wasn't set, take it from the system
     configurations.

@@ -116,7 +121,7 @@ def get_connection_string(secret_provider: typing.Callable = None) -> str:


 def get_tsdb_connection_string(
-    secret_provider: typing.Optional[typing.Callable] = None,
+    secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
 ) -> str:
     """Get TSDB connection string from the project secret. If wasn't set, take it from the system
     configurations.
```
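Several signatures in this release tighten `secret_provider` from a bare `typing.Callable` to `typing.Callable[[str], str]`: a function mapping a secret key to its value. Any callable with that shape fits; an environment-variable lookup is a natural example (a sketch, not an mlrun-provided helper):

```python
import os
from typing import Callable


def env_secret_provider(key: str) -> str:
    """Resolve a secret key from the process environment (empty string if unset)."""
    return os.environ.get(key, "")


provider: Callable[[str], str] = env_secret_provider
# e.g. get_tsdb_connection_string(secret_provider=provider)
```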
```diff
@@ -277,9 +282,13 @@ def calculate_inputs_statistics(
     return inputs_statistics


-def get_endpoint_record(
+def get_endpoint_record(
+    project: str,
+    endpoint_id: str,
+    secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
+) -> dict[str, typing.Any]:
     model_endpoint_store = mlrun.model_monitoring.get_store_object(
-        project=project,
+        project=project, secret_provider=secret_provider
     )
     return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
```
```diff
@@ -305,3 +314,19 @@ def get_invocations_fqn(project: str) -> str:
         name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
         type=ModelEndpointMonitoringMetricType.METRIC,
     )
+
+
+def get_invocations_metric(project: str) -> ModelEndpointMonitoringMetric:
+    """
+    Return the invocations metric of any model endpoint in the given project.
+
+    :param project: The project name.
+    :returns: The model monitoring metric object.
+    """
+    return ModelEndpointMonitoringMetric(
+        project=project,
+        app=mm_constants.SpecialApps.MLRUN_INFRA,
+        type=ModelEndpointMonitoringMetricType.METRIC,
+        name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
+        full_name=get_invocations_fqn(project),
+    )
```
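The new `get_invocations_metric` complements the existing `get_invocations_fqn`, so every caller (including the commented-out reference implementations above) builds the same `mlrun-infra` invocations metric object. A hedged usage sketch, assuming an environment with mlrun installed:

```python
from mlrun.model_monitoring.helpers import get_invocations_fqn, get_invocations_metric

metric = get_invocations_metric("my-project")
# The object's full_name matches the FQN helper's output.
assert metric.full_name == get_invocations_fqn("my-project")
```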
mlrun/model_monitoring/stream_processing.py
CHANGED

```diff
@@ -66,10 +66,6 @@ class EventStreamProcessor:
         self.parquet_batching_max_events = parquet_batching_max_events
         self.parquet_batching_timeout_secs = parquet_batching_timeout_secs

-        self.model_endpoint_store_target = (
-            mlrun.mlconf.model_endpoint_monitoring.store_type
-        )
-
         logger.info(
             "Initializing model monitoring event stream processor",
             parquet_path=self.parquet_path,
```
```diff
@@ -139,7 +135,7 @@ class EventStreamProcessor:
     def apply_monitoring_serving_graph(
         self,
         fn: mlrun.runtimes.ServingRuntime,
-
+        secret_provider: typing.Optional[typing.Callable[[str], str]] = None,
     ) -> None:
         """
         Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
@@ -167,7 +163,8 @@ class EventStreamProcessor:
                        using CE, the parquet target path is based on the defined MLRun artifact path.

         :param fn: A serving function.
-        :param
+        :param secret_provider: An optional callable function that provides the connection string from the project
+                                secret.
         """

         graph = typing.cast(
@@ -293,7 +290,6 @@ class EventStreamProcessor:
             name="UpdateEndpoint",
             after="ProcessBeforeEndpointUpdate",
             project=self.project,
-            model_endpoint_store_target=self.model_endpoint_store_target,
         )

         apply_update_endpoint()
@@ -310,7 +306,10 @@ class EventStreamProcessor:
             table=self.kv_path,
         )

-
+        store_object = mlrun.model_monitoring.get_store_object(
+            project=self.project, secret_provider=secret_provider
+        )
+        if store_object.type == ModelEndpointTarget.V3IO_NOSQL:
             apply_infer_schema()

         # Emits the event in window size of events based on sample_window size (10 by default)
@@ -328,7 +327,7 @@ class EventStreamProcessor:
         # TSDB branch (skip to Prometheus if in CE env)
         if not mlrun.mlconf.is_ce_mode():
             tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
-                project=self.project, secret_provider=
+                project=self.project, secret_provider=secret_provider
             )
             tsdb_connector.apply_monitoring_stream_steps(graph=graph)

@@ -904,7 +903,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):


 class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
-    def __init__(self, project: str,
+    def __init__(self, project: str, **kwargs):
         """
         Update the model endpoint record in the DB. Note that the event at this point includes metadata and stats about
         the average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
@@ -914,9 +913,11 @@ class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
         """
         super().__init__(**kwargs)
         self.project = project
-        self.model_endpoint_store_target = model_endpoint_store_target

     def do(self, event: dict):
+        # Remove labels from the event
+        event.pop(EventFieldType.LABELS)
+
         update_endpoint_record(
             project=self.project,
             endpoint_id=event.pop(EventFieldType.ENDPOINT_ID),
```
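`UpdateEndpoint.do` now drops the labels field before persisting the record, using `dict.pop` both to discard keys and to read-and-remove them in one step. The same idiom on a plain dict (field names here are illustrative):

```python
event = {"endpoint_id": "ep-1", "labels": {"env": "prod"}, "latency_avg_5m": 12.3}

event.pop("labels")                     # discard: labels are not written to the DB
endpoint_id = event.pop("endpoint_id")  # read and remove in one step

print(endpoint_id)  # ep-1
print(event)        # {'latency_avg_5m': 12.3}
```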
mlrun/model_monitoring/writer.py
CHANGED

```diff
@@ -13,7 +13,7 @@
 # limitations under the License.

 import json
-from typing import Any, NewType
+from typing import Any, Callable, NewType

 import mlrun.common.model_monitoring
 import mlrun.common.schemas
@@ -30,7 +30,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
     WriterEventKind,
 )
 from mlrun.common.schemas.notification import NotificationKind, NotificationSeverity
-from mlrun.model_monitoring.helpers import
+from mlrun.model_monitoring.helpers import get_result_instance_fqn
 from mlrun.serving.utils import StepToDict
 from mlrun.utils import logger
 from mlrun.utils.notifications.notification_pusher import CustomNotificationPusher
@@ -102,7 +102,11 @@ class ModelMonitoringWriter(StepToDict):

     kind = "monitoring_application_stream_pusher"

-    def __init__(
+    def __init__(
+        self,
+        project: str,
+        secret_provider: Callable = None,
+    ) -> None:
         self.project = project
         self.name = project  # required for the deployment process

@@ -111,10 +115,10 @@ class ModelMonitoringWriter(StepToDict):
         )

         self._app_result_store = mlrun.model_monitoring.get_store_object(
-            project=self.project
+            project=self.project, secret_provider=secret_provider
         )
         self._tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
-            project=self.project, secret_provider=
+            project=self.project, secret_provider=secret_provider
         )
         self._endpoints_records = {}

@@ -149,11 +153,7 @@ class ModelMonitoringWriter(StepToDict):
         result_kind: int, result_status: int
     ) -> alert_objects.EventKind:
         """Generate the required Event Kind format for the alerting system"""
-
-            # Custom kind is represented as an anomaly detection
-            event_kind = "mm_app_anomaly"
-        else:
-            event_kind = ResultKindApp(value=result_kind).name
+        event_kind = ResultKindApp(value=result_kind).name

         if result_status == ResultStatusApp.detected.value:
             event_kind = f"{event_kind}_detected"
@@ -223,7 +223,7 @@ class ModelMonitoringWriter(StepToDict):
         endpoint_id = event[WriterEvent.ENDPOINT_ID]
         endpoint_record = self._endpoints_records.setdefault(
             endpoint_id,
-
+            self._app_result_store.get_model_endpoint(endpoint_id=endpoint_id),
         )
         event_value = {
             "app_name": event[WriterEvent.APPLICATION_NAME],
```
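With the widened `__init__`, the writer threads a single `secret_provider` into both the application-result store and the TSDB connector, so the two back ends resolve credentials identically. A hedged construction sketch (requires a configured mlrun environment):

```python
import os

from mlrun.model_monitoring.writer import ModelMonitoringWriter

writer = ModelMonitoringWriter(
    project="my-project",
    secret_provider=lambda key: os.environ.get(key, ""),
)
```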
mlrun/projects/operations.py
CHANGED

```diff
@@ -77,6 +77,7 @@ def run_function(
     notifications: list[mlrun.model.Notification] = None,
     returns: Optional[list[Union[str, dict[str, str]]]] = None,
     builder_env: Optional[list] = None,
+    reset_on_run: Optional[bool] = None,
 ) -> Union[mlrun.model.RunObject, PipelineNodeWrapper]:
     """Run a local or remote task as part of a local/kubeflow pipeline

@@ -167,6 +168,9 @@ def run_function(
                          artifact type can be given there. The artifact key must appear in the dictionary as
                          "key": "the_key".
     :param builder_env: env vars dict for source archive config/credentials e.g. builder_env={"GIT_TOKEN": token}
+    :param reset_on_run: When True, function python modules would reload prior to code execution.
+                         This ensures latest code changes are executed. This argument must be used in
+                         conjunction with the local=True argument.
     :return: MLRun RunObject or PipelineNodeWrapper
     """
     engine, function = _get_engine_and_function(function, project_object)
@@ -215,6 +219,7 @@ def run_function(
         schedule=schedule,
         notifications=notifications,
         builder_env=builder_env,
+        reset_on_run=reset_on_run,
     )
     if run_result:
         run_result._notified = False
```
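`reset_on_run` only takes effect together with `local=True`; it reloads the function's python modules before execution so that repeated local runs pick up code edits without restarting the interpreter. A hedged usage sketch (project name, function name, and handler are illustrative):

```python
import mlrun

project = mlrun.get_or_create_project("my-project", context="./")

run = project.run_function(
    "trainer",          # a project function backed by e.g. trainer.py
    handler="train",
    local=True,         # reset_on_run must be used with local execution
    reset_on_run=True,  # reload modules so the latest code changes execute
)
```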