mlrun 1.10.0rc40__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +3 -2
- mlrun/__main__.py +0 -4
- mlrun/artifacts/dataset.py +2 -2
- mlrun/artifacts/plots.py +1 -1
- mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
- mlrun/auth/nuclio.py +89 -0
- mlrun/auth/providers.py +429 -0
- mlrun/auth/utils.py +415 -0
- mlrun/common/constants.py +7 -0
- mlrun/common/model_monitoring/helpers.py +41 -4
- mlrun/common/runtimes/constants.py +28 -0
- mlrun/common/schemas/__init__.py +13 -3
- mlrun/common/schemas/alert.py +2 -2
- mlrun/common/schemas/api_gateway.py +3 -0
- mlrun/common/schemas/auth.py +10 -10
- mlrun/common/schemas/client_spec.py +4 -0
- mlrun/common/schemas/constants.py +25 -0
- mlrun/common/schemas/frontend_spec.py +1 -8
- mlrun/common/schemas/function.py +24 -0
- mlrun/common/schemas/hub.py +3 -2
- mlrun/common/schemas/model_monitoring/__init__.py +1 -1
- mlrun/common/schemas/model_monitoring/constants.py +2 -2
- mlrun/common/schemas/secret.py +17 -2
- mlrun/common/secrets.py +95 -1
- mlrun/common/types.py +10 -10
- mlrun/config.py +53 -15
- mlrun/data_types/infer.py +2 -2
- mlrun/datastore/__init__.py +2 -3
- mlrun/datastore/base.py +274 -10
- mlrun/datastore/datastore.py +1 -1
- mlrun/datastore/datastore_profile.py +49 -17
- mlrun/datastore/model_provider/huggingface_provider.py +6 -2
- mlrun/datastore/model_provider/model_provider.py +2 -2
- mlrun/datastore/model_provider/openai_provider.py +2 -2
- mlrun/datastore/s3.py +15 -16
- mlrun/datastore/sources.py +1 -1
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +16 -10
- mlrun/datastore/targets.py +1 -1
- mlrun/datastore/utils.py +16 -3
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/base.py +36 -12
- mlrun/db/httpdb.py +316 -101
- mlrun/db/nopdb.py +29 -11
- mlrun/errors.py +4 -2
- mlrun/execution.py +11 -12
- mlrun/feature_store/api.py +1 -1
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_vector_utils.py +1 -1
- mlrun/feature_store/steps.py +8 -6
- mlrun/frameworks/_common/utils.py +3 -3
- mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
- mlrun/frameworks/_ml_common/utils.py +2 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
- mlrun/frameworks/onnx/dataset.py +2 -1
- mlrun/frameworks/onnx/mlrun_interface.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/utils.py +2 -1
- mlrun/frameworks/sklearn/metric.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/hub/__init__.py +37 -0
- mlrun/hub/base.py +142 -0
- mlrun/hub/module.py +67 -76
- mlrun/hub/step.py +113 -0
- mlrun/launcher/base.py +2 -1
- mlrun/launcher/local.py +2 -1
- mlrun/model.py +12 -2
- mlrun/model_monitoring/__init__.py +0 -1
- mlrun/model_monitoring/api.py +2 -2
- mlrun/model_monitoring/applications/base.py +20 -6
- mlrun/model_monitoring/applications/context.py +1 -0
- mlrun/model_monitoring/controller.py +7 -17
- mlrun/model_monitoring/db/_schedules.py +2 -16
- mlrun/model_monitoring/db/_stats.py +2 -13
- mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
- mlrun/model_monitoring/db/tsdb/base.py +2 -4
- mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
- mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +4 -6
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +147 -79
- mlrun/model_monitoring/features_drift_table.py +2 -1
- mlrun/model_monitoring/helpers.py +2 -1
- mlrun/model_monitoring/stream_processing.py +18 -16
- mlrun/model_monitoring/writer.py +4 -3
- mlrun/package/__init__.py +2 -1
- mlrun/platforms/__init__.py +0 -44
- mlrun/platforms/iguazio.py +1 -1
- mlrun/projects/operations.py +11 -10
- mlrun/projects/project.py +81 -82
- mlrun/run.py +4 -7
- mlrun/runtimes/__init__.py +2 -204
- mlrun/runtimes/base.py +89 -21
- mlrun/runtimes/constants.py +225 -0
- mlrun/runtimes/daskjob.py +4 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
- mlrun/runtimes/mounts.py +5 -0
- mlrun/runtimes/nuclio/__init__.py +12 -8
- mlrun/runtimes/nuclio/api_gateway.py +36 -6
- mlrun/runtimes/nuclio/application/application.py +200 -32
- mlrun/runtimes/nuclio/function.py +154 -49
- mlrun/runtimes/nuclio/serving.py +55 -42
- mlrun/runtimes/pod.py +59 -10
- mlrun/secrets.py +46 -2
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +5 -5
- mlrun/serving/routers.py +3 -3
- mlrun/serving/server.py +46 -43
- mlrun/serving/serving_wrapper.py +6 -2
- mlrun/serving/states.py +554 -207
- mlrun/serving/steps.py +1 -1
- mlrun/serving/system_steps.py +42 -33
- mlrun/track/trackers/mlflow_tracker.py +29 -31
- mlrun/utils/helpers.py +89 -16
- mlrun/utils/http.py +9 -2
- mlrun/utils/notifications/notification/git.py +1 -1
- mlrun/utils/notifications/notification/mail.py +39 -16
- mlrun/utils/notifications/notification_pusher.py +2 -2
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +3 -4
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +39 -49
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +144 -130
- mlrun/db/auth_utils.py +0 -152
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -343
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1368
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +0 -51
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
|
@@ -1,1368 +0,0 @@
|
|
|
1
|
-
# Copyright 2024 Iguazio
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
import threading
|
|
16
|
-
from datetime import datetime, timedelta
|
|
17
|
-
from typing import Callable, Final, Literal, Optional, Union
|
|
18
|
-
|
|
19
|
-
import pandas as pd
|
|
20
|
-
import taosws
|
|
21
|
-
|
|
22
|
-
import mlrun.common.schemas.model_monitoring as mm_schemas
|
|
23
|
-
import mlrun.common.types
|
|
24
|
-
import mlrun.model_monitoring.db.tsdb.tdengine.schemas as tdengine_schemas
|
|
25
|
-
from mlrun.config import config
|
|
26
|
-
from mlrun.datastore.datastore_profile import DatastoreProfile
|
|
27
|
-
from mlrun.model_monitoring.db import TSDBConnector
|
|
28
|
-
from mlrun.model_monitoring.db.tsdb.tdengine.tdengine_connection import (
|
|
29
|
-
Statement,
|
|
30
|
-
TDEngineConnection,
|
|
31
|
-
)
|
|
32
|
-
from mlrun.model_monitoring.helpers import get_invocations_fqn, get_start_end
|
|
33
|
-
from mlrun.utils import logger
|
|
34
|
-
|
|
35
|
-
# Thread-local storage for connections
|
|
36
|
-
_thread_local = threading.local()
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class TDEngineTimestampPrecision(mlrun.common.types.StrEnum):
|
|
40
|
-
"""
|
|
41
|
-
The timestamp precision for the TDEngine database.
|
|
42
|
-
For more information, see:
|
|
43
|
-
https://docs.tdengine.com/tdengine-reference/sql-manual/data-types/#timestamp
|
|
44
|
-
https://docs.tdengine.com/tdengine-reference/sql-manual/manage-databases/#create-database
|
|
45
|
-
"""
|
|
46
|
-
|
|
47
|
-
MILLISECOND = "ms" # TDEngine's default
|
|
48
|
-
MICROSECOND = "us" # MLRun's default
|
|
49
|
-
NANOSECOND = "ns"
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class TDEngineConnector(TSDBConnector):
|
|
53
|
-
"""
|
|
54
|
-
Handles the TSDB operations when the TSDB connector is of type TDEngine.
|
|
55
|
-
"""
|
|
56
|
-
|
|
57
|
-
type: str = mm_schemas.TSDBTarget.TDEngine
|
|
58
|
-
|
|
59
|
-
def __init__(
|
|
60
|
-
self,
|
|
61
|
-
project: str,
|
|
62
|
-
profile: DatastoreProfile,
|
|
63
|
-
timestamp_precision: TDEngineTimestampPrecision = TDEngineTimestampPrecision.MICROSECOND,
|
|
64
|
-
):
|
|
65
|
-
super().__init__(project=project)
|
|
66
|
-
|
|
67
|
-
self._tdengine_connection_profile = profile
|
|
68
|
-
|
|
69
|
-
self._timestamp_precision: Final = ( # cannot be changed after initialization
|
|
70
|
-
timestamp_precision
|
|
71
|
-
)
|
|
72
|
-
|
|
73
|
-
if not mlrun.mlconf.system_id:
|
|
74
|
-
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
75
|
-
"system_id is not set in mlrun.mlconf. "
|
|
76
|
-
"TDEngineConnector requires system_id to be configured for database name construction. "
|
|
77
|
-
"Please ensure MLRun configuration is properly loaded before creating TDEngineConnector."
|
|
78
|
-
)
|
|
79
|
-
self.database = (
|
|
80
|
-
f"{tdengine_schemas._MODEL_MONITORING_DATABASE}_{mlrun.mlconf.system_id}"
|
|
81
|
-
)
|
|
82
|
-
self._init_super_tables()
|
|
83
|
-
|
|
84
|
-
@property
|
|
85
|
-
def connection(self) -> TDEngineConnection:
|
|
86
|
-
if not hasattr(_thread_local, "connection"):
|
|
87
|
-
_thread_local.connection = self._create_connection()
|
|
88
|
-
logger.debug(
|
|
89
|
-
"Created new TDEngine connection for thread",
|
|
90
|
-
project=self.project,
|
|
91
|
-
thread_name=threading.current_thread().name,
|
|
92
|
-
thread_id=threading.get_ident(),
|
|
93
|
-
)
|
|
94
|
-
return _thread_local.connection
|
|
95
|
-
|
|
96
|
-
def _create_connection(self) -> TDEngineConnection:
|
|
97
|
-
"""Establish a connection to the TSDB server."""
|
|
98
|
-
logger.debug("Creating a new connection to TDEngine", project=self.project)
|
|
99
|
-
conn = TDEngineConnection(
|
|
100
|
-
self._tdengine_connection_profile.dsn(),
|
|
101
|
-
)
|
|
102
|
-
conn.prefix_statements = [f"USE {self.database}"]
|
|
103
|
-
|
|
104
|
-
return conn
|
|
105
|
-
|
|
106
|
-
def _init_super_tables(self):
|
|
107
|
-
"""Initialize the super tables for the TSDB."""
|
|
108
|
-
self.tables = {
|
|
109
|
-
mm_schemas.TDEngineSuperTables.APP_RESULTS: tdengine_schemas.AppResultTable(
|
|
110
|
-
project=self.project, database=self.database
|
|
111
|
-
),
|
|
112
|
-
mm_schemas.TDEngineSuperTables.METRICS: tdengine_schemas.Metrics(
|
|
113
|
-
project=self.project, database=self.database
|
|
114
|
-
),
|
|
115
|
-
mm_schemas.TDEngineSuperTables.PREDICTIONS: tdengine_schemas.Predictions(
|
|
116
|
-
project=self.project, database=self.database
|
|
117
|
-
),
|
|
118
|
-
mm_schemas.TDEngineSuperTables.ERRORS: tdengine_schemas.Errors(
|
|
119
|
-
project=self.project, database=self.database
|
|
120
|
-
),
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
def _create_db_if_not_exists(self):
|
|
124
|
-
"""Create the database if it does not exist."""
|
|
125
|
-
self.connection.prefix_statements = []
|
|
126
|
-
self.connection.run(
|
|
127
|
-
statements=f"CREATE DATABASE IF NOT EXISTS {self.database} PRECISION '{self._timestamp_precision}'",
|
|
128
|
-
)
|
|
129
|
-
self.connection.prefix_statements = [f"USE {self.database}"]
|
|
130
|
-
logger.debug(
|
|
131
|
-
"The TDEngine database is currently in use",
|
|
132
|
-
project=self.project,
|
|
133
|
-
database=self.database,
|
|
134
|
-
)
|
|
135
|
-
|
|
136
|
-
def create_tables(self):
|
|
137
|
-
"""Create TDEngine supertables."""
|
|
138
|
-
|
|
139
|
-
# Create the database if it does not exist
|
|
140
|
-
self._create_db_if_not_exists()
|
|
141
|
-
|
|
142
|
-
for table in self.tables:
|
|
143
|
-
create_table_query = self.tables[table]._create_super_table_query()
|
|
144
|
-
conn = self.connection
|
|
145
|
-
conn.run(
|
|
146
|
-
statements=create_table_query,
|
|
147
|
-
)
|
|
148
|
-
|
|
149
|
-
def write_application_event(
|
|
150
|
-
self,
|
|
151
|
-
event: dict,
|
|
152
|
-
kind: mm_schemas.WriterEventKind = mm_schemas.WriterEventKind.RESULT,
|
|
153
|
-
) -> None:
|
|
154
|
-
"""
|
|
155
|
-
Write a single result or metric to TSDB.
|
|
156
|
-
"""
|
|
157
|
-
|
|
158
|
-
table_name = (
|
|
159
|
-
f"{event[mm_schemas.WriterEvent.ENDPOINT_ID]}_"
|
|
160
|
-
f"{event[mm_schemas.WriterEvent.APPLICATION_NAME]}"
|
|
161
|
-
)
|
|
162
|
-
|
|
163
|
-
if kind == mm_schemas.WriterEventKind.RESULT:
|
|
164
|
-
# Write a new result
|
|
165
|
-
table = self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS]
|
|
166
|
-
table_name = (
|
|
167
|
-
f"{table_name}_{event[mm_schemas.ResultData.RESULT_NAME]}"
|
|
168
|
-
).replace("-", "_")
|
|
169
|
-
|
|
170
|
-
else:
|
|
171
|
-
# Write a new metric
|
|
172
|
-
table = self.tables[mm_schemas.TDEngineSuperTables.METRICS]
|
|
173
|
-
table_name = (
|
|
174
|
-
f"{table_name}_{event[mm_schemas.MetricData.METRIC_NAME]}"
|
|
175
|
-
).replace("-", "_")
|
|
176
|
-
|
|
177
|
-
# Escape the table name for case-sensitivity (ML-7908)
|
|
178
|
-
# https://github.com/taosdata/taos-connector-python/issues/260
|
|
179
|
-
table_name = f"`{table_name}`"
|
|
180
|
-
|
|
181
|
-
# Convert the datetime strings to datetime objects
|
|
182
|
-
event[mm_schemas.WriterEvent.END_INFER_TIME] = self._convert_to_datetime(
|
|
183
|
-
val=event[mm_schemas.WriterEvent.END_INFER_TIME]
|
|
184
|
-
)
|
|
185
|
-
event[mm_schemas.WriterEvent.START_INFER_TIME] = self._convert_to_datetime(
|
|
186
|
-
val=event[mm_schemas.WriterEvent.START_INFER_TIME]
|
|
187
|
-
)
|
|
188
|
-
|
|
189
|
-
create_table_sql = table._create_subtable_sql(subtable=table_name, values=event)
|
|
190
|
-
|
|
191
|
-
# we need the string values to be sent to the connection, not the enum
|
|
192
|
-
columns = {str(key): str(val) for key, val in table.columns.items()}
|
|
193
|
-
|
|
194
|
-
insert_statement = Statement(
|
|
195
|
-
columns=columns,
|
|
196
|
-
subtable=table_name,
|
|
197
|
-
values=event,
|
|
198
|
-
timestamp_precision=self._timestamp_precision,
|
|
199
|
-
)
|
|
200
|
-
|
|
201
|
-
self.connection.run(
|
|
202
|
-
statements=[
|
|
203
|
-
create_table_sql,
|
|
204
|
-
insert_statement,
|
|
205
|
-
],
|
|
206
|
-
)
|
|
207
|
-
|
|
208
|
-
@staticmethod
|
|
209
|
-
def _convert_to_datetime(val: Union[str, datetime]) -> datetime:
|
|
210
|
-
return datetime.fromisoformat(val) if isinstance(val, str) else val
|
|
211
|
-
|
|
212
|
-
@staticmethod
|
|
213
|
-
def _generate_filter_query(
|
|
214
|
-
filter_column: str, filter_values: Union[str, list[Union[str, int]]]
|
|
215
|
-
) -> str:
|
|
216
|
-
"""
|
|
217
|
-
Generate a filter query for TDEngine based on the provided column and values.
|
|
218
|
-
|
|
219
|
-
:param filter_column: The column to filter by.
|
|
220
|
-
:param filter_values: A single value or a list of values to filter by.
|
|
221
|
-
|
|
222
|
-
:return: A string representing the filter query.
|
|
223
|
-
:raise: ``MLRunValueError`` if the filter values are not of type string or list.
|
|
224
|
-
"""
|
|
225
|
-
if isinstance(filter_values, str):
|
|
226
|
-
return f"{filter_column}='{filter_values}'"
|
|
227
|
-
elif isinstance(filter_values, list):
|
|
228
|
-
return f"{filter_column} IN ({', '.join(repr(v) for v in filter_values)}) "
|
|
229
|
-
else:
|
|
230
|
-
raise mlrun.errors.MLRunValueError(
|
|
231
|
-
f"Invalid filter values {filter_values}: must be a string or a list, "
|
|
232
|
-
f"got {type(filter_values).__name__}; filter values: {filter_values}"
|
|
233
|
-
)
|
|
234
|
-
|
|
235
|
-
def _drop_database_query(self) -> str:
|
|
236
|
-
return f"DROP DATABASE IF EXISTS {self.database};"
|
|
237
|
-
|
|
238
|
-
def _get_table_name_query(self) -> str:
|
|
239
|
-
return f"SELECT table_name FROM information_schema.ins_tables where db_name='{self.database}' LIMIT 1;"
|
|
240
|
-
|
|
241
|
-
def apply_monitoring_stream_steps(self, graph, **kwarg):
|
|
242
|
-
"""
|
|
243
|
-
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
244
|
-
different key metric dictionaries. This data is being used by the monitoring dashboards in
|
|
245
|
-
Grafana. At the moment, we store two types of data:
|
|
246
|
-
- prediction latency.
|
|
247
|
-
- custom metrics.
|
|
248
|
-
"""
|
|
249
|
-
|
|
250
|
-
def apply_process_before_tsdb():
|
|
251
|
-
graph.add_step(
|
|
252
|
-
"mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps.ProcessBeforeTDEngine",
|
|
253
|
-
name="ProcessBeforeTDEngine",
|
|
254
|
-
after="FilterNOP",
|
|
255
|
-
)
|
|
256
|
-
|
|
257
|
-
def apply_tdengine_target(name, after):
|
|
258
|
-
graph.add_step(
|
|
259
|
-
"mlrun.datastore.storeytargets.TDEngineStoreyTarget",
|
|
260
|
-
name=name,
|
|
261
|
-
after=after,
|
|
262
|
-
url=f"ds://{self._tdengine_connection_profile.name}",
|
|
263
|
-
supertable=self.tables[
|
|
264
|
-
mm_schemas.TDEngineSuperTables.PREDICTIONS
|
|
265
|
-
].super_table,
|
|
266
|
-
table_col=mm_schemas.EventFieldType.TABLE_COLUMN,
|
|
267
|
-
time_col=mm_schemas.EventFieldType.TIME,
|
|
268
|
-
database=self.database,
|
|
269
|
-
columns=[
|
|
270
|
-
mm_schemas.EventFieldType.LATENCY,
|
|
271
|
-
mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
|
|
272
|
-
mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
|
|
273
|
-
mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
|
|
274
|
-
],
|
|
275
|
-
tag_cols=[
|
|
276
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
277
|
-
],
|
|
278
|
-
max_events=1000,
|
|
279
|
-
flush_after_seconds=30,
|
|
280
|
-
)
|
|
281
|
-
|
|
282
|
-
apply_process_before_tsdb()
|
|
283
|
-
apply_tdengine_target(
|
|
284
|
-
name="TDEngineTarget",
|
|
285
|
-
after="ProcessBeforeTDEngine",
|
|
286
|
-
)
|
|
287
|
-
|
|
288
|
-
def add_pre_writer_steps(self, graph, after):
|
|
289
|
-
return graph.add_step(
|
|
290
|
-
"mlrun.model_monitoring.db.tsdb.tdengine.writer_graph_steps.ProcessBeforeTDEngine",
|
|
291
|
-
name="ProcessBeforeTDEngine",
|
|
292
|
-
after=after,
|
|
293
|
-
)
|
|
294
|
-
|
|
295
|
-
def apply_writer_steps(self, graph, after, **kwargs) -> None:
|
|
296
|
-
graph.add_step(
|
|
297
|
-
"mlrun.datastore.storeytargets.TDEngineStoreyTarget",
|
|
298
|
-
name="tsdb_metrics",
|
|
299
|
-
after=after,
|
|
300
|
-
url=f"ds://{self._tdengine_connection_profile.name}",
|
|
301
|
-
supertable=self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table,
|
|
302
|
-
table_col=mm_schemas.EventFieldType.TABLE_COLUMN,
|
|
303
|
-
time_col=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
304
|
-
database=self.database,
|
|
305
|
-
graph_shape="cylinder",
|
|
306
|
-
columns=[
|
|
307
|
-
mm_schemas.WriterEvent.START_INFER_TIME,
|
|
308
|
-
mm_schemas.MetricData.METRIC_VALUE,
|
|
309
|
-
],
|
|
310
|
-
tag_cols=[
|
|
311
|
-
mm_schemas.WriterEvent.ENDPOINT_ID,
|
|
312
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
313
|
-
mm_schemas.MetricData.METRIC_NAME,
|
|
314
|
-
],
|
|
315
|
-
max_events=config.model_endpoint_monitoring.writer_graph.max_events,
|
|
316
|
-
flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
|
|
317
|
-
)
|
|
318
|
-
|
|
319
|
-
graph.add_step(
|
|
320
|
-
"mlrun.datastore.storeytargets.TDEngineStoreyTarget",
|
|
321
|
-
name="tsdb_app_results",
|
|
322
|
-
after=after,
|
|
323
|
-
url=f"ds://{self._tdengine_connection_profile.name}",
|
|
324
|
-
supertable=self.tables[
|
|
325
|
-
mm_schemas.TDEngineSuperTables.APP_RESULTS
|
|
326
|
-
].super_table,
|
|
327
|
-
table_col=mm_schemas.EventFieldType.TABLE_COLUMN,
|
|
328
|
-
time_col=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
329
|
-
database=self.database,
|
|
330
|
-
graph_shape="cylinder",
|
|
331
|
-
columns=[
|
|
332
|
-
mm_schemas.WriterEvent.START_INFER_TIME,
|
|
333
|
-
mm_schemas.ResultData.RESULT_VALUE,
|
|
334
|
-
mm_schemas.ResultData.RESULT_STATUS,
|
|
335
|
-
mm_schemas.ResultData.RESULT_EXTRA_DATA,
|
|
336
|
-
],
|
|
337
|
-
tag_cols=[
|
|
338
|
-
mm_schemas.WriterEvent.ENDPOINT_ID,
|
|
339
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
340
|
-
mm_schemas.ResultData.RESULT_NAME,
|
|
341
|
-
mm_schemas.ResultData.RESULT_KIND,
|
|
342
|
-
],
|
|
343
|
-
max_events=config.model_endpoint_monitoring.writer_graph.max_events,
|
|
344
|
-
flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
|
|
345
|
-
)
|
|
346
|
-
|
|
347
|
-
def handle_model_error(
|
|
348
|
-
self,
|
|
349
|
-
graph,
|
|
350
|
-
tsdb_batching_max_events: int = 1000,
|
|
351
|
-
tsdb_batching_timeout_secs: int = 30,
|
|
352
|
-
**kwargs,
|
|
353
|
-
) -> None:
|
|
354
|
-
graph.add_step(
|
|
355
|
-
"mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps.ErrorExtractor",
|
|
356
|
-
name="error_extractor",
|
|
357
|
-
after="ForwardError",
|
|
358
|
-
)
|
|
359
|
-
graph.add_step(
|
|
360
|
-
"mlrun.datastore.storeytargets.TDEngineStoreyTarget",
|
|
361
|
-
name="tsdb_error",
|
|
362
|
-
after="error_extractor",
|
|
363
|
-
url=f"ds://{self._tdengine_connection_profile.name}",
|
|
364
|
-
supertable=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
|
|
365
|
-
table_col=mm_schemas.EventFieldType.TABLE_COLUMN,
|
|
366
|
-
time_col=mm_schemas.EventFieldType.TIME,
|
|
367
|
-
database=self.database,
|
|
368
|
-
columns=[
|
|
369
|
-
mm_schemas.EventFieldType.MODEL_ERROR,
|
|
370
|
-
],
|
|
371
|
-
tag_cols=[
|
|
372
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
373
|
-
mm_schemas.EventFieldType.ERROR_TYPE,
|
|
374
|
-
],
|
|
375
|
-
max_events=tsdb_batching_max_events,
|
|
376
|
-
flush_after_seconds=tsdb_batching_timeout_secs,
|
|
377
|
-
)
|
|
378
|
-
|
|
379
|
-
def delete_tsdb_records(self, endpoint_ids: list[str]) -> None:
|
|
380
|
-
"""
|
|
381
|
-
To delete subtables within TDEngine, we first query the subtables names with the provided endpoint_ids.
|
|
382
|
-
Then, we drop each subtable.
|
|
383
|
-
"""
|
|
384
|
-
logger.debug(
|
|
385
|
-
"Deleting model endpoint resources using the TDEngine connector",
|
|
386
|
-
project=self.project,
|
|
387
|
-
number_of_endpoints_to_delete=len(endpoint_ids),
|
|
388
|
-
)
|
|
389
|
-
|
|
390
|
-
# Get all subtables with the provided endpoint_ids
|
|
391
|
-
subtables = []
|
|
392
|
-
try:
|
|
393
|
-
for table in self.tables:
|
|
394
|
-
get_subtable_query = self.tables[table]._get_subtables_query_by_tag(
|
|
395
|
-
filter_tag="endpoint_id", filter_values=endpoint_ids
|
|
396
|
-
)
|
|
397
|
-
subtables_result = self.connection.run(query=get_subtable_query)
|
|
398
|
-
subtables.extend([subtable[0] for subtable in subtables_result.data])
|
|
399
|
-
except Exception as e:
|
|
400
|
-
logger.warning(
|
|
401
|
-
"Failed to get subtables for deletion. You may need to delete them manually."
|
|
402
|
-
"These can be found under the following supertables: app_results, "
|
|
403
|
-
"metrics, errors, and predictions.",
|
|
404
|
-
project=self.project,
|
|
405
|
-
error=mlrun.errors.err_to_str(e),
|
|
406
|
-
)
|
|
407
|
-
|
|
408
|
-
# Prepare the drop statements
|
|
409
|
-
drop_statements = [
|
|
410
|
-
self.tables[table].drop_subtable_query(subtable=subtable)
|
|
411
|
-
for subtable in subtables
|
|
412
|
-
]
|
|
413
|
-
try:
|
|
414
|
-
logger.debug("Dropping subtables", drop_statements=drop_statements)
|
|
415
|
-
self.connection.run(statements=drop_statements)
|
|
416
|
-
except Exception as e:
|
|
417
|
-
logger.warning(
|
|
418
|
-
"Failed to delete model endpoint resources. You may need to delete them manually. "
|
|
419
|
-
"These can be found under the following supertables: app_results, "
|
|
420
|
-
"metrics, errors, and predictions.",
|
|
421
|
-
project=self.project,
|
|
422
|
-
error=mlrun.errors.err_to_str(e),
|
|
423
|
-
)
|
|
424
|
-
logger.debug(
|
|
425
|
-
"Deleted all model endpoint resources using the TDEngine connector",
|
|
426
|
-
project=self.project,
|
|
427
|
-
number_of_endpoints_to_delete=len(endpoint_ids),
|
|
428
|
-
)
|
|
429
|
-
|
|
430
|
-
def delete_application_records(
|
|
431
|
-
self, application_name: str, endpoint_ids: Optional[list[str]] = None
|
|
432
|
-
) -> None:
|
|
433
|
-
"""
|
|
434
|
-
Delete application records from the TSDB for the given model endpoints or all if ``endpoint_ids`` is ``None``.
|
|
435
|
-
"""
|
|
436
|
-
logger.debug(
|
|
437
|
-
"Deleting application records",
|
|
438
|
-
project=self.project,
|
|
439
|
-
application_name=application_name,
|
|
440
|
-
endpoint_ids=endpoint_ids,
|
|
441
|
-
)
|
|
442
|
-
tables = [
|
|
443
|
-
self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS],
|
|
444
|
-
self.tables[mm_schemas.TDEngineSuperTables.METRICS],
|
|
445
|
-
]
|
|
446
|
-
|
|
447
|
-
filter_query = self._generate_filter_query(
|
|
448
|
-
filter_column=mm_schemas.ApplicationEvent.APPLICATION_NAME,
|
|
449
|
-
filter_values=application_name,
|
|
450
|
-
)
|
|
451
|
-
if endpoint_ids:
|
|
452
|
-
endpoint_ids_filter = self._generate_filter_query(
|
|
453
|
-
filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
454
|
-
filter_values=endpoint_ids,
|
|
455
|
-
)
|
|
456
|
-
filter_query += f" AND {endpoint_ids_filter}"
|
|
457
|
-
|
|
458
|
-
drop_statements: list[str] = []
|
|
459
|
-
for table in tables:
|
|
460
|
-
get_subtable_query = table._get_tables_query_by_condition(filter_query)
|
|
461
|
-
subtables_result = self.connection.run(query=get_subtable_query)
|
|
462
|
-
drop_statements.extend(
|
|
463
|
-
[
|
|
464
|
-
table.drop_subtable_query(subtable=subtable[0])
|
|
465
|
-
for subtable in subtables_result.data
|
|
466
|
-
]
|
|
467
|
-
)
|
|
468
|
-
|
|
469
|
-
logger.debug("Dropping application records", drop_statements=drop_statements)
|
|
470
|
-
self.connection.run(statements=drop_statements)
|
|
471
|
-
|
|
472
|
-
def delete_tsdb_resources(self):
|
|
473
|
-
"""
|
|
474
|
-
Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
|
|
475
|
-
"""
|
|
476
|
-
logger.debug(
|
|
477
|
-
"Deleting all project resources using the TDEngine connector",
|
|
478
|
-
project=self.project,
|
|
479
|
-
)
|
|
480
|
-
drop_statements = []
|
|
481
|
-
for table in self.tables:
|
|
482
|
-
drop_statements.append(self.tables[table].drop_supertable_query())
|
|
483
|
-
|
|
484
|
-
try:
|
|
485
|
-
self.connection.run(
|
|
486
|
-
statements=drop_statements,
|
|
487
|
-
)
|
|
488
|
-
except Exception as e:
|
|
489
|
-
logger.warning(
|
|
490
|
-
"Failed to drop TDEngine tables. You may need to drop them manually. "
|
|
491
|
-
"These can be found under the following supertables: app_results, "
|
|
492
|
-
"metrics, errors, and predictions.",
|
|
493
|
-
project=self.project,
|
|
494
|
-
error=mlrun.errors.err_to_str(e),
|
|
495
|
-
)
|
|
496
|
-
logger.debug(
|
|
497
|
-
"Deleted all project resources using the TDEngine connector",
|
|
498
|
-
project=self.project,
|
|
499
|
-
)
|
|
500
|
-
|
|
501
|
-
# Check if database is empty and if so, drop it
|
|
502
|
-
self._drop_database_if_empty()
|
|
503
|
-
|
|
504
|
-
def _drop_database_if_empty(self):
|
|
505
|
-
query_random_table_name = self._get_table_name_query()
|
|
506
|
-
drop_database = False
|
|
507
|
-
try:
|
|
508
|
-
table_name = self.connection.run(
|
|
509
|
-
query=query_random_table_name,
|
|
510
|
-
)
|
|
511
|
-
if len(table_name.data) == 0:
|
|
512
|
-
# no tables were found under the database
|
|
513
|
-
drop_database = True
|
|
514
|
-
|
|
515
|
-
except Exception as e:
|
|
516
|
-
logger.warning(
|
|
517
|
-
"Failed to query tables in the database. You may need to drop the database manually if it is empty.",
|
|
518
|
-
project=self.project,
|
|
519
|
-
error=mlrun.errors.err_to_str(e),
|
|
520
|
-
)
|
|
521
|
-
|
|
522
|
-
if drop_database:
|
|
523
|
-
logger.debug(
|
|
524
|
-
"Going to drop the TDEngine database",
|
|
525
|
-
project=self.project,
|
|
526
|
-
database=self.database,
|
|
527
|
-
)
|
|
528
|
-
drop_database_query = self._drop_database_query()
|
|
529
|
-
try:
|
|
530
|
-
self.connection.run(
|
|
531
|
-
statements=drop_database_query,
|
|
532
|
-
)
|
|
533
|
-
logger.debug(
|
|
534
|
-
"The TDEngine database has been successfully dropped",
|
|
535
|
-
project=self.project,
|
|
536
|
-
database=self.database,
|
|
537
|
-
)
|
|
538
|
-
|
|
539
|
-
except Exception as e:
|
|
540
|
-
logger.warning(
|
|
541
|
-
"Failed to drop the database. You may need to drop it manually if it is empty.",
|
|
542
|
-
project=self.project,
|
|
543
|
-
error=mlrun.errors.err_to_str(e),
|
|
544
|
-
)
|
|
545
|
-
|
|
546
|
-
def get_model_endpoint_real_time_metrics(
|
|
547
|
-
self,
|
|
548
|
-
endpoint_id: str,
|
|
549
|
-
metrics: list[str],
|
|
550
|
-
start: str,
|
|
551
|
-
end: str,
|
|
552
|
-
) -> dict[str, list[tuple[str, float]]]:
|
|
553
|
-
# Not implemented, use get_records() instead
|
|
554
|
-
pass
|
|
555
|
-
|
|
556
|
-
def _get_records(
|
|
557
|
-
self,
|
|
558
|
-
table: str,
|
|
559
|
-
start: datetime,
|
|
560
|
-
end: datetime,
|
|
561
|
-
columns: Optional[list[str]] = None,
|
|
562
|
-
filter_query: Optional[str] = None,
|
|
563
|
-
interval: Optional[str] = None,
|
|
564
|
-
agg_funcs: Optional[list] = None,
|
|
565
|
-
limit: Optional[int] = None,
|
|
566
|
-
sliding_window_step: Optional[str] = None,
|
|
567
|
-
timestamp_column: str = mm_schemas.EventFieldType.TIME,
|
|
568
|
-
group_by: Optional[Union[list[str], str]] = None,
|
|
569
|
-
preform_agg_columns: Optional[list] = None,
|
|
570
|
-
order_by: Optional[str] = None,
|
|
571
|
-
desc: Optional[bool] = None,
|
|
572
|
-
partition_by: Optional[str] = None,
|
|
573
|
-
) -> pd.DataFrame:
|
|
574
|
-
"""
|
|
575
|
-
Getting records from TSDB data collection.
|
|
576
|
-
:param table: Either a supertable or a subtable name.
|
|
577
|
-
:param start: The start time of the metrics.
|
|
578
|
-
:param end: The end time of the metrics.
|
|
579
|
-
:param columns: Columns to include in the result.
|
|
580
|
-
:param filter_query: Optional filter expression as a string. TDengine supports SQL-like syntax.
|
|
581
|
-
:param interval: The interval to aggregate the data by. Note that if interval is provided,
|
|
582
|
-
`agg_funcs` must bg provided as well. Provided as a string in the format of '1m',
|
|
583
|
-
'1h', etc.
|
|
584
|
-
:param agg_funcs: The aggregation functions to apply on the columns. Note that if `agg_funcs` is
|
|
585
|
-
provided, `interval` must bg provided as well. Provided as a list of strings in
|
|
586
|
-
the format of ['sum', 'avg', 'count', ...].
|
|
587
|
-
:param limit: The maximum number of records to return.
|
|
588
|
-
:param sliding_window_step: The time step for which the time window moves forward. Note that if
|
|
589
|
-
`sliding_window_step` is provided, interval must be provided as well. Provided
|
|
590
|
-
as a string in the format of '1m', '1h', etc.
|
|
591
|
-
:param timestamp_column: The column name that holds the timestamp index.
|
|
592
|
-
:param group_by: The column name to group by. Note that if `group_by` is provided, aggregation
|
|
593
|
-
functions must bg provided
|
|
594
|
-
:param preform_agg_columns: The columns to preform aggregation on.
|
|
595
|
-
notice that all aggregation functions provided will preform on those columns.
|
|
596
|
-
If not provided The default behavior is to preform on all columns in columns,
|
|
597
|
-
if an empty list was provided The aggregation won't be performed.
|
|
598
|
-
:param order_by: The column or alias to preform ordering on the query.
|
|
599
|
-
:param desc: Whether or not to sort the results in descending order.
|
|
600
|
-
:param partition_by: The column to partition the results by. Note that if interval is provided,
|
|
601
|
-
`agg_funcs` must bg provided as well.
|
|
602
|
-
|
|
603
|
-
:return: DataFrame with the provided attributes from the data collection.
|
|
604
|
-
:raise: MLRunInvalidArgumentError if query the provided table failed.
|
|
605
|
-
"""
|
|
606
|
-
|
|
607
|
-
full_query = tdengine_schemas.TDEngineSchema._get_records_query(
|
|
608
|
-
table=table,
|
|
609
|
-
start=start,
|
|
610
|
-
end=end,
|
|
611
|
-
columns_to_filter=columns,
|
|
612
|
-
filter_query=filter_query,
|
|
613
|
-
interval=interval,
|
|
614
|
-
limit=limit,
|
|
615
|
-
agg_funcs=agg_funcs,
|
|
616
|
-
sliding_window_step=sliding_window_step,
|
|
617
|
-
timestamp_column=timestamp_column,
|
|
618
|
-
database=self.database,
|
|
619
|
-
group_by=group_by,
|
|
620
|
-
preform_agg_funcs_columns=preform_agg_columns,
|
|
621
|
-
order_by=order_by,
|
|
622
|
-
desc=desc,
|
|
623
|
-
partition_by=partition_by,
|
|
624
|
-
)
|
|
625
|
-
logger.debug("Querying TDEngine", query=full_query)
|
|
626
|
-
try:
|
|
627
|
-
query_result = self.connection.run(
|
|
628
|
-
query=full_query,
|
|
629
|
-
)
|
|
630
|
-
except taosws.QueryError as e:
|
|
631
|
-
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
632
|
-
f"Failed to query table {table} in database {self.database}, {str(e)}"
|
|
633
|
-
)
|
|
634
|
-
|
|
635
|
-
df_columns = [field.name for field in query_result.fields]
|
|
636
|
-
return pd.DataFrame(query_result.data, columns=df_columns)
|
|
637
|
-
|
|
638
|
-
def read_metrics_data(
|
|
639
|
-
self,
|
|
640
|
-
*,
|
|
641
|
-
endpoint_id: str,
|
|
642
|
-
start: datetime,
|
|
643
|
-
end: datetime,
|
|
644
|
-
metrics: list[mm_schemas.ModelEndpointMonitoringMetric],
|
|
645
|
-
type: Literal["metrics", "results"],
|
|
646
|
-
with_result_extra_data: bool = False,
|
|
647
|
-
) -> Union[
|
|
648
|
-
list[
|
|
649
|
-
Union[
|
|
650
|
-
mm_schemas.ModelEndpointMonitoringResultValues,
|
|
651
|
-
mm_schemas.ModelEndpointMonitoringMetricNoData,
|
|
652
|
-
],
|
|
653
|
-
],
|
|
654
|
-
list[
|
|
655
|
-
Union[
|
|
656
|
-
mm_schemas.ModelEndpointMonitoringMetricValues,
|
|
657
|
-
mm_schemas.ModelEndpointMonitoringMetricNoData,
|
|
658
|
-
],
|
|
659
|
-
],
|
|
660
|
-
]:
|
|
661
|
-
timestamp_column = mm_schemas.WriterEvent.END_INFER_TIME
|
|
662
|
-
columns = [timestamp_column, mm_schemas.WriterEvent.APPLICATION_NAME]
|
|
663
|
-
if type == "metrics":
|
|
664
|
-
if with_result_extra_data:
|
|
665
|
-
logger.warning(
|
|
666
|
-
"The 'with_result_extra_data' parameter is not supported for metrics, just for results",
|
|
667
|
-
project=self.project,
|
|
668
|
-
endpoint_id=endpoint_id,
|
|
669
|
-
)
|
|
670
|
-
table = self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table
|
|
671
|
-
name = mm_schemas.MetricData.METRIC_NAME
|
|
672
|
-
columns += [name, mm_schemas.MetricData.METRIC_VALUE]
|
|
673
|
-
df_handler = self.df_to_metrics_values
|
|
674
|
-
elif type == "results":
|
|
675
|
-
table = self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table
|
|
676
|
-
name = mm_schemas.ResultData.RESULT_NAME
|
|
677
|
-
columns += [
|
|
678
|
-
name,
|
|
679
|
-
mm_schemas.ResultData.RESULT_VALUE,
|
|
680
|
-
mm_schemas.ResultData.RESULT_STATUS,
|
|
681
|
-
mm_schemas.ResultData.RESULT_KIND,
|
|
682
|
-
]
|
|
683
|
-
if with_result_extra_data:
|
|
684
|
-
columns.append(mm_schemas.ResultData.RESULT_EXTRA_DATA)
|
|
685
|
-
df_handler = self.df_to_results_values
|
|
686
|
-
else:
|
|
687
|
-
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
688
|
-
f"Invalid type {type}, must be either 'metrics' or 'results'."
|
|
689
|
-
)
|
|
690
|
-
|
|
691
|
-
metrics_condition = " OR ".join(
|
|
692
|
-
[
|
|
693
|
-
f"({mm_schemas.WriterEvent.APPLICATION_NAME}='{metric.app}' AND {name}='{metric.name}')"
|
|
694
|
-
for metric in metrics
|
|
695
|
-
]
|
|
696
|
-
)
|
|
697
|
-
filter_query = f"(endpoint_id='{endpoint_id}') AND ({metrics_condition})"
|
|
698
|
-
|
|
699
|
-
df = self._get_records(
|
|
700
|
-
table=table,
|
|
701
|
-
start=start,
|
|
702
|
-
end=end,
|
|
703
|
-
filter_query=filter_query,
|
|
704
|
-
timestamp_column=timestamp_column,
|
|
705
|
-
columns=columns,
|
|
706
|
-
)
|
|
707
|
-
|
|
708
|
-
df[mm_schemas.WriterEvent.END_INFER_TIME] = pd.to_datetime(
|
|
709
|
-
df[mm_schemas.WriterEvent.END_INFER_TIME]
|
|
710
|
-
)
|
|
711
|
-
df.set_index(mm_schemas.WriterEvent.END_INFER_TIME, inplace=True)
|
|
712
|
-
|
|
713
|
-
logger.debug(
|
|
714
|
-
"Converting a DataFrame to a list of metrics or results values",
|
|
715
|
-
table=table,
|
|
716
|
-
project=self.project,
|
|
717
|
-
endpoint_id=endpoint_id,
|
|
718
|
-
is_empty=df.empty,
|
|
719
|
-
)
|
|
720
|
-
|
|
721
|
-
if not with_result_extra_data and type == "results":
|
|
722
|
-
# Set the extra data to an empty string if it's not requested
|
|
723
|
-
df[mm_schemas.ResultData.RESULT_EXTRA_DATA] = ""
|
|
724
|
-
|
|
725
|
-
return df_handler(df=df, metrics=metrics, project=self.project)
|
|
726
|
-
|
|
727
|
-
def read_predictions(
|
|
728
|
-
self,
|
|
729
|
-
*,
|
|
730
|
-
endpoint_id: str,
|
|
731
|
-
start: datetime,
|
|
732
|
-
end: datetime,
|
|
733
|
-
aggregation_window: Optional[str] = None,
|
|
734
|
-
agg_funcs: Optional[list] = None,
|
|
735
|
-
limit: Optional[int] = None,
|
|
736
|
-
) -> Union[
|
|
737
|
-
mm_schemas.ModelEndpointMonitoringMetricValues,
|
|
738
|
-
mm_schemas.ModelEndpointMonitoringMetricNoData,
|
|
739
|
-
]:
|
|
740
|
-
if (agg_funcs and not aggregation_window) or (
|
|
741
|
-
aggregation_window and not agg_funcs
|
|
742
|
-
):
|
|
743
|
-
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
744
|
-
"both or neither of `aggregation_window` and `agg_funcs` must be provided"
|
|
745
|
-
)
|
|
746
|
-
df = self._get_records(
|
|
747
|
-
table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
|
|
748
|
-
start=start,
|
|
749
|
-
end=end,
|
|
750
|
-
columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
|
|
751
|
-
filter_query=f"endpoint_id='{endpoint_id}'",
|
|
752
|
-
agg_funcs=agg_funcs,
|
|
753
|
-
interval=aggregation_window,
|
|
754
|
-
limit=limit,
|
|
755
|
-
)
|
|
756
|
-
|
|
757
|
-
full_name = get_invocations_fqn(self.project)
|
|
758
|
-
|
|
759
|
-
if df.empty:
|
|
760
|
-
return mm_schemas.ModelEndpointMonitoringMetricNoData(
|
|
761
|
-
full_name=full_name,
|
|
762
|
-
type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
|
|
763
|
-
)
|
|
764
|
-
|
|
765
|
-
if aggregation_window:
|
|
766
|
-
# _wend column, which represents the end time of each window, will be used as the time index
|
|
767
|
-
df["_wend"] = pd.to_datetime(df["_wend"])
|
|
768
|
-
df.set_index("_wend", inplace=True)
|
|
769
|
-
|
|
770
|
-
estimated_prediction_count = (
|
|
771
|
-
f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
|
|
772
|
-
if agg_funcs
|
|
773
|
-
else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
|
|
774
|
-
)
|
|
775
|
-
|
|
776
|
-
return mm_schemas.ModelEndpointMonitoringMetricValues(
|
|
777
|
-
full_name=full_name,
|
|
778
|
-
values=list(
|
|
779
|
-
zip(
|
|
780
|
-
df.index,
|
|
781
|
-
df[estimated_prediction_count],
|
|
782
|
-
)
|
|
783
|
-
), # pyright: ignore[reportArgumentType]
|
|
784
|
-
)
|
|
785
|
-
|
|
786
|
-
def get_last_request(
|
|
787
|
-
self,
|
|
788
|
-
endpoint_ids: Union[str, list[str]],
|
|
789
|
-
start: Optional[datetime] = None,
|
|
790
|
-
end: Optional[datetime] = None,
|
|
791
|
-
) -> Union[pd.DataFrame, dict[str, float]]:
|
|
792
|
-
if not endpoint_ids:
|
|
793
|
-
return {}
|
|
794
|
-
filter_query = self._generate_filter_query(
|
|
795
|
-
filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
796
|
-
filter_values=endpoint_ids,
|
|
797
|
-
)
|
|
798
|
-
start, end = get_start_end(start, end)
|
|
799
|
-
df = self._get_records(
|
|
800
|
-
table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
|
|
801
|
-
start=start,
|
|
802
|
-
end=end,
|
|
803
|
-
columns=[
|
|
804
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
805
|
-
mm_schemas.EventFieldType.TIME,
|
|
806
|
-
mm_schemas.EventFieldType.LATENCY,
|
|
807
|
-
],
|
|
808
|
-
filter_query=filter_query,
|
|
809
|
-
timestamp_column=mm_schemas.EventFieldType.TIME,
|
|
810
|
-
agg_funcs=["last"],
|
|
811
|
-
group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
812
|
-
preform_agg_columns=[mm_schemas.EventFieldType.TIME],
|
|
813
|
-
)
|
|
814
|
-
if not df.empty:
|
|
815
|
-
df.dropna(inplace=True)
|
|
816
|
-
df.rename(
|
|
817
|
-
columns={
|
|
818
|
-
f"last({mm_schemas.EventFieldType.TIME})": mm_schemas.EventFieldType.LAST_REQUEST,
|
|
819
|
-
f"{mm_schemas.EventFieldType.LATENCY}": "last_latency",
|
|
820
|
-
},
|
|
821
|
-
inplace=True,
|
|
822
|
-
)
|
|
823
|
-
df[mm_schemas.EventFieldType.LAST_REQUEST] = pd.to_datetime(
|
|
824
|
-
df[mm_schemas.EventFieldType.LAST_REQUEST],
|
|
825
|
-
errors="coerce",
|
|
826
|
-
format="ISO8601",
|
|
827
|
-
utc=True,
|
|
828
|
-
)
|
|
829
|
-
return df
|
|
830
|
-
|
|
831
|
-
def get_drift_status(
|
|
832
|
-
self,
|
|
833
|
-
endpoint_ids: Union[str, list[str]],
|
|
834
|
-
start: Optional[datetime] = None,
|
|
835
|
-
end: Optional[datetime] = None,
|
|
836
|
-
get_raw: bool = False,
|
|
837
|
-
) -> pd.DataFrame:
|
|
838
|
-
filter_query = self._generate_filter_query(
|
|
839
|
-
filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
840
|
-
filter_values=endpoint_ids,
|
|
841
|
-
)
|
|
842
|
-
start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
|
|
843
|
-
start, end = get_start_end(start, end)
|
|
844
|
-
df = self._get_records(
|
|
845
|
-
table=self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table,
|
|
846
|
-
start=start,
|
|
847
|
-
end=end,
|
|
848
|
-
columns=[
|
|
849
|
-
mm_schemas.ResultData.RESULT_STATUS,
|
|
850
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
851
|
-
],
|
|
852
|
-
filter_query=filter_query,
|
|
853
|
-
timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
854
|
-
agg_funcs=["max"],
|
|
855
|
-
group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
856
|
-
preform_agg_columns=[mm_schemas.ResultData.RESULT_STATUS],
|
|
857
|
-
)
|
|
858
|
-
df.rename(
|
|
859
|
-
columns={
|
|
860
|
-
f"max({mm_schemas.ResultData.RESULT_STATUS})": mm_schemas.ResultData.RESULT_STATUS
|
|
861
|
-
},
|
|
862
|
-
inplace=True,
|
|
863
|
-
)
|
|
864
|
-
if not df.empty:
|
|
865
|
-
df.dropna(inplace=True)
|
|
866
|
-
return df
|
|
867
|
-
|
|
868
|
-
def count_results_by_status(
|
|
869
|
-
self,
|
|
870
|
-
start: Optional[Union[datetime, str]] = None,
|
|
871
|
-
end: Optional[Union[datetime, str]] = None,
|
|
872
|
-
endpoint_ids: Optional[Union[str, list[str]]] = None,
|
|
873
|
-
application_names: Optional[Union[str, list[str]]] = None,
|
|
874
|
-
result_status_list: Optional[list[int]] = None,
|
|
875
|
-
) -> dict[tuple[str, int], int]:
|
|
876
|
-
filter_query = ""
|
|
877
|
-
|
|
878
|
-
start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
|
|
879
|
-
|
|
880
|
-
if endpoint_ids:
|
|
881
|
-
filter_query = self._generate_filter_query(
|
|
882
|
-
filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
883
|
-
filter_values=endpoint_ids,
|
|
884
|
-
)
|
|
885
|
-
if application_names:
|
|
886
|
-
app_filter_query = self._generate_filter_query(
|
|
887
|
-
filter_column=mm_schemas.ApplicationEvent.APPLICATION_NAME,
|
|
888
|
-
filter_values=application_names,
|
|
889
|
-
)
|
|
890
|
-
if filter_query:
|
|
891
|
-
filter_query += f" AND {app_filter_query}"
|
|
892
|
-
else:
|
|
893
|
-
filter_query = app_filter_query
|
|
894
|
-
if result_status_list:
|
|
895
|
-
status_filter_query = self._generate_filter_query(
|
|
896
|
-
filter_column=mm_schemas.ResultData.RESULT_STATUS,
|
|
897
|
-
filter_values=result_status_list,
|
|
898
|
-
)
|
|
899
|
-
if filter_query:
|
|
900
|
-
filter_query += f" AND {status_filter_query}"
|
|
901
|
-
else:
|
|
902
|
-
filter_query = status_filter_query
|
|
903
|
-
|
|
904
|
-
df = self._get_records(
|
|
905
|
-
table=self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table,
|
|
906
|
-
start=start,
|
|
907
|
-
end=end,
|
|
908
|
-
columns=[
|
|
909
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
910
|
-
mm_schemas.ResultData.RESULT_STATUS,
|
|
911
|
-
mm_schemas.ResultData.RESULT_VALUE,
|
|
912
|
-
],
|
|
913
|
-
filter_query=filter_query,
|
|
914
|
-
timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
915
|
-
group_by=[
|
|
916
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
917
|
-
mm_schemas.ResultData.RESULT_STATUS,
|
|
918
|
-
],
|
|
919
|
-
agg_funcs=["count"],
|
|
920
|
-
preform_agg_columns=[mm_schemas.ResultData.RESULT_VALUE],
|
|
921
|
-
)
|
|
922
|
-
if df.empty:
|
|
923
|
-
return {}
|
|
924
|
-
|
|
925
|
-
# Convert DataFrame to a dictionary
|
|
926
|
-
return {
|
|
927
|
-
(
|
|
928
|
-
row[mm_schemas.WriterEvent.APPLICATION_NAME],
|
|
929
|
-
row[mm_schemas.ResultData.RESULT_STATUS],
|
|
930
|
-
): row["count(result_value)"]
|
|
931
|
-
for _, row in df.iterrows()
|
|
932
|
-
}
|
|
933
|
-
|
|
934
|
-
def count_processed_model_endpoints(
|
|
935
|
-
self,
|
|
936
|
-
start: Optional[Union[datetime, str]] = None,
|
|
937
|
-
end: Optional[Union[datetime, str]] = None,
|
|
938
|
-
application_names: Optional[Union[str, list[str]]] = None,
|
|
939
|
-
) -> dict:
|
|
940
|
-
filter_query = ""
|
|
941
|
-
start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
|
|
942
|
-
|
|
943
|
-
if application_names:
|
|
944
|
-
filter_query = self._generate_filter_query(
|
|
945
|
-
filter_column=mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
946
|
-
filter_values=application_names,
|
|
947
|
-
)
|
|
948
|
-
|
|
949
|
-
def get_application_endpoints_records(super_table: str) -> pd.DataFrame:
|
|
950
|
-
return self._get_records(
|
|
951
|
-
table=super_table,
|
|
952
|
-
start=start,
|
|
953
|
-
end=end,
|
|
954
|
-
timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
955
|
-
columns=[
|
|
956
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
957
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
958
|
-
],
|
|
959
|
-
filter_query=filter_query,
|
|
960
|
-
group_by=[
|
|
961
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
962
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
963
|
-
],
|
|
964
|
-
preform_agg_columns=[mm_schemas.ResultData.RESULT_VALUE],
|
|
965
|
-
agg_funcs=["last"],
|
|
966
|
-
)
|
|
967
|
-
|
|
968
|
-
df_results = get_application_endpoints_records(
|
|
969
|
-
super_table=self.tables[
|
|
970
|
-
mm_schemas.TDEngineSuperTables.APP_RESULTS
|
|
971
|
-
].super_table
|
|
972
|
-
)
|
|
973
|
-
df_metrics = get_application_endpoints_records(
|
|
974
|
-
super_table=self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table
|
|
975
|
-
)
|
|
976
|
-
|
|
977
|
-
combined_df = pd.concat([df_results, df_metrics]).drop_duplicates()
|
|
978
|
-
|
|
979
|
-
if combined_df.empty:
|
|
980
|
-
return {}
|
|
981
|
-
grouped_df = combined_df.groupby(
|
|
982
|
-
mm_schemas.WriterEvent.APPLICATION_NAME
|
|
983
|
-
).count()
|
|
984
|
-
|
|
985
|
-
# Convert DataFrame to a dictionary
|
|
986
|
-
return grouped_df[mm_schemas.WriterEvent.ENDPOINT_ID].to_dict()
|
|
987
|
-
|
|
988
|
-
def calculate_latest_metrics(
|
|
989
|
-
self,
|
|
990
|
-
start: Optional[Union[datetime, str]] = None,
|
|
991
|
-
end: Optional[Union[datetime, str]] = None,
|
|
992
|
-
application_names: Optional[Union[str, list[str]]] = None,
|
|
993
|
-
) -> list[
|
|
994
|
-
Union[mm_schemas.ApplicationResultRecord, mm_schemas.ApplicationMetricRecord]
|
|
995
|
-
]:
|
|
996
|
-
metric_list = []
|
|
997
|
-
filter_query = ""
|
|
998
|
-
start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
|
|
999
|
-
|
|
1000
|
-
if application_names:
|
|
1001
|
-
filter_query = self._generate_filter_query(
|
|
1002
|
-
filter_column=mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
1003
|
-
filter_values=application_names,
|
|
1004
|
-
)
|
|
1005
|
-
|
|
1006
|
-
def get_latest_metrics_records(
|
|
1007
|
-
record_type: Literal["metrics", "results"],
|
|
1008
|
-
) -> pd.DataFrame:
|
|
1009
|
-
columns = [
|
|
1010
|
-
mm_schemas.WriterEvent.END_INFER_TIME,
|
|
1011
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
1012
|
-
]
|
|
1013
|
-
if record_type == "results":
|
|
1014
|
-
table = self.tables[
|
|
1015
|
-
mm_schemas.TDEngineSuperTables.APP_RESULTS
|
|
1016
|
-
].super_table
|
|
1017
|
-
columns += [
|
|
1018
|
-
mm_schemas.ResultData.RESULT_NAME,
|
|
1019
|
-
mm_schemas.ResultData.RESULT_VALUE,
|
|
1020
|
-
mm_schemas.ResultData.RESULT_STATUS,
|
|
1021
|
-
mm_schemas.ResultData.RESULT_KIND,
|
|
1022
|
-
]
|
|
1023
|
-
agg_column = mm_schemas.ResultData.RESULT_VALUE
|
|
1024
|
-
else:
|
|
1025
|
-
table = self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table
|
|
1026
|
-
columns += [
|
|
1027
|
-
mm_schemas.MetricData.METRIC_NAME,
|
|
1028
|
-
mm_schemas.MetricData.METRIC_VALUE,
|
|
1029
|
-
]
|
|
1030
|
-
agg_column = mm_schemas.MetricData.METRIC_VALUE
|
|
1031
|
-
|
|
1032
|
-
return self._get_records(
|
|
1033
|
-
table=table,
|
|
1034
|
-
start=start,
|
|
1035
|
-
end=end,
|
|
1036
|
-
columns=columns,
|
|
1037
|
-
filter_query=filter_query,
|
|
1038
|
-
timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
1039
|
-
# Aggregate per application/metric pair regardless of timestamp
|
|
1040
|
-
group_by=columns[1:],
|
|
1041
|
-
preform_agg_columns=[agg_column],
|
|
1042
|
-
agg_funcs=["last"],
|
|
1043
|
-
)
|
|
1044
|
-
|
|
1045
|
-
df_results = get_latest_metrics_records(record_type="results")
|
|
1046
|
-
df_metrics = get_latest_metrics_records(record_type="metrics")
|
|
1047
|
-
|
|
1048
|
-
if df_results.empty and df_metrics.empty:
|
|
1049
|
-
return metric_list
|
|
1050
|
-
|
|
1051
|
-
def build_metric_objects() -> (
|
|
1052
|
-
list[
|
|
1053
|
-
Union[
|
|
1054
|
-
mm_schemas.ApplicationResultRecord,
|
|
1055
|
-
mm_schemas.ApplicationMetricRecord,
|
|
1056
|
-
]
|
|
1057
|
-
]
|
|
1058
|
-
):
|
|
1059
|
-
metric_objects = []
|
|
1060
|
-
|
|
1061
|
-
if not df_results.empty:
|
|
1062
|
-
df_results.rename(
|
|
1063
|
-
columns={
|
|
1064
|
-
f"last({mm_schemas.ResultData.RESULT_VALUE})": mm_schemas.ResultData.RESULT_VALUE,
|
|
1065
|
-
},
|
|
1066
|
-
inplace=True,
|
|
1067
|
-
)
|
|
1068
|
-
for _, row in df_results.iterrows():
|
|
1069
|
-
metric_objects.append(
|
|
1070
|
-
mm_schemas.ApplicationResultRecord(
|
|
1071
|
-
time=datetime.fromisoformat(
|
|
1072
|
-
row[mm_schemas.WriterEvent.END_INFER_TIME]
|
|
1073
|
-
),
|
|
1074
|
-
result_name=row[mm_schemas.ResultData.RESULT_NAME],
|
|
1075
|
-
kind=row[mm_schemas.ResultData.RESULT_KIND],
|
|
1076
|
-
status=row[mm_schemas.ResultData.RESULT_STATUS],
|
|
1077
|
-
value=row[mm_schemas.ResultData.RESULT_VALUE],
|
|
1078
|
-
)
|
|
1079
|
-
)
|
|
1080
|
-
|
|
1081
|
-
if not df_metrics.empty:
|
|
1082
|
-
df_metrics.rename(
|
|
1083
|
-
columns={
|
|
1084
|
-
f"last({mm_schemas.MetricData.METRIC_VALUE})": mm_schemas.MetricData.METRIC_VALUE,
|
|
1085
|
-
},
|
|
1086
|
-
inplace=True,
|
|
1087
|
-
)
|
|
1088
|
-
for _, row in df_metrics.iterrows():
|
|
1089
|
-
metric_objects.append(
|
|
1090
|
-
mm_schemas.ApplicationMetricRecord(
|
|
1091
|
-
time=datetime.fromisoformat(
|
|
1092
|
-
row[mm_schemas.WriterEvent.END_INFER_TIME]
|
|
1093
|
-
),
|
|
1094
|
-
metric_name=row[mm_schemas.MetricData.METRIC_NAME],
|
|
1095
|
-
value=row[mm_schemas.MetricData.METRIC_VALUE],
|
|
1096
|
-
)
|
|
1097
|
-
)
|
|
1098
|
-
|
|
1099
|
-
return metric_objects
|
|
1100
|
-
|
|
1101
|
-
return build_metric_objects()
|
|
1102
|
-
|
|
1103
|
-
def get_metrics_metadata(
|
|
1104
|
-
self,
|
|
1105
|
-
endpoint_id: Union[str, list[str]],
|
|
1106
|
-
start: Optional[datetime] = None,
|
|
1107
|
-
end: Optional[datetime] = None,
|
|
1108
|
-
) -> pd.DataFrame:
|
|
1109
|
-
start, end = get_start_end(start, end)
|
|
1110
|
-
df = self._get_records(
|
|
1111
|
-
table=self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table,
|
|
1112
|
-
start=start,
|
|
1113
|
-
end=end,
|
|
1114
|
-
columns=[
|
|
1115
|
-
mm_schemas.ApplicationEvent.APPLICATION_NAME,
|
|
1116
|
-
mm_schemas.MetricData.METRIC_NAME,
|
|
1117
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1118
|
-
],
|
|
1119
|
-
filter_query=self._generate_filter_query(
|
|
1120
|
-
filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1121
|
-
filter_values=endpoint_id,
|
|
1122
|
-
),
|
|
1123
|
-
timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
1124
|
-
group_by=[
|
|
1125
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
1126
|
-
mm_schemas.MetricData.METRIC_NAME,
|
|
1127
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1128
|
-
],
|
|
1129
|
-
agg_funcs=["last"],
|
|
1130
|
-
)
|
|
1131
|
-
df.rename(
|
|
1132
|
-
columns={
|
|
1133
|
-
f"last({mm_schemas.ApplicationEvent.APPLICATION_NAME})": mm_schemas.ApplicationEvent.APPLICATION_NAME,
|
|
1134
|
-
f"last({mm_schemas.MetricData.METRIC_NAME})": mm_schemas.MetricData.METRIC_NAME,
|
|
1135
|
-
f"last({mm_schemas.EventFieldType.ENDPOINT_ID})": mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1136
|
-
},
|
|
1137
|
-
inplace=True,
|
|
1138
|
-
)
|
|
1139
|
-
if not df.empty:
|
|
1140
|
-
df.dropna(inplace=True)
|
|
1141
|
-
return df
|
|
1142
|
-
|
|
1143
|
-
def get_results_metadata(
|
|
1144
|
-
self,
|
|
1145
|
-
endpoint_id: Union[str, list[str]],
|
|
1146
|
-
start: Optional[datetime] = None,
|
|
1147
|
-
end: Optional[datetime] = None,
|
|
1148
|
-
) -> pd.DataFrame:
|
|
1149
|
-
start, end = get_start_end(start, end)
|
|
1150
|
-
df = self._get_records(
|
|
1151
|
-
table=self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table,
|
|
1152
|
-
start=start,
|
|
1153
|
-
end=end,
|
|
1154
|
-
columns=[
|
|
1155
|
-
mm_schemas.ApplicationEvent.APPLICATION_NAME,
|
|
1156
|
-
mm_schemas.ResultData.RESULT_NAME,
|
|
1157
|
-
mm_schemas.ResultData.RESULT_KIND,
|
|
1158
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1159
|
-
],
|
|
1160
|
-
filter_query=self._generate_filter_query(
|
|
1161
|
-
filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1162
|
-
filter_values=endpoint_id,
|
|
1163
|
-
),
|
|
1164
|
-
timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
1165
|
-
group_by=[
|
|
1166
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
1167
|
-
mm_schemas.ResultData.RESULT_NAME,
|
|
1168
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1169
|
-
],
|
|
1170
|
-
agg_funcs=["last"],
|
|
1171
|
-
)
|
|
1172
|
-
df.rename(
|
|
1173
|
-
columns={
|
|
1174
|
-
f"last({mm_schemas.ApplicationEvent.APPLICATION_NAME})": mm_schemas.ApplicationEvent.APPLICATION_NAME,
|
|
1175
|
-
f"last({mm_schemas.ResultData.RESULT_NAME})": mm_schemas.ResultData.RESULT_NAME,
|
|
1176
|
-
f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND,
|
|
1177
|
-
f"last({mm_schemas.EventFieldType.ENDPOINT_ID})": mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1178
|
-
},
|
|
1179
|
-
inplace=True,
|
|
1180
|
-
)
|
|
1181
|
-
if not df.empty:
|
|
1182
|
-
df.dropna(inplace=True)
|
|
1183
|
-
return df
|
|
1184
|
-
|
|
1185
|
-
def get_error_count(
|
|
1186
|
-
self,
|
|
1187
|
-
endpoint_ids: Union[str, list[str]],
|
|
1188
|
-
start: Optional[datetime] = None,
|
|
1189
|
-
end: Optional[datetime] = None,
|
|
1190
|
-
get_raw: bool = False,
|
|
1191
|
-
) -> pd.DataFrame:
|
|
1192
|
-
filter_query = self._generate_filter_query(
|
|
1193
|
-
filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1194
|
-
filter_values=endpoint_ids,
|
|
1195
|
-
)
|
|
1196
|
-
filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'"
|
|
1197
|
-
start, end = get_start_end(start, end)
|
|
1198
|
-
df = self._get_records(
|
|
1199
|
-
table=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
|
|
1200
|
-
start=start,
|
|
1201
|
-
end=end,
|
|
1202
|
-
columns=[
|
|
1203
|
-
mm_schemas.EventFieldType.MODEL_ERROR,
|
|
1204
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1205
|
-
],
|
|
1206
|
-
agg_funcs=["count"],
|
|
1207
|
-
filter_query=filter_query,
|
|
1208
|
-
group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1209
|
-
preform_agg_columns=[mm_schemas.EventFieldType.MODEL_ERROR],
|
|
1210
|
-
)
|
|
1211
|
-
df.rename(
|
|
1212
|
-
columns={f"count({mm_schemas.EventFieldType.MODEL_ERROR})": "error_count"},
|
|
1213
|
-
inplace=True,
|
|
1214
|
-
)
|
|
1215
|
-
if not df.empty:
|
|
1216
|
-
df.dropna(inplace=True)
|
|
1217
|
-
return df
|
|
1218
|
-
|
|
1219
|
-
def get_avg_latency(
|
|
1220
|
-
self,
|
|
1221
|
-
endpoint_ids: Union[str, list[str]],
|
|
1222
|
-
start: Optional[datetime] = None,
|
|
1223
|
-
end: Optional[datetime] = None,
|
|
1224
|
-
get_raw: bool = False,
|
|
1225
|
-
) -> pd.DataFrame:
|
|
1226
|
-
endpoint_ids = (
|
|
1227
|
-
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
1228
|
-
)
|
|
1229
|
-
start, end = get_start_end(start, end, delta=timedelta(hours=24))
|
|
1230
|
-
df = self._get_records(
|
|
1231
|
-
table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
|
|
1232
|
-
start=start,
|
|
1233
|
-
end=end,
|
|
1234
|
-
columns=[
|
|
1235
|
-
mm_schemas.EventFieldType.LATENCY,
|
|
1236
|
-
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1237
|
-
],
|
|
1238
|
-
agg_funcs=["avg"],
|
|
1239
|
-
filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
|
|
1240
|
-
group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
1241
|
-
preform_agg_columns=[mm_schemas.EventFieldType.LATENCY],
|
|
1242
|
-
)
|
|
1243
|
-
df.rename(
|
|
1244
|
-
columns={f"avg({mm_schemas.EventFieldType.LATENCY})": "avg_latency"},
|
|
1245
|
-
inplace=True,
|
|
1246
|
-
)
|
|
1247
|
-
if not df.empty:
|
|
1248
|
-
df.dropna(inplace=True)
|
|
1249
|
-
return df
|
|
1250
|
-
|
|
1251
|
-
async def add_basic_metrics(
|
|
1252
|
-
self,
|
|
1253
|
-
model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
|
|
1254
|
-
project: str,
|
|
1255
|
-
run_in_threadpool: Callable,
|
|
1256
|
-
metric_list: Optional[list[str]] = None,
|
|
1257
|
-
) -> list[mlrun.common.schemas.ModelEndpoint]:
|
|
1258
|
-
"""
|
|
1259
|
-
Add basic metrics to the model endpoint object.
|
|
1260
|
-
|
|
1261
|
-
:param model_endpoint_objects: A list of `ModelEndpoint` objects that will
|
|
1262
|
-
be filled with the relevant basic metrics.
|
|
1263
|
-
:param project: The name of the project.
|
|
1264
|
-
:param run_in_threadpool: A function that runs another function in a thread pool.
|
|
1265
|
-
:param metric_list: List of metrics to include from the time series DB. Defaults to all metrics.
|
|
1266
|
-
|
|
1267
|
-
:return: A list of `ModelEndpointMonitoringMetric` objects.
|
|
1268
|
-
"""
|
|
1269
|
-
|
|
1270
|
-
uids = [mep.metadata.uid for mep in model_endpoint_objects]
|
|
1271
|
-
|
|
1272
|
-
metric_name_to_function = {
|
|
1273
|
-
"error_count": self.get_error_count,
|
|
1274
|
-
"last_request": self.get_last_request,
|
|
1275
|
-
"avg_latency": self.get_avg_latency,
|
|
1276
|
-
"result_status": self.get_drift_status,
|
|
1277
|
-
}
|
|
1278
|
-
if metric_list is not None:
|
|
1279
|
-
for metric_name in list(metric_name_to_function):
|
|
1280
|
-
if metric_name not in metric_list:
|
|
1281
|
-
del metric_name_to_function[metric_name]
|
|
1282
|
-
|
|
1283
|
-
metric_name_to_df = {
|
|
1284
|
-
metric_name: function(endpoint_ids=uids)
|
|
1285
|
-
for metric_name, function in metric_name_to_function.items()
|
|
1286
|
-
}
|
|
1287
|
-
|
|
1288
|
-
def add_metrics(
|
|
1289
|
-
mep: mlrun.common.schemas.ModelEndpoint,
|
|
1290
|
-
df_dictionary: dict[str, pd.DataFrame],
|
|
1291
|
-
):
|
|
1292
|
-
for metric in df_dictionary.keys():
|
|
1293
|
-
df = df_dictionary.get(metric, pd.DataFrame())
|
|
1294
|
-
if not df.empty:
|
|
1295
|
-
line = df[df["endpoint_id"] == mep.metadata.uid]
|
|
1296
|
-
if not line.empty and metric in line:
|
|
1297
|
-
value = line[metric].item()
|
|
1298
|
-
if isinstance(value, pd.Timestamp):
|
|
1299
|
-
value = value.to_pydatetime()
|
|
1300
|
-
setattr(mep.status, metric, value)
|
|
1301
|
-
|
|
1302
|
-
return mep
|
|
1303
|
-
|
|
1304
|
-
return list(
|
|
1305
|
-
map(
|
|
1306
|
-
lambda mep: add_metrics(
|
|
1307
|
-
mep=mep,
|
|
1308
|
-
df_dictionary=metric_name_to_df,
|
|
1309
|
-
),
|
|
1310
|
-
model_endpoint_objects,
|
|
1311
|
-
)
|
|
1312
|
-
)
|
|
1313
|
-
|
|
1314
|
-
def get_drift_data(
    self,
    start: datetime,
    end: datetime,
) -> mm_schemas.ModelEndpointDriftValues:
    """
    Query the application results super table for drift statuses in the given time range.

    Only records whose result status is ``potential_detection`` or ``detected`` are
    considered. The records are aggregated with ``max`` per time interval and partitioned
    by endpoint ID.

    :param start: The start of the requested time range (aligned before querying).
    :param end:   The end of the requested time range (aligned before querying).

    :return: A `ModelEndpointDriftValues` object. Its ``values`` list is empty when the
             query returns no records.
    """
    drift_statuses = [
        mm_schemas.ResultStatusApp.potential_detection.value,
        mm_schemas.ResultStatusApp.detected.value,
    ]
    status_filter = self._generate_filter_query(
        filter_column=mm_schemas.ResultData.RESULT_STATUS,
        filter_values=drift_statuses,
    )
    results_table = self.tables[
        mm_schemas.TDEngineSuperTables.APP_RESULTS
    ].super_table
    aligned_start, aligned_end, window = self._prepare_aligned_start_end(start, end)

    # For every (time interval, endpoint_id) pair, fetch the maximal result status
    records = self._get_records(
        table=results_table,
        start=aligned_start,
        end=aligned_end,
        interval=window,
        columns=[mm_schemas.ResultData.RESULT_STATUS],
        filter_query=status_filter,
        timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
        agg_funcs=["max"],
        partition_by=mm_schemas.WriterEvent.ENDPOINT_ID,
    )

    if not records.empty:
        # Parse the window-start column into datetimes before building the drift values
        records["_wstart"] = pd.to_datetime(records["_wstart"])
        return self._df_to_drift_data(records)

    return mm_schemas.ModelEndpointDriftValues(values=[])
|
|
1346
|
-
|
|
1347
|
-
# Note: this function serves as a reference for checking the TSDB for the existence of a metric.
|
|
1348
|
-
#
|
|
1349
|
-
# def read_prediction_metric_for_endpoint_if_exists(
|
|
1350
|
-
# self, endpoint_id: str
|
|
1351
|
-
# ) -> Optional[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
1352
|
-
# """
|
|
1353
|
-
# Read the "invocations" metric for the provided model endpoint, and return the metric object
|
|
1354
|
-
# if it exists.
|
|
1355
|
-
#
|
|
1356
|
-
# :param endpoint_id: The model endpoint identifier.
|
|
1357
|
-
# :return: `None` if the invocations metric does not exist, otherwise return the
|
|
1358
|
-
# corresponding metric object.
|
|
1359
|
-
# """
|
|
1360
|
-
# # Read just one record, because we just want to check if there is any data for this endpoint_id
|
|
1361
|
-
# predictions = self.read_predictions(
|
|
1362
|
-
# endpoint_id=endpoint_id,
|
|
1363
|
-
# start=datetime.min,
|
|
1364
|
-
# end=mlrun.utils.now_date(),
|
|
1365
|
-
# limit=1,
|
|
1366
|
-
# )
|
|
1367
|
-
# if predictions:
|
|
1368
|
-
# return get_invocations_metric(self.project)
|