mlrun 1.7.0rc36__py3-none-any.whl → 1.7.0rc38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/alerts/alert.py +64 -0
- mlrun/common/schemas/alert.py +2 -2
- mlrun/common/schemas/model_monitoring/constants.py +4 -0
- mlrun/common/schemas/notification.py +26 -7
- mlrun/datastore/azure_blob.py +120 -30
- mlrun/datastore/s3.py +8 -1
- mlrun/feature_store/common.py +6 -11
- mlrun/model.py +5 -0
- mlrun/model_monitoring/api.py +1 -1
- mlrun/model_monitoring/applications/_application_steps.py +9 -4
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +14 -1
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +10 -7
- mlrun/model_monitoring/db/tsdb/base.py +141 -12
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +65 -5
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +23 -1
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +211 -35
- mlrun/model_monitoring/helpers.py +1 -2
- mlrun/model_monitoring/stream_processing.py +67 -25
- mlrun/model_monitoring/writer.py +4 -1
- mlrun/projects/operations.py +4 -0
- mlrun/projects/project.py +11 -1
- mlrun/runtimes/__init__.py +15 -8
- mlrun/runtimes/base.py +3 -0
- mlrun/runtimes/nuclio/application/application.py +98 -17
- mlrun/runtimes/nuclio/function.py +5 -1
- mlrun/runtimes/pod.py +2 -2
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +11 -16
- mlrun/serving/routers.py +1 -4
- mlrun/serving/server.py +4 -7
- mlrun/serving/states.py +1 -1
- mlrun/serving/v2_serving.py +5 -7
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/METADATA +12 -6
- {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/RECORD +40 -40
- {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/top_level.txt +0 -0
|
@@ -15,8 +15,10 @@
|
|
|
15
15
|
import typing
|
|
16
16
|
from abc import ABC, abstractmethod
|
|
17
17
|
from datetime import datetime
|
|
18
|
+
from typing import Union
|
|
18
19
|
|
|
19
20
|
import pandas as pd
|
|
21
|
+
import pydantic
|
|
20
22
|
|
|
21
23
|
import mlrun.common.schemas.model_monitoring as mm_schemas
|
|
22
24
|
import mlrun.model_monitoring.db.tsdb.helpers
|
|
@@ -46,7 +48,7 @@ class TSDBConnector(ABC):
|
|
|
46
48
|
self.project = project
|
|
47
49
|
|
|
48
50
|
@abstractmethod
|
|
49
|
-
def apply_monitoring_stream_steps(self, graph):
|
|
51
|
+
def apply_monitoring_stream_steps(self, graph) -> None:
|
|
50
52
|
"""
|
|
51
53
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
52
54
|
different key metric dictionaries. This data is being used by the monitoring dashboards in
|
|
@@ -58,6 +60,14 @@ class TSDBConnector(ABC):
|
|
|
58
60
|
"""
|
|
59
61
|
pass
|
|
60
62
|
|
|
63
|
+
@abstractmethod
|
|
64
|
+
def handle_model_error(self, graph, **kwargs) -> None:
|
|
65
|
+
"""
|
|
66
|
+
Adds a branch to the stream pod graph to handle events that
|
|
67
|
+
arrive with errors from the model server and saves them to the error TSDB table.
|
|
68
|
+
The first step generated by this method should come after the `ForwardError` step.
|
|
69
|
+
"""
|
|
70
|
+
|
|
61
71
|
@abstractmethod
|
|
62
72
|
def write_application_event(
|
|
63
73
|
self,
|
|
@@ -180,6 +190,117 @@ class TSDBConnector(ABC):
|
|
|
180
190
|
:return: Metric values object or no data object.
|
|
181
191
|
"""
|
|
182
192
|
|
|
193
|
+
@abstractmethod
|
|
194
|
+
def get_last_request(
|
|
195
|
+
self,
|
|
196
|
+
endpoint_ids: Union[str, list[str]],
|
|
197
|
+
start: Union[datetime, str] = "0",
|
|
198
|
+
end: Union[datetime, str] = "now",
|
|
199
|
+
) -> pd.DataFrame:
|
|
200
|
+
"""
|
|
201
|
+
Fetches data from the predictions TSDB table and returns the most recent request
|
|
202
|
+
timestamp for each specified endpoint.
|
|
203
|
+
|
|
204
|
+
:param endpoint_ids: A single model endpoint identifier or a list of model endpoint identifiers.
|
|
205
|
+
:param start: The start time for the query.
|
|
206
|
+
:param end: The end time for the query.
|
|
207
|
+
|
|
208
|
+
:return: A pd.DataFrame containing the columns [endpoint_id, last_request, last_latency].
|
|
209
|
+
If an endpoint has not been invoked within the specified time range, it will not appear in the result.
|
|
210
|
+
"""
|
|
211
|
+
|
|
212
|
+
@abstractmethod
|
|
213
|
+
def get_drift_status(
|
|
214
|
+
self,
|
|
215
|
+
endpoint_ids: Union[str, list[str]],
|
|
216
|
+
start: Union[datetime, str] = "now-24h",
|
|
217
|
+
end: Union[datetime, str] = "now",
|
|
218
|
+
) -> pd.DataFrame:
|
|
219
|
+
"""
|
|
220
|
+
Fetches data from the app-results TSDB table and returns the highest status among all
|
|
221
|
+
the results in the provided time range, which by default is the last 24 hours, for each specified endpoint.
|
|
222
|
+
|
|
223
|
+
:param endpoint_ids: A single model endpoint identifier or a list of model endpoint identifiers.
|
|
224
|
+
:param start: The start time for the query.
|
|
225
|
+
:param end: The end time for the query.
|
|
226
|
+
|
|
227
|
+
:return: A pd.DataFrame containing the columns [result_status, endpoint_id].
|
|
228
|
+
If an endpoint has not been monitored within the specified time range (last 24 hours),
|
|
229
|
+
it will not appear in the result.
|
|
230
|
+
"""
|
|
231
|
+
|
|
232
|
+
@abstractmethod
|
|
233
|
+
def get_metrics_metadata(
|
|
234
|
+
self,
|
|
235
|
+
endpoint_id: str,
|
|
236
|
+
start: Union[datetime, str] = "0",
|
|
237
|
+
end: Union[datetime, str] = "now",
|
|
238
|
+
) -> pd.DataFrame:
|
|
239
|
+
"""
|
|
240
|
+
Fetches distinct metrics metadata from the metrics TSDB table for a specified model endpoint.
|
|
241
|
+
|
|
242
|
+
:param endpoint_id: The model endpoint identifier.
|
|
243
|
+
:param start: The start time of the query.
|
|
244
|
+
:param end: The end time of the query.
|
|
245
|
+
|
|
246
|
+
:return: A pd.DataFrame containing all distinct metrics for the specified endpoint within the given time range.
|
|
247
|
+
Containing the columns [application_name, metric_name, endpoint_id]
|
|
248
|
+
"""
|
|
249
|
+
|
|
250
|
+
@abstractmethod
|
|
251
|
+
def get_results_metadata(
|
|
252
|
+
self,
|
|
253
|
+
endpoint_id: str,
|
|
254
|
+
start: Union[datetime, str] = "0",
|
|
255
|
+
end: Union[datetime, str] = "now",
|
|
256
|
+
) -> pd.DataFrame:
|
|
257
|
+
"""
|
|
258
|
+
Fetches distinct results metadata from the app-results TSDB table for a specified model endpoint.
|
|
259
|
+
|
|
260
|
+
:param endpoint_id: The model endpoint identifier.
|
|
261
|
+
:param start: The start time of the query.
|
|
262
|
+
:param end: The end time of the query.
|
|
263
|
+
|
|
264
|
+
:return: A pd.DataFrame containing all distinct results for the specified endpoint within the given time range.
|
|
265
|
+
Containing the columns [application_name, result_name, result_kind, endpoint_id]
|
|
266
|
+
"""
|
|
267
|
+
|
|
268
|
+
@abstractmethod
|
|
269
|
+
def get_error_count(
|
|
270
|
+
self,
|
|
271
|
+
endpoint_ids: Union[str, list[str]],
|
|
272
|
+
start: Union[datetime, str] = "0",
|
|
273
|
+
end: Union[datetime, str] = "now",
|
|
274
|
+
) -> pd.DataFrame:
|
|
275
|
+
"""
|
|
276
|
+
Fetches data from the error TSDB table and returns the error count for each specified endpoint.
|
|
277
|
+
|
|
278
|
+
:param endpoint_ids: A single model endpoint identifier or a list of model endpoint identifiers.
|
|
279
|
+
:param start: The start time for the query.
|
|
280
|
+
:param end: The end time for the query.
|
|
281
|
+
|
|
282
|
+
:return: A pd.DataFrame containing the columns [error_count, endpoint_id].
|
|
283
|
+
If an endpoint has not raised an error within the specified time range, it will not appear in the result.
|
|
284
|
+
"""
|
|
285
|
+
|
|
286
|
+
@abstractmethod
|
|
287
|
+
def get_avg_latency(
|
|
288
|
+
self,
|
|
289
|
+
endpoint_ids: Union[str, list[str]],
|
|
290
|
+
start: Union[datetime, str] = "0",
|
|
291
|
+
end: Union[datetime, str] = "now",
|
|
292
|
+
) -> pd.DataFrame:
|
|
293
|
+
"""
|
|
294
|
+
Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint.
|
|
295
|
+
|
|
296
|
+
:param endpoint_ids: A single model endpoint identifier or a list of model endpoint identifiers.
|
|
297
|
+
:param start: The start time for the query.
|
|
298
|
+
:param end: The end time for the query.
|
|
299
|
+
|
|
300
|
+
:return: A pd.DataFrame containing the columns [avg_latency, endpoint_id].
|
|
301
|
+
If an endpoint has not been invoked within the specified time range, it will not appear in the result.
|
|
302
|
+
"""
|
|
303
|
+
|
|
183
304
|
@staticmethod
|
|
184
305
|
def df_to_metrics_values(
|
|
185
306
|
*,
|
|
@@ -289,19 +410,27 @@ class TSDBConnector(ABC):
|
|
|
289
410
|
full_name = mlrun.model_monitoring.helpers._compose_full_name(
|
|
290
411
|
project=project, app=app_name, name=name
|
|
291
412
|
)
|
|
292
|
-
|
|
293
|
-
|
|
413
|
+
try:
|
|
414
|
+
metrics_values.append(
|
|
415
|
+
mm_schemas.ModelEndpointMonitoringResultValues(
|
|
416
|
+
full_name=full_name,
|
|
417
|
+
result_kind=result_kind,
|
|
418
|
+
values=list(
|
|
419
|
+
zip(
|
|
420
|
+
sub_df.index,
|
|
421
|
+
sub_df[mm_schemas.ResultData.RESULT_VALUE],
|
|
422
|
+
sub_df[mm_schemas.ResultData.RESULT_STATUS],
|
|
423
|
+
)
|
|
424
|
+
), # pyright: ignore[reportArgumentType]
|
|
425
|
+
)
|
|
426
|
+
)
|
|
427
|
+
except pydantic.ValidationError:
|
|
428
|
+
logger.exception(
|
|
429
|
+
"Failed to convert data-frame into `ModelEndpointMonitoringResultValues`",
|
|
294
430
|
full_name=full_name,
|
|
295
|
-
|
|
296
|
-
values=list(
|
|
297
|
-
zip(
|
|
298
|
-
sub_df.index,
|
|
299
|
-
sub_df[mm_schemas.ResultData.RESULT_VALUE],
|
|
300
|
-
sub_df[mm_schemas.ResultData.RESULT_STATUS],
|
|
301
|
-
)
|
|
302
|
-
), # pyright: ignore[reportArgumentType]
|
|
431
|
+
sub_df_json=sub_df.to_json(),
|
|
303
432
|
)
|
|
304
|
-
|
|
433
|
+
raise
|
|
305
434
|
del metrics_without_data[full_name]
|
|
306
435
|
|
|
307
436
|
for metric in metrics_without_data.values():
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
import typing
|
|
16
16
|
from datetime import datetime
|
|
17
|
+
from typing import Union
|
|
17
18
|
|
|
18
19
|
import pandas as pd
|
|
19
20
|
import taosws
|
|
@@ -156,6 +157,9 @@ class TDEngineConnector(TSDBConnector):
|
|
|
156
157
|
after="ProcessBeforeTDEngine",
|
|
157
158
|
)
|
|
158
159
|
|
|
160
|
+
def handle_model_error(self, graph, **kwargs) -> None:
|
|
161
|
+
pass
|
|
162
|
+
|
|
159
163
|
def delete_tsdb_resources(self):
|
|
160
164
|
"""
|
|
161
165
|
Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
|
|
@@ -246,11 +250,9 @@ class TDEngineConnector(TSDBConnector):
|
|
|
246
250
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
247
251
|
f"Failed to query table {table} in database {self.database}, {str(e)}"
|
|
248
252
|
)
|
|
249
|
-
columns = []
|
|
250
|
-
for column in query_result.fields:
|
|
251
|
-
columns.append(column.name())
|
|
252
253
|
|
|
253
|
-
|
|
254
|
+
df_columns = [field.name() for field in query_result.fields]
|
|
255
|
+
return pd.DataFrame(query_result, columns=df_columns)
|
|
254
256
|
|
|
255
257
|
def read_metrics_data(
|
|
256
258
|
self,
|
|
@@ -274,13 +276,22 @@ class TDEngineConnector(TSDBConnector):
|
|
|
274
276
|
],
|
|
275
277
|
],
|
|
276
278
|
]:
|
|
279
|
+
timestamp_column = mm_schemas.WriterEvent.END_INFER_TIME
|
|
280
|
+
columns = [timestamp_column, mm_schemas.WriterEvent.APPLICATION_NAME]
|
|
277
281
|
if type == "metrics":
|
|
278
282
|
table = mm_schemas.TDEngineSuperTables.METRICS
|
|
279
283
|
name = mm_schemas.MetricData.METRIC_NAME
|
|
284
|
+
columns += [name, mm_schemas.MetricData.METRIC_VALUE]
|
|
280
285
|
df_handler = self.df_to_metrics_values
|
|
281
286
|
elif type == "results":
|
|
282
287
|
table = mm_schemas.TDEngineSuperTables.APP_RESULTS
|
|
283
288
|
name = mm_schemas.ResultData.RESULT_NAME
|
|
289
|
+
columns += [
|
|
290
|
+
name,
|
|
291
|
+
mm_schemas.ResultData.RESULT_VALUE,
|
|
292
|
+
mm_schemas.ResultData.RESULT_STATUS,
|
|
293
|
+
mm_schemas.ResultData.RESULT_KIND,
|
|
294
|
+
]
|
|
284
295
|
df_handler = self.df_to_results_values
|
|
285
296
|
else:
|
|
286
297
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
@@ -300,7 +311,8 @@ class TDEngineConnector(TSDBConnector):
|
|
|
300
311
|
start=start,
|
|
301
312
|
end=end,
|
|
302
313
|
filter_query=filter_query,
|
|
303
|
-
timestamp_column=
|
|
314
|
+
timestamp_column=timestamp_column,
|
|
315
|
+
columns=columns,
|
|
304
316
|
)
|
|
305
317
|
|
|
306
318
|
df[mm_schemas.WriterEvent.END_INFER_TIME] = pd.to_datetime(
|
|
@@ -377,6 +389,54 @@ class TDEngineConnector(TSDBConnector):
|
|
|
377
389
|
), # pyright: ignore[reportArgumentType]
|
|
378
390
|
)
|
|
379
391
|
|
|
392
|
+
def get_last_request(
|
|
393
|
+
self,
|
|
394
|
+
endpoint_ids: Union[str, list[str]],
|
|
395
|
+
start: Union[datetime, str] = "0",
|
|
396
|
+
end: Union[datetime, str] = "now",
|
|
397
|
+
) -> pd.DataFrame:
|
|
398
|
+
pass
|
|
399
|
+
|
|
400
|
+
def get_drift_status(
|
|
401
|
+
self,
|
|
402
|
+
endpoint_ids: Union[str, list[str]],
|
|
403
|
+
start: Union[datetime, str] = "now-24h",
|
|
404
|
+
end: Union[datetime, str] = "now",
|
|
405
|
+
) -> pd.DataFrame:
|
|
406
|
+
pass
|
|
407
|
+
|
|
408
|
+
def get_metrics_metadata(
|
|
409
|
+
self,
|
|
410
|
+
endpoint_id: str,
|
|
411
|
+
start: Union[datetime, str] = "0",
|
|
412
|
+
end: Union[datetime, str] = "now",
|
|
413
|
+
) -> pd.DataFrame:
|
|
414
|
+
pass
|
|
415
|
+
|
|
416
|
+
def get_results_metadata(
|
|
417
|
+
self,
|
|
418
|
+
endpoint_id: str,
|
|
419
|
+
start: Union[datetime, str] = "0",
|
|
420
|
+
end: Union[datetime, str] = "now",
|
|
421
|
+
) -> pd.DataFrame:
|
|
422
|
+
pass
|
|
423
|
+
|
|
424
|
+
def get_error_count(
|
|
425
|
+
self,
|
|
426
|
+
endpoint_ids: Union[str, list[str]],
|
|
427
|
+
start: Union[datetime, str] = "0",
|
|
428
|
+
end: Union[datetime, str] = "now",
|
|
429
|
+
) -> pd.DataFrame:
|
|
430
|
+
pass
|
|
431
|
+
|
|
432
|
+
def get_avg_latency(
|
|
433
|
+
self,
|
|
434
|
+
endpoint_ids: Union[str, list[str]],
|
|
435
|
+
start: Union[datetime, str] = "0",
|
|
436
|
+
end: Union[datetime, str] = "now",
|
|
437
|
+
) -> pd.DataFrame:
|
|
438
|
+
pass
|
|
439
|
+
|
|
380
440
|
# Note: this function serves as a reference for checking the TSDB for the existence of a metric.
|
|
381
441
|
#
|
|
382
442
|
# def read_prediction_metric_for_endpoint_if_exists(
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
14
|
+
from datetime import datetime
|
|
15
15
|
from typing import Any
|
|
16
16
|
|
|
17
17
|
import mlrun.feature_store.steps
|
|
@@ -20,6 +20,7 @@ from mlrun.common.schemas.model_monitoring import (
|
|
|
20
20
|
EventKeyMetrics,
|
|
21
21
|
EventLiveStats,
|
|
22
22
|
)
|
|
23
|
+
from mlrun.utils import logger
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
|
|
@@ -134,3 +135,24 @@ class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
|
|
|
134
135
|
else:
|
|
135
136
|
unpacked[key] = new_event[key]
|
|
136
137
|
return unpacked if unpacked else None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class ErrorExtractor(mlrun.feature_store.steps.MapClass):
|
|
141
|
+
def __init__(self, **kwargs):
|
|
142
|
+
"""
|
|
143
|
+
Prepare the event for insertion into the errors TSDB table.
|
|
144
|
+
"""
|
|
145
|
+
super().__init__(**kwargs)
|
|
146
|
+
|
|
147
|
+
def do(self, event):
|
|
148
|
+
error = event.get("error")
|
|
149
|
+
timestamp = datetime.fromisoformat(event.get("when"))
|
|
150
|
+
endpoint_id = event[EventFieldType.ENDPOINT_ID]
|
|
151
|
+
event = {
|
|
152
|
+
EventFieldType.MODEL_ERROR: str(error),
|
|
153
|
+
EventFieldType.ENDPOINT_ID: endpoint_id,
|
|
154
|
+
EventFieldType.TIMESTAMP: timestamp,
|
|
155
|
+
EventFieldType.ERROR_COUNT: 1.0,
|
|
156
|
+
}
|
|
157
|
+
logger.info("Write error to errors TSDB table", event=event)
|
|
158
|
+
return event
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from datetime import datetime
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
16
|
from io import StringIO
|
|
17
17
|
from typing import Literal, Optional, Union
|
|
18
18
|
|
|
@@ -33,7 +33,7 @@ _TSDB_RATE = "1/s"
|
|
|
33
33
|
_CONTAINER = "users"
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
def _is_no_schema_error(exc: v3io_frames.
|
|
36
|
+
def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
|
|
37
37
|
"""
|
|
38
38
|
In case of a nonexistent TSDB table - a `v3io_frames.ReadError` error is raised.
|
|
39
39
|
Check if the error message contains the relevant string to verify the cause.
|
|
@@ -89,6 +89,19 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
89
89
|
)
|
|
90
90
|
self.tables[mm_schemas.V3IOTSDBTables.EVENTS] = events_path
|
|
91
91
|
|
|
92
|
+
errors_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
93
|
+
project=self.project,
|
|
94
|
+
kind=mm_schemas.FileTargetKind.ERRORS,
|
|
95
|
+
)
|
|
96
|
+
(
|
|
97
|
+
_,
|
|
98
|
+
_,
|
|
99
|
+
errors_path,
|
|
100
|
+
) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
|
|
101
|
+
errors_table_full_path
|
|
102
|
+
)
|
|
103
|
+
self.tables[mm_schemas.V3IOTSDBTables.ERRORS] = errors_path
|
|
104
|
+
|
|
92
105
|
monitoring_application_full_path = (
|
|
93
106
|
mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
94
107
|
project=self.project,
|
|
@@ -160,7 +173,6 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
160
173
|
- endpoint_features (Prediction and feature names and values)
|
|
161
174
|
- custom_metrics (user-defined metrics)
|
|
162
175
|
"""
|
|
163
|
-
|
|
164
176
|
# Write latency per prediction, labeled by endpoint ID only
|
|
165
177
|
graph.add_step(
|
|
166
178
|
"storey.TSDBTarget",
|
|
@@ -171,7 +183,10 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
171
183
|
time_col=mm_schemas.EventFieldType.TIMESTAMP,
|
|
172
184
|
container=self.container,
|
|
173
185
|
v3io_frames=self.v3io_framesd,
|
|
174
|
-
columns=[
|
|
186
|
+
columns=[
|
|
187
|
+
mm_schemas.EventFieldType.LATENCY,
|
|
188
|
+
mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
|
|
189
|
+
],
|
|
175
190
|
index_cols=[
|
|
176
191
|
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
177
192
|
],
|
|
@@ -255,6 +270,40 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
255
270
|
apply_storey_filter()
|
|
256
271
|
apply_tsdb_target(name="tsdb3", after="FilterNotNone")
|
|
257
272
|
|
|
273
|
+
def handle_model_error(
|
|
274
|
+
self,
|
|
275
|
+
graph,
|
|
276
|
+
tsdb_batching_max_events: int = 10,
|
|
277
|
+
tsdb_batching_timeout_secs: int = 60,
|
|
278
|
+
**kwargs,
|
|
279
|
+
) -> None:
|
|
280
|
+
graph.add_step(
|
|
281
|
+
"mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ErrorExtractor",
|
|
282
|
+
name="error_extractor",
|
|
283
|
+
after="ForwardError",
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
graph.add_step(
|
|
287
|
+
"storey.TSDBTarget",
|
|
288
|
+
name="tsdb_error",
|
|
289
|
+
after="error_extractor",
|
|
290
|
+
path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.ERRORS]}",
|
|
291
|
+
rate="1/s",
|
|
292
|
+
time_col=mm_schemas.EventFieldType.TIMESTAMP,
|
|
293
|
+
container=self.container,
|
|
294
|
+
v3io_frames=self.v3io_framesd,
|
|
295
|
+
columns=[
|
|
296
|
+
mm_schemas.EventFieldType.MODEL_ERROR,
|
|
297
|
+
mm_schemas.EventFieldType.ERROR_COUNT,
|
|
298
|
+
],
|
|
299
|
+
index_cols=[
|
|
300
|
+
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
301
|
+
],
|
|
302
|
+
max_events=tsdb_batching_max_events,
|
|
303
|
+
flush_after_seconds=tsdb_batching_timeout_secs,
|
|
304
|
+
key=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
305
|
+
)
|
|
306
|
+
|
|
258
307
|
def write_application_event(
|
|
259
308
|
self,
|
|
260
309
|
event: dict,
|
|
@@ -437,7 +486,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
437
486
|
step=sliding_window_step,
|
|
438
487
|
**kwargs,
|
|
439
488
|
)
|
|
440
|
-
except v3io_frames.
|
|
489
|
+
except v3io_frames.Error as err:
|
|
441
490
|
if _is_no_schema_error(err):
|
|
442
491
|
return pd.DataFrame()
|
|
443
492
|
else:
|
|
@@ -504,10 +553,16 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
504
553
|
if type == "metrics":
|
|
505
554
|
table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
|
|
506
555
|
name = mm_schemas.MetricData.METRIC_NAME
|
|
556
|
+
columns = [mm_schemas.MetricData.METRIC_VALUE]
|
|
507
557
|
df_handler = self.df_to_metrics_values
|
|
508
558
|
elif type == "results":
|
|
509
559
|
table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
|
|
510
560
|
name = mm_schemas.ResultData.RESULT_NAME
|
|
561
|
+
columns = [
|
|
562
|
+
mm_schemas.ResultData.RESULT_VALUE,
|
|
563
|
+
mm_schemas.ResultData.RESULT_STATUS,
|
|
564
|
+
mm_schemas.ResultData.RESULT_KIND,
|
|
565
|
+
]
|
|
511
566
|
df_handler = self.df_to_results_values
|
|
512
567
|
else:
|
|
513
568
|
raise ValueError(f"Invalid {type = }")
|
|
@@ -517,6 +572,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
517
572
|
metric_and_app_names=[(metric.app, metric.name) for metric in metrics],
|
|
518
573
|
table_path=table_path,
|
|
519
574
|
name=name,
|
|
575
|
+
columns=columns,
|
|
520
576
|
)
|
|
521
577
|
|
|
522
578
|
logger.debug("Querying V3IO TSDB", query=query)
|
|
@@ -627,33 +683,153 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
627
683
|
), # pyright: ignore[reportArgumentType]
|
|
628
684
|
)
|
|
629
685
|
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
686
|
+
def get_last_request(
|
|
687
|
+
self,
|
|
688
|
+
endpoint_ids: Union[str, list[str]],
|
|
689
|
+
start: Union[datetime, str] = "0",
|
|
690
|
+
end: Union[datetime, str] = "now",
|
|
691
|
+
) -> pd.DataFrame:
|
|
692
|
+
endpoint_ids = (
|
|
693
|
+
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
694
|
+
)
|
|
695
|
+
df = self._get_records(
|
|
696
|
+
table=mm_schemas.FileTargetKind.PREDICTIONS,
|
|
697
|
+
start=start,
|
|
698
|
+
end=end,
|
|
699
|
+
filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
|
|
700
|
+
agg_funcs=["last"],
|
|
701
|
+
)
|
|
702
|
+
if not df.empty:
|
|
703
|
+
df.rename(
|
|
704
|
+
columns={
|
|
705
|
+
f"last({mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP})": mm_schemas.EventFieldType.LAST_REQUEST,
|
|
706
|
+
f"last({mm_schemas.EventFieldType.LATENCY})": f"last_{mm_schemas.EventFieldType.LATENCY}",
|
|
707
|
+
},
|
|
708
|
+
inplace=True,
|
|
709
|
+
)
|
|
710
|
+
df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
|
|
711
|
+
mm_schemas.EventFieldType.LAST_REQUEST
|
|
712
|
+
].map(
|
|
713
|
+
lambda last_request: datetime.fromtimestamp(
|
|
714
|
+
last_request, tz=timezone.utc
|
|
715
|
+
)
|
|
716
|
+
)
|
|
717
|
+
|
|
718
|
+
return df.reset_index(drop=True)
|
|
719
|
+
|
|
720
|
+
def get_drift_status(
|
|
721
|
+
self,
|
|
722
|
+
endpoint_ids: Union[str, list[str]],
|
|
723
|
+
start: Union[datetime, str] = "now-24h",
|
|
724
|
+
end: Union[datetime, str] = "now",
|
|
725
|
+
) -> pd.DataFrame:
|
|
726
|
+
endpoint_ids = (
|
|
727
|
+
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
728
|
+
)
|
|
729
|
+
df = self._get_records(
|
|
730
|
+
table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
|
|
731
|
+
start=start,
|
|
732
|
+
end=end,
|
|
733
|
+
columns=[mm_schemas.ResultData.RESULT_STATUS],
|
|
734
|
+
filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
|
|
735
|
+
agg_funcs=["max"],
|
|
736
|
+
group_by="endpoint_id",
|
|
737
|
+
)
|
|
738
|
+
if not df.empty:
|
|
739
|
+
df.columns = [
|
|
740
|
+
col[len("max(") : -1] if "max(" in col else col for col in df.columns
|
|
741
|
+
]
|
|
742
|
+
return df.reset_index(drop=True)
|
|
743
|
+
|
|
744
|
+
def get_metrics_metadata(
|
|
745
|
+
self,
|
|
746
|
+
endpoint_id: str,
|
|
747
|
+
start: Union[datetime, str] = "0",
|
|
748
|
+
end: Union[datetime, str] = "now",
|
|
749
|
+
) -> pd.DataFrame:
|
|
750
|
+
df = self._get_records(
|
|
751
|
+
table=mm_schemas.V3IOTSDBTables.METRICS,
|
|
752
|
+
start=start,
|
|
753
|
+
end=end,
|
|
754
|
+
columns=[mm_schemas.MetricData.METRIC_VALUE],
|
|
755
|
+
filter_query=f"endpoint_id=='{endpoint_id}'",
|
|
756
|
+
agg_funcs=["last"],
|
|
757
|
+
)
|
|
758
|
+
if not df.empty:
|
|
759
|
+
df.drop(
|
|
760
|
+
columns=[f"last({mm_schemas.MetricData.METRIC_VALUE})"], inplace=True
|
|
761
|
+
)
|
|
762
|
+
return df.reset_index(drop=True)
|
|
763
|
+
|
|
764
|
+
def get_results_metadata(
|
|
765
|
+
self,
|
|
766
|
+
endpoint_id: str,
|
|
767
|
+
start: Union[datetime, str] = "0",
|
|
768
|
+
end: Union[datetime, str] = "now",
|
|
769
|
+
) -> pd.DataFrame:
|
|
770
|
+
df = self._get_records(
|
|
771
|
+
table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
|
|
772
|
+
start=start,
|
|
773
|
+
end=end,
|
|
774
|
+
columns=[
|
|
775
|
+
mm_schemas.ResultData.RESULT_KIND,
|
|
776
|
+
],
|
|
777
|
+
filter_query=f"endpoint_id=='{endpoint_id}'",
|
|
778
|
+
agg_funcs=["last"],
|
|
779
|
+
)
|
|
780
|
+
if not df.empty:
|
|
781
|
+
df.rename(
|
|
782
|
+
columns={
|
|
783
|
+
f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND
|
|
784
|
+
},
|
|
785
|
+
inplace=True,
|
|
786
|
+
)
|
|
787
|
+
return df.reset_index(drop=True)
|
|
788
|
+
|
|
789
|
+
def get_error_count(
|
|
790
|
+
self,
|
|
791
|
+
endpoint_ids: Union[str, list[str]],
|
|
792
|
+
start: Union[datetime, str] = "0",
|
|
793
|
+
end: Union[datetime, str] = "now",
|
|
794
|
+
) -> pd.DataFrame:
|
|
795
|
+
endpoint_ids = (
|
|
796
|
+
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
797
|
+
)
|
|
798
|
+
df = self._get_records(
|
|
799
|
+
table=mm_schemas.FileTargetKind.ERRORS,
|
|
800
|
+
start=start,
|
|
801
|
+
end=end,
|
|
802
|
+
columns=[mm_schemas.EventFieldType.ERROR_COUNT],
|
|
803
|
+
filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
|
|
804
|
+
agg_funcs=["count"],
|
|
805
|
+
)
|
|
806
|
+
if not df.empty:
|
|
807
|
+
df.rename(
|
|
808
|
+
columns={
|
|
809
|
+
f"count({mm_schemas.EventFieldType.ERROR_COUNT})": mm_schemas.EventFieldType.ERROR_COUNT
|
|
810
|
+
},
|
|
811
|
+
inplace=True,
|
|
812
|
+
)
|
|
813
|
+
df.dropna(inplace=True)
|
|
814
|
+
return df.reset_index(drop=True)
|
|
815
|
+
|
|
816
|
+
def get_avg_latency(
|
|
817
|
+
self,
|
|
818
|
+
endpoint_ids: Union[str, list[str]],
|
|
819
|
+
start: Union[datetime, str] = "0",
|
|
820
|
+
end: Union[datetime, str] = "now",
|
|
821
|
+
) -> pd.DataFrame:
|
|
822
|
+
endpoint_ids = (
|
|
823
|
+
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
824
|
+
)
|
|
825
|
+
df = self._get_records(
|
|
826
|
+
table=mm_schemas.FileTargetKind.PREDICTIONS,
|
|
827
|
+
start=start,
|
|
828
|
+
end=end,
|
|
829
|
+
columns=[mm_schemas.EventFieldType.LATENCY],
|
|
830
|
+
filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
|
|
831
|
+
agg_funcs=["avg"],
|
|
832
|
+
)
|
|
833
|
+
if not df.empty:
|
|
834
|
+
df.dropna(inplace=True)
|
|
835
|
+
return df.reset_index(drop=True)
|
|
@@ -45,8 +45,7 @@ class _BatchDict(typing.TypedDict):
|
|
|
45
45
|
|
|
46
46
|
|
|
47
47
|
def get_stream_path(
|
|
48
|
-
project: str =
|
|
49
|
-
function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
|
|
48
|
+
project: str, function_name: str = mm_constants.MonitoringFunctionNames.STREAM
|
|
50
49
|
) -> str:
|
|
51
50
|
"""
|
|
52
51
|
Get the stream path from the project secret. If it wasn't set, take it from the system configurations
|