mlrun 1.7.0rc36__py3-none-any.whl → 1.7.0rc38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (40) hide show
  1. mlrun/alerts/alert.py +64 -0
  2. mlrun/common/schemas/alert.py +2 -2
  3. mlrun/common/schemas/model_monitoring/constants.py +4 -0
  4. mlrun/common/schemas/notification.py +26 -7
  5. mlrun/datastore/azure_blob.py +120 -30
  6. mlrun/datastore/s3.py +8 -1
  7. mlrun/feature_store/common.py +6 -11
  8. mlrun/model.py +5 -0
  9. mlrun/model_monitoring/api.py +1 -1
  10. mlrun/model_monitoring/applications/_application_steps.py +9 -4
  11. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +14 -1
  12. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +10 -7
  13. mlrun/model_monitoring/db/tsdb/base.py +141 -12
  14. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +65 -5
  15. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +23 -1
  16. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +211 -35
  17. mlrun/model_monitoring/helpers.py +1 -2
  18. mlrun/model_monitoring/stream_processing.py +67 -25
  19. mlrun/model_monitoring/writer.py +4 -1
  20. mlrun/projects/operations.py +4 -0
  21. mlrun/projects/project.py +11 -1
  22. mlrun/runtimes/__init__.py +15 -8
  23. mlrun/runtimes/base.py +3 -0
  24. mlrun/runtimes/nuclio/application/application.py +98 -17
  25. mlrun/runtimes/nuclio/function.py +5 -1
  26. mlrun/runtimes/pod.py +2 -2
  27. mlrun/runtimes/remotesparkjob.py +2 -5
  28. mlrun/runtimes/sparkjob/spark3job.py +11 -16
  29. mlrun/serving/routers.py +1 -4
  30. mlrun/serving/server.py +4 -7
  31. mlrun/serving/states.py +1 -1
  32. mlrun/serving/v2_serving.py +5 -7
  33. mlrun/track/trackers/mlflow_tracker.py +5 -0
  34. mlrun/utils/version/version.json +2 -2
  35. {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/METADATA +12 -6
  36. {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/RECORD +40 -40
  37. {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/WHEEL +1 -1
  38. {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/LICENSE +0 -0
  39. {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/entry_points.txt +0 -0
  40. {mlrun-1.7.0rc36.dist-info → mlrun-1.7.0rc38.dist-info}/top_level.txt +0 -0
@@ -15,8 +15,10 @@
15
15
  import typing
16
16
  from abc import ABC, abstractmethod
17
17
  from datetime import datetime
18
+ from typing import Union
18
19
 
19
20
  import pandas as pd
21
+ import pydantic
20
22
 
21
23
  import mlrun.common.schemas.model_monitoring as mm_schemas
22
24
  import mlrun.model_monitoring.db.tsdb.helpers
@@ -46,7 +48,7 @@ class TSDBConnector(ABC):
46
48
  self.project = project
47
49
 
48
50
  @abstractmethod
49
- def apply_monitoring_stream_steps(self, graph):
51
+ def apply_monitoring_stream_steps(self, graph) -> None:
50
52
  """
51
53
  Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
52
54
  different key metric dictionaries. This data is being used by the monitoring dashboards in
@@ -58,6 +60,14 @@ class TSDBConnector(ABC):
58
60
  """
59
61
  pass
60
62
 
63
+ @abstractmethod
64
+ def handle_model_error(self, graph, **kwargs) -> None:
65
+ """
66
+ Adds a branch to the stream pod graph to handle events that
67
+ arrive with errors from the model server and saves them to the error TSDB table.
68
+ The first step generated by this method should come after the `ForwardError` step.
69
+ """
70
+
61
71
  @abstractmethod
62
72
  def write_application_event(
63
73
  self,
@@ -180,6 +190,117 @@ class TSDBConnector(ABC):
180
190
  :return: Metric values object or no data object.
181
191
  """
182
192
 
193
+ @abstractmethod
194
+ def get_last_request(
195
+ self,
196
+ endpoint_ids: Union[str, list[str]],
197
+ start: Union[datetime, str] = "0",
198
+ end: Union[datetime, str] = "now",
199
+ ) -> pd.DataFrame:
200
+ """
201
+ Fetches data from the predictions TSDB table and returns the most recent request
202
+ timestamp for each specified endpoint.
203
+
204
+ :param endpoint_ids: A list of model endpoint identifiers.
205
+ :param start: The start time for the query.
206
+ :param end: The end time for the query.
207
+
208
+ :return: A pd.DataFrame containing the columns [endpoint_id, last_request, last_latency].
209
+ If an endpoint has not been invoked within the specified time range, it will not appear in the result.
210
+ """
211
+
212
+ @abstractmethod
213
+ def get_drift_status(
214
+ self,
215
+ endpoint_ids: Union[str, list[str]],
216
+ start: Union[datetime, str] = "now-24h",
217
+ end: Union[datetime, str] = "now",
218
+ ) -> pd.DataFrame:
219
+ """
220
+ Fetches data from the app-results TSDB table and returns the highest status among all
221
+ the results in the provided time range, which by default is the last 24 hours, for each specified endpoint.
222
+
223
+ :param endpoint_ids: A list of model endpoint identifiers.
224
+ :param start: The start time for the query.
225
+ :param end: The end time for the query.
226
+
227
+ :return: A pd.DataFrame containing the columns [result_status, endpoint_id].
228
+ If an endpoint has not been monitored within the specified time range (last 24 hours),
229
+ it will not appear in the result.
230
+ """
231
+
232
+ @abstractmethod
233
+ def get_metrics_metadata(
234
+ self,
235
+ endpoint_id: str,
236
+ start: Union[datetime, str] = "0",
237
+ end: Union[datetime, str] = "now",
238
+ ) -> pd.DataFrame:
239
+ """
240
+ Fetches distinct metrics metadata from the metrics TSDB table for a specified model endpoint.
241
+
242
+ :param endpoint_id: The model endpoint identifier.
243
+ :param start: The start time of the query.
244
+ :param end: The end time of the query.
245
+
246
+ :return: A pd.DataFrame containing all distinct metrics for the specified endpoint within the given time range.
247
+ Containing the columns [application_name, metric_name, endpoint_id]
248
+ """
249
+
250
+ @abstractmethod
251
+ def get_results_metadata(
252
+ self,
253
+ endpoint_id: str,
254
+ start: Union[datetime, str] = "0",
255
+ end: Union[datetime, str] = "now",
256
+ ) -> pd.DataFrame:
257
+ """
258
+ Fetches distinct results metadata from the app-results TSDB table for a specified model endpoint.
259
+
260
+ :param endpoint_id: The model endpoint identifier.
261
+ :param start: The start time of the query.
262
+ :param end: The end time of the query.
263
+
264
+ :return: A pd.DataFrame containing all distinct results for the specified endpoint within the given time range.
265
+ Containing the columns [application_name, result_name, result_kind, endpoint_id]
266
+ """
267
+
268
+ @abstractmethod
269
+ def get_error_count(
270
+ self,
271
+ endpoint_ids: Union[str, list[str]],
272
+ start: Union[datetime, str] = "0",
273
+ end: Union[datetime, str] = "now",
274
+ ) -> pd.DataFrame:
275
+ """
276
+ Fetches data from the error TSDB table and returns the error count for each specified endpoint.
277
+
278
+ :param endpoint_ids: A list of model endpoint identifiers.
279
+ :param start: The start time for the query.
280
+ :param end: The end time for the query.
281
+
282
+ :return: A pd.DataFrame containing the columns [error_count, endpoint_id].
283
+ If an endpoint has not raised an error within the specified time range, it will not appear in the result.
284
+ """
285
+
286
+ @abstractmethod
287
+ def get_avg_latency(
288
+ self,
289
+ endpoint_ids: Union[str, list[str]],
290
+ start: Union[datetime, str] = "0",
291
+ end: Union[datetime, str] = "now",
292
+ ) -> pd.DataFrame:
293
+ """
294
+ Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint.
295
+
296
+ :param endpoint_ids: A list of model endpoint identifiers.
297
+ :param start: The start time for the query.
298
+ :param end: The end time for the query.
299
+
300
+ :return: A pd.DataFrame containing the columns [avg_latency, endpoint_id].
301
+ If an endpoint has not been invoked within the specified time range, it will not appear in the result.
302
+ """
303
+
183
304
  @staticmethod
184
305
  def df_to_metrics_values(
185
306
  *,
@@ -289,19 +410,27 @@ class TSDBConnector(ABC):
289
410
  full_name = mlrun.model_monitoring.helpers._compose_full_name(
290
411
  project=project, app=app_name, name=name
291
412
  )
292
- metrics_values.append(
293
- mm_schemas.ModelEndpointMonitoringResultValues(
413
+ try:
414
+ metrics_values.append(
415
+ mm_schemas.ModelEndpointMonitoringResultValues(
416
+ full_name=full_name,
417
+ result_kind=result_kind,
418
+ values=list(
419
+ zip(
420
+ sub_df.index,
421
+ sub_df[mm_schemas.ResultData.RESULT_VALUE],
422
+ sub_df[mm_schemas.ResultData.RESULT_STATUS],
423
+ )
424
+ ), # pyright: ignore[reportArgumentType]
425
+ )
426
+ )
427
+ except pydantic.ValidationError:
428
+ logger.exception(
429
+ "Failed to convert data-frame into `ModelEndpointMonitoringResultValues`",
294
430
  full_name=full_name,
295
- result_kind=result_kind,
296
- values=list(
297
- zip(
298
- sub_df.index,
299
- sub_df[mm_schemas.ResultData.RESULT_VALUE],
300
- sub_df[mm_schemas.ResultData.RESULT_STATUS],
301
- )
302
- ), # pyright: ignore[reportArgumentType]
431
+ sub_df_json=sub_df.to_json(),
303
432
  )
304
- )
433
+ raise
305
434
  del metrics_without_data[full_name]
306
435
 
307
436
  for metric in metrics_without_data.values():
@@ -14,6 +14,7 @@
14
14
 
15
15
  import typing
16
16
  from datetime import datetime
17
+ from typing import Union
17
18
 
18
19
  import pandas as pd
19
20
  import taosws
@@ -156,6 +157,9 @@ class TDEngineConnector(TSDBConnector):
156
157
  after="ProcessBeforeTDEngine",
157
158
  )
158
159
 
160
+ def handle_model_error(self, graph, **kwargs) -> None:
161
+ pass
162
+
159
163
  def delete_tsdb_resources(self):
160
164
  """
161
165
  Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
@@ -246,11 +250,9 @@ class TDEngineConnector(TSDBConnector):
246
250
  raise mlrun.errors.MLRunInvalidArgumentError(
247
251
  f"Failed to query table {table} in database {self.database}, {str(e)}"
248
252
  )
249
- columns = []
250
- for column in query_result.fields:
251
- columns.append(column.name())
252
253
 
253
- return pd.DataFrame(query_result, columns=columns)
254
+ df_columns = [field.name() for field in query_result.fields]
255
+ return pd.DataFrame(query_result, columns=df_columns)
254
256
 
255
257
  def read_metrics_data(
256
258
  self,
@@ -274,13 +276,22 @@ class TDEngineConnector(TSDBConnector):
274
276
  ],
275
277
  ],
276
278
  ]:
279
+ timestamp_column = mm_schemas.WriterEvent.END_INFER_TIME
280
+ columns = [timestamp_column, mm_schemas.WriterEvent.APPLICATION_NAME]
277
281
  if type == "metrics":
278
282
  table = mm_schemas.TDEngineSuperTables.METRICS
279
283
  name = mm_schemas.MetricData.METRIC_NAME
284
+ columns += [name, mm_schemas.MetricData.METRIC_VALUE]
280
285
  df_handler = self.df_to_metrics_values
281
286
  elif type == "results":
282
287
  table = mm_schemas.TDEngineSuperTables.APP_RESULTS
283
288
  name = mm_schemas.ResultData.RESULT_NAME
289
+ columns += [
290
+ name,
291
+ mm_schemas.ResultData.RESULT_VALUE,
292
+ mm_schemas.ResultData.RESULT_STATUS,
293
+ mm_schemas.ResultData.RESULT_KIND,
294
+ ]
284
295
  df_handler = self.df_to_results_values
285
296
  else:
286
297
  raise mlrun.errors.MLRunInvalidArgumentError(
@@ -300,7 +311,8 @@ class TDEngineConnector(TSDBConnector):
300
311
  start=start,
301
312
  end=end,
302
313
  filter_query=filter_query,
303
- timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
314
+ timestamp_column=timestamp_column,
315
+ columns=columns,
304
316
  )
305
317
 
306
318
  df[mm_schemas.WriterEvent.END_INFER_TIME] = pd.to_datetime(
@@ -377,6 +389,54 @@ class TDEngineConnector(TSDBConnector):
377
389
  ), # pyright: ignore[reportArgumentType]
378
390
  )
379
391
 
392
+ def get_last_request(
393
+ self,
394
+ endpoint_ids: Union[str, list[str]],
395
+ start: Union[datetime, str] = "0",
396
+ end: Union[datetime, str] = "now",
397
+ ) -> pd.DataFrame:
398
+ pass
399
+
400
+ def get_drift_status(
401
+ self,
402
+ endpoint_ids: Union[str, list[str]],
403
+ start: Union[datetime, str] = "now-24h",
404
+ end: Union[datetime, str] = "now",
405
+ ) -> pd.DataFrame:
406
+ pass
407
+
408
+ def get_metrics_metadata(
409
+ self,
410
+ endpoint_id: str,
411
+ start: Union[datetime, str] = "0",
412
+ end: Union[datetime, str] = "now",
413
+ ) -> pd.DataFrame:
414
+ pass
415
+
416
+ def get_results_metadata(
417
+ self,
418
+ endpoint_id: str,
419
+ start: Union[datetime, str] = "0",
420
+ end: Union[datetime, str] = "now",
421
+ ) -> pd.DataFrame:
422
+ pass
423
+
424
+ def get_error_count(
425
+ self,
426
+ endpoint_ids: Union[str, list[str]],
427
+ start: Union[datetime, str] = "0",
428
+ end: Union[datetime, str] = "now",
429
+ ) -> pd.DataFrame:
430
+ pass
431
+
432
+ def get_avg_latency(
433
+ self,
434
+ endpoint_ids: Union[str, list[str]],
435
+ start: Union[datetime, str] = "0",
436
+ end: Union[datetime, str] = "now",
437
+ ) -> pd.DataFrame:
438
+ pass
439
+
380
440
  # Note: this function serves as a reference for checking the TSDB for the existence of a metric.
381
441
  #
382
442
  # def read_prediction_metric_for_endpoint_if_exists(
@@ -11,7 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
14
+ from datetime import datetime
15
15
  from typing import Any
16
16
 
17
17
  import mlrun.feature_store.steps
@@ -20,6 +20,7 @@ from mlrun.common.schemas.model_monitoring import (
20
20
  EventKeyMetrics,
21
21
  EventLiveStats,
22
22
  )
23
+ from mlrun.utils import logger
23
24
 
24
25
 
25
26
  def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
@@ -134,3 +135,24 @@ class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
134
135
  else:
135
136
  unpacked[key] = new_event[key]
136
137
  return unpacked if unpacked else None
138
+
139
+
140
+ class ErrorExtractor(mlrun.feature_store.steps.MapClass):
141
+ def __init__(self, **kwargs):
142
+ """
143
+ Prepare the event for insertion into the errors TSDB table.
144
+ """
145
+ super().__init__(**kwargs)
146
+
147
+ def do(self, event):
148
+ error = event.get("error")
149
+ timestamp = datetime.fromisoformat(event.get("when"))
150
+ endpoint_id = event[EventFieldType.ENDPOINT_ID]
151
+ event = {
152
+ EventFieldType.MODEL_ERROR: str(error),
153
+ EventFieldType.ENDPOINT_ID: endpoint_id,
154
+ EventFieldType.TIMESTAMP: timestamp,
155
+ EventFieldType.ERROR_COUNT: 1.0,
156
+ }
157
+ logger.info("Write error to errors TSDB table", event=event)
158
+ return event
@@ -12,7 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from datetime import datetime
15
+ from datetime import datetime, timezone
16
16
  from io import StringIO
17
17
  from typing import Literal, Optional, Union
18
18
 
@@ -33,7 +33,7 @@ _TSDB_RATE = "1/s"
33
33
  _CONTAINER = "users"
34
34
 
35
35
 
36
- def _is_no_schema_error(exc: v3io_frames.ReadError) -> bool:
36
+ def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
37
37
  """
38
38
  In case of a nonexistent TSDB table - a `v3io_frames.ReadError` error is raised.
39
39
  Check if the error message contains the relevant string to verify the cause.
@@ -89,6 +89,19 @@ class V3IOTSDBConnector(TSDBConnector):
89
89
  )
90
90
  self.tables[mm_schemas.V3IOTSDBTables.EVENTS] = events_path
91
91
 
92
+ errors_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
93
+ project=self.project,
94
+ kind=mm_schemas.FileTargetKind.ERRORS,
95
+ )
96
+ (
97
+ _,
98
+ _,
99
+ errors_path,
100
+ ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
101
+ errors_table_full_path
102
+ )
103
+ self.tables[mm_schemas.V3IOTSDBTables.ERRORS] = errors_path
104
+
92
105
  monitoring_application_full_path = (
93
106
  mlrun.mlconf.get_model_monitoring_file_target_path(
94
107
  project=self.project,
@@ -160,7 +173,6 @@ class V3IOTSDBConnector(TSDBConnector):
160
173
  - endpoint_features (Prediction and feature names and values)
161
174
  - custom_metrics (user-defined metrics)
162
175
  """
163
-
164
176
  # Write latency per prediction, labeled by endpoint ID only
165
177
  graph.add_step(
166
178
  "storey.TSDBTarget",
@@ -171,7 +183,10 @@ class V3IOTSDBConnector(TSDBConnector):
171
183
  time_col=mm_schemas.EventFieldType.TIMESTAMP,
172
184
  container=self.container,
173
185
  v3io_frames=self.v3io_framesd,
174
- columns=[mm_schemas.EventFieldType.LATENCY],
186
+ columns=[
187
+ mm_schemas.EventFieldType.LATENCY,
188
+ mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
189
+ ],
175
190
  index_cols=[
176
191
  mm_schemas.EventFieldType.ENDPOINT_ID,
177
192
  ],
@@ -255,6 +270,40 @@ class V3IOTSDBConnector(TSDBConnector):
255
270
  apply_storey_filter()
256
271
  apply_tsdb_target(name="tsdb3", after="FilterNotNone")
257
272
 
273
+ def handle_model_error(
274
+ self,
275
+ graph,
276
+ tsdb_batching_max_events: int = 10,
277
+ tsdb_batching_timeout_secs: int = 60,
278
+ **kwargs,
279
+ ) -> None:
280
+ graph.add_step(
281
+ "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ErrorExtractor",
282
+ name="error_extractor",
283
+ after="ForwardError",
284
+ )
285
+
286
+ graph.add_step(
287
+ "storey.TSDBTarget",
288
+ name="tsdb_error",
289
+ after="error_extractor",
290
+ path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.ERRORS]}",
291
+ rate="1/s",
292
+ time_col=mm_schemas.EventFieldType.TIMESTAMP,
293
+ container=self.container,
294
+ v3io_frames=self.v3io_framesd,
295
+ columns=[
296
+ mm_schemas.EventFieldType.MODEL_ERROR,
297
+ mm_schemas.EventFieldType.ERROR_COUNT,
298
+ ],
299
+ index_cols=[
300
+ mm_schemas.EventFieldType.ENDPOINT_ID,
301
+ ],
302
+ max_events=tsdb_batching_max_events,
303
+ flush_after_seconds=tsdb_batching_timeout_secs,
304
+ key=mm_schemas.EventFieldType.ENDPOINT_ID,
305
+ )
306
+
258
307
  def write_application_event(
259
308
  self,
260
309
  event: dict,
@@ -437,7 +486,7 @@ class V3IOTSDBConnector(TSDBConnector):
437
486
  step=sliding_window_step,
438
487
  **kwargs,
439
488
  )
440
- except v3io_frames.ReadError as err:
489
+ except v3io_frames.Error as err:
441
490
  if _is_no_schema_error(err):
442
491
  return pd.DataFrame()
443
492
  else:
@@ -504,10 +553,16 @@ class V3IOTSDBConnector(TSDBConnector):
504
553
  if type == "metrics":
505
554
  table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
506
555
  name = mm_schemas.MetricData.METRIC_NAME
556
+ columns = [mm_schemas.MetricData.METRIC_VALUE]
507
557
  df_handler = self.df_to_metrics_values
508
558
  elif type == "results":
509
559
  table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
510
560
  name = mm_schemas.ResultData.RESULT_NAME
561
+ columns = [
562
+ mm_schemas.ResultData.RESULT_VALUE,
563
+ mm_schemas.ResultData.RESULT_STATUS,
564
+ mm_schemas.ResultData.RESULT_KIND,
565
+ ]
511
566
  df_handler = self.df_to_results_values
512
567
  else:
513
568
  raise ValueError(f"Invalid {type = }")
@@ -517,6 +572,7 @@ class V3IOTSDBConnector(TSDBConnector):
517
572
  metric_and_app_names=[(metric.app, metric.name) for metric in metrics],
518
573
  table_path=table_path,
519
574
  name=name,
575
+ columns=columns,
520
576
  )
521
577
 
522
578
  logger.debug("Querying V3IO TSDB", query=query)
@@ -627,33 +683,153 @@ class V3IOTSDBConnector(TSDBConnector):
627
683
  ), # pyright: ignore[reportArgumentType]
628
684
  )
629
685
 
630
- # Note: this function serves as a reference for checking the TSDB for the existence of a metric.
631
- #
632
- # def read_prediction_metric_for_endpoint_if_exists(
633
- # self, endpoint_id: str
634
- # ) -> Optional[mm_schemas.ModelEndpointMonitoringMetric]:
635
- # """
636
- # Read the count of the latency column in the predictions table for the given endpoint_id.
637
- # We just want to check if there is any data for this endpoint_id.
638
- # """
639
- # query = self._get_sql_query(
640
- # endpoint_id=endpoint_id,
641
- # table_path=self.tables[mm_schemas.FileTargetKind.PREDICTIONS],
642
- # columns=[f"count({mm_schemas.EventFieldType.LATENCY})"],
643
- # )
644
- # try:
645
- # logger.debug("Checking TSDB", project=self.project, query=query)
646
- # df: pd.DataFrame = self._frames_client.read(
647
- # backend=_TSDB_BE, query=query, start="0", end="now"
648
- # )
649
- # except v3io_frames.ReadError as err:
650
- # if _is_no_schema_error(err):
651
- # logger.debug(
652
- # "No predictions yet", project=self.project, endpoint_id=endpoint_id
653
- # )
654
- # return
655
- # else:
656
- # raise
657
- #
658
- # if not df.empty:
659
- # return get_invocations_metric(self.project)
686
+ def get_last_request(
687
+ self,
688
+ endpoint_ids: Union[str, list[str]],
689
+ start: Union[datetime, str] = "0",
690
+ end: Union[datetime, str] = "now",
691
+ ) -> pd.DataFrame:
692
+ endpoint_ids = (
693
+ endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
694
+ )
695
+ df = self._get_records(
696
+ table=mm_schemas.FileTargetKind.PREDICTIONS,
697
+ start=start,
698
+ end=end,
699
+ filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
700
+ agg_funcs=["last"],
701
+ )
702
+ if not df.empty:
703
+ df.rename(
704
+ columns={
705
+ f"last({mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP})": mm_schemas.EventFieldType.LAST_REQUEST,
706
+ f"last({mm_schemas.EventFieldType.LATENCY})": f"last_{mm_schemas.EventFieldType.LATENCY}",
707
+ },
708
+ inplace=True,
709
+ )
710
+ df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
711
+ mm_schemas.EventFieldType.LAST_REQUEST
712
+ ].map(
713
+ lambda last_request: datetime.fromtimestamp(
714
+ last_request, tz=timezone.utc
715
+ )
716
+ )
717
+
718
+ return df.reset_index(drop=True)
719
+
720
+ def get_drift_status(
721
+ self,
722
+ endpoint_ids: Union[str, list[str]],
723
+ start: Union[datetime, str] = "now-24h",
724
+ end: Union[datetime, str] = "now",
725
+ ) -> pd.DataFrame:
726
+ endpoint_ids = (
727
+ endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
728
+ )
729
+ df = self._get_records(
730
+ table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
731
+ start=start,
732
+ end=end,
733
+ columns=[mm_schemas.ResultData.RESULT_STATUS],
734
+ filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
735
+ agg_funcs=["max"],
736
+ group_by="endpoint_id",
737
+ )
738
+ if not df.empty:
739
+ df.columns = [
740
+ col[len("max(") : -1] if "max(" in col else col for col in df.columns
741
+ ]
742
+ return df.reset_index(drop=True)
743
+
744
+ def get_metrics_metadata(
745
+ self,
746
+ endpoint_id: str,
747
+ start: Union[datetime, str] = "0",
748
+ end: Union[datetime, str] = "now",
749
+ ) -> pd.DataFrame:
750
+ df = self._get_records(
751
+ table=mm_schemas.V3IOTSDBTables.METRICS,
752
+ start=start,
753
+ end=end,
754
+ columns=[mm_schemas.MetricData.METRIC_VALUE],
755
+ filter_query=f"endpoint_id=='{endpoint_id}'",
756
+ agg_funcs=["last"],
757
+ )
758
+ if not df.empty:
759
+ df.drop(
760
+ columns=[f"last({mm_schemas.MetricData.METRIC_VALUE})"], inplace=True
761
+ )
762
+ return df.reset_index(drop=True)
763
+
764
+ def get_results_metadata(
765
+ self,
766
+ endpoint_id: str,
767
+ start: Union[datetime, str] = "0",
768
+ end: Union[datetime, str] = "now",
769
+ ) -> pd.DataFrame:
770
+ df = self._get_records(
771
+ table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
772
+ start=start,
773
+ end=end,
774
+ columns=[
775
+ mm_schemas.ResultData.RESULT_KIND,
776
+ ],
777
+ filter_query=f"endpoint_id=='{endpoint_id}'",
778
+ agg_funcs=["last"],
779
+ )
780
+ if not df.empty:
781
+ df.rename(
782
+ columns={
783
+ f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND
784
+ },
785
+ inplace=True,
786
+ )
787
+ return df.reset_index(drop=True)
788
+
789
+ def get_error_count(
790
+ self,
791
+ endpoint_ids: Union[str, list[str]],
792
+ start: Union[datetime, str] = "0",
793
+ end: Union[datetime, str] = "now",
794
+ ) -> pd.DataFrame:
795
+ endpoint_ids = (
796
+ endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
797
+ )
798
+ df = self._get_records(
799
+ table=mm_schemas.FileTargetKind.ERRORS,
800
+ start=start,
801
+ end=end,
802
+ columns=[mm_schemas.EventFieldType.ERROR_COUNT],
803
+ filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
804
+ agg_funcs=["count"],
805
+ )
806
+ if not df.empty:
807
+ df.rename(
808
+ columns={
809
+ f"count({mm_schemas.EventFieldType.ERROR_COUNT})": mm_schemas.EventFieldType.ERROR_COUNT
810
+ },
811
+ inplace=True,
812
+ )
813
+ df.dropna(inplace=True)
814
+ return df.reset_index(drop=True)
815
+
816
+ def get_avg_latency(
817
+ self,
818
+ endpoint_ids: Union[str, list[str]],
819
+ start: Union[datetime, str] = "0",
820
+ end: Union[datetime, str] = "now",
821
+ ) -> pd.DataFrame:
822
+ endpoint_ids = (
823
+ endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
824
+ )
825
+ df = self._get_records(
826
+ table=mm_schemas.FileTargetKind.PREDICTIONS,
827
+ start=start,
828
+ end=end,
829
+ columns=[mm_schemas.EventFieldType.LATENCY],
830
+ filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
831
+ agg_funcs=["avg"],
832
+ )
833
+ if not df.empty:
834
+ df.dropna(inplace=True)
835
+ return df.reset_index(drop=True)
@@ -45,8 +45,7 @@ class _BatchDict(typing.TypedDict):
45
45
 
46
46
 
47
47
  def get_stream_path(
48
- project: str = None,
49
- function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
48
+ project: str, function_name: str = mm_constants.MonitoringFunctionNames.STREAM
50
49
  ) -> str:
51
50
  """
52
51
  Get stream path from the project secret. If wasn't set, take it from the system configurations