mlrun 1.8.0rc5__py3-none-any.whl → 1.8.0rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (74)
  1. mlrun/__init__.py +1 -0
  2. mlrun/artifacts/__init__.py +1 -1
  3. mlrun/artifacts/base.py +21 -1
  4. mlrun/artifacts/document.py +62 -39
  5. mlrun/artifacts/manager.py +12 -5
  6. mlrun/common/constants.py +1 -0
  7. mlrun/common/model_monitoring/__init__.py +0 -2
  8. mlrun/common/model_monitoring/helpers.py +0 -28
  9. mlrun/common/schemas/__init__.py +2 -4
  10. mlrun/common/schemas/alert.py +77 -1
  11. mlrun/common/schemas/client_spec.py +0 -1
  12. mlrun/common/schemas/model_monitoring/__init__.py +0 -6
  13. mlrun/common/schemas/model_monitoring/constants.py +11 -9
  14. mlrun/common/schemas/model_monitoring/model_endpoints.py +77 -149
  15. mlrun/common/schemas/notification.py +6 -0
  16. mlrun/common/schemas/project.py +3 -0
  17. mlrun/config.py +2 -3
  18. mlrun/datastore/datastore_profile.py +57 -17
  19. mlrun/datastore/sources.py +1 -2
  20. mlrun/datastore/store_resources.py +7 -2
  21. mlrun/datastore/vectorstore.py +99 -62
  22. mlrun/db/base.py +34 -20
  23. mlrun/db/httpdb.py +249 -163
  24. mlrun/db/nopdb.py +40 -17
  25. mlrun/execution.py +14 -7
  26. mlrun/feature_store/api.py +1 -0
  27. mlrun/model.py +3 -0
  28. mlrun/model_monitoring/__init__.py +3 -2
  29. mlrun/model_monitoring/api.py +64 -53
  30. mlrun/model_monitoring/applications/_application_steps.py +3 -1
  31. mlrun/model_monitoring/applications/base.py +115 -15
  32. mlrun/model_monitoring/applications/context.py +42 -24
  33. mlrun/model_monitoring/applications/histogram_data_drift.py +1 -1
  34. mlrun/model_monitoring/controller.py +43 -37
  35. mlrun/model_monitoring/db/__init__.py +0 -2
  36. mlrun/model_monitoring/db/tsdb/base.py +2 -1
  37. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +2 -1
  38. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +43 -0
  39. mlrun/model_monitoring/helpers.py +78 -66
  40. mlrun/model_monitoring/stream_processing.py +83 -270
  41. mlrun/model_monitoring/writer.py +1 -10
  42. mlrun/projects/pipelines.py +37 -1
  43. mlrun/projects/project.py +173 -70
  44. mlrun/run.py +40 -0
  45. mlrun/runtimes/nuclio/function.py +7 -6
  46. mlrun/runtimes/nuclio/serving.py +9 -4
  47. mlrun/serving/routers.py +158 -145
  48. mlrun/serving/server.py +6 -0
  49. mlrun/serving/states.py +21 -7
  50. mlrun/serving/v2_serving.py +94 -68
  51. mlrun/utils/helpers.py +23 -33
  52. mlrun/utils/notifications/notification/mail.py +17 -6
  53. mlrun/utils/notifications/notification_pusher.py +9 -5
  54. mlrun/utils/regex.py +8 -1
  55. mlrun/utils/version/version.json +2 -2
  56. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc9.dist-info}/METADATA +2 -2
  57. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc9.dist-info}/RECORD +61 -74
  58. mlrun/common/schemas/model_monitoring/model_endpoint_v2.py +0 -149
  59. mlrun/model_monitoring/db/stores/__init__.py +0 -136
  60. mlrun/model_monitoring/db/stores/base/__init__.py +0 -15
  61. mlrun/model_monitoring/db/stores/base/store.py +0 -154
  62. mlrun/model_monitoring/db/stores/sqldb/__init__.py +0 -13
  63. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -46
  64. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -93
  65. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -47
  66. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -25
  67. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -408
  68. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +0 -13
  69. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -464
  70. mlrun/model_monitoring/model_endpoint.py +0 -120
  71. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc9.dist-info}/LICENSE +0 -0
  72. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc9.dist-info}/WHEEL +0 -0
  73. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc9.dist-info}/entry_points.txt +0 -0
  74. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc9.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import json
16
15
  import socket
17
16
  from typing import Any, Optional, Protocol, cast
18
17
 
@@ -28,12 +27,11 @@ import mlrun.features
28
27
  import mlrun.serving
29
28
  import mlrun.utils
30
29
  from mlrun.artifacts import Artifact, DatasetArtifact, ModelArtifact, get_model
31
- from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_hist
30
+ from mlrun.common.model_monitoring.helpers import FeatureStats
31
+ from mlrun.common.schemas import ModelEndpoint
32
32
  from mlrun.model_monitoring.helpers import (
33
33
  calculate_inputs_statistics,
34
- get_endpoint_record,
35
34
  )
36
- from mlrun.model_monitoring.model_endpoint import ModelEndpoint
37
35
 
38
36
 
39
37
  class _ArtifactsLogger(Protocol):
@@ -64,6 +62,7 @@ class MonitoringApplicationContext:
64
62
  :param end_infer_time: (pd.Timestamp) End time of the monitoring schedule.
65
63
  :param latest_request: (pd.Timestamp) Timestamp of the latest request on this endpoint_id.
66
64
  :param endpoint_id: (str) ID of the monitored model endpoint
65
+ :param endpoint_name: (str) Name of the monitored model endpoint
67
66
  :param output_stream_uri: (str) URI of the output stream for results
68
67
  :param model_endpoint: (ModelEndpoint) The model endpoint object.
69
68
  :param feature_names: (list[str]) List of models feature names.
@@ -82,10 +81,14 @@ class MonitoringApplicationContext:
82
81
  model_endpoint_dict: Optional[dict[str, ModelEndpoint]] = None,
83
82
  logger: Optional[mlrun.utils.Logger] = None,
84
83
  graph_context: Optional[mlrun.serving.GraphContext] = None,
84
+ context: Optional["mlrun.MLClientCtx"] = None,
85
85
  artifacts_logger: Optional[_ArtifactsLogger] = None,
86
+ sample_df: Optional[pd.DataFrame] = None,
87
+ feature_stats: Optional[FeatureStats] = None,
86
88
  ) -> None:
87
89
  """
88
- Initialize a :code:`MonitoringApplicationContext` object.
90
+ The :code:`__init__` method initializes a :code:`MonitoringApplicationContext` object
91
+ and has the following attributes.
89
92
  Note: this object should not be instantiated manually.
90
93
 
91
94
  :param application_name: The application name.
@@ -93,17 +96,27 @@ class MonitoringApplicationContext:
93
96
  :param model_endpoint_dict: Optional - dictionary of model endpoints.
94
97
  :param logger: Optional - MLRun logger instance.
95
98
  :param graph_context: Optional - GraphContext instance.
99
+ :param context: Optional - MLClientCtx instance.
96
100
  :param artifacts_logger: Optional - an object that can log artifacts,
97
101
  typically :py:class:`~mlrun.projects.MlrunProject` or
98
102
  :py:class:`~mlrun.execution.MLClientCtx`.
103
+ :param sample_df: Optional - pandas data-frame as the current dataset.
104
+ When set, it replaces the data read from the offline source.
105
+ :param feature_stats: Optional - statistics dictionary of the reference data.
106
+ When set, it overrides the model endpoint's feature stats.
99
107
  """
100
108
  self.application_name = application_name
101
109
 
102
110
  if graph_context:
103
111
  self.project_name = graph_context.project
104
112
  self.project = mlrun.load_project(url=self.project_name)
105
- else:
106
- self.project = cast("mlrun.MlrunProject", mlrun.get_current_project())
113
+ elif context:
114
+ potential_project = context.get_project_object()
115
+ if not potential_project:
116
+ raise mlrun.errors.MLRunValueError(
117
+ "Could not load project from context"
118
+ )
119
+ self.project = potential_project
107
120
  self.project_name = self.project.name
108
121
 
109
122
  self._artifacts_logger: _ArtifactsLogger = artifacts_logger or self.project
@@ -134,29 +147,38 @@ class MonitoringApplicationContext:
134
147
  self.endpoint_id = cast(
135
148
  str, event.get(mm_constants.ApplicationEvent.ENDPOINT_ID)
136
149
  )
150
+ self.endpoint_name = cast(
151
+ str, event.get(mm_constants.ApplicationEvent.ENDPOINT_NAME)
152
+ )
137
153
  self.output_stream_uri = cast(
138
154
  str, event.get(mm_constants.ApplicationEvent.OUTPUT_STREAM_URI)
139
155
  )
140
156
 
141
- self._feature_stats: Optional[FeatureStats] = None
157
+ self._feature_stats: Optional[FeatureStats] = feature_stats
142
158
  self._sample_df_stats: Optional[FeatureStats] = None
143
159
 
144
160
  # Default labels for the artifacts
145
161
  self._default_labels = self._get_default_labels()
146
162
 
147
163
  # Persistent data - fetched when needed
148
- self._sample_df: Optional[pd.DataFrame] = None
164
+ self._sample_df: Optional[pd.DataFrame] = sample_df
149
165
  self._model_endpoint: Optional[ModelEndpoint] = (
150
166
  model_endpoint_dict.get(self.endpoint_id) if model_endpoint_dict else None
151
167
  )
152
168
 
153
169
  def _get_default_labels(self) -> dict[str, str]:
154
- return {
170
+ labels = {
155
171
  mlrun_constants.MLRunInternalLabels.runner_pod: socket.gethostname(),
156
172
  mlrun_constants.MLRunInternalLabels.producer_type: "model-monitoring-app",
157
173
  mlrun_constants.MLRunInternalLabels.app_name: self.application_name,
158
- mlrun_constants.MLRunInternalLabels.endpoint_id: self.endpoint_id,
159
174
  }
175
+ for key, value in [
176
+ (mlrun_constants.MLRunInternalLabels.endpoint_id, self.endpoint_id),
177
+ (mlrun_constants.MLRunInternalLabels.endpoint_name, self.endpoint_name),
178
+ ]:
179
+ if value:
180
+ labels[key] = value
181
+ return labels
160
182
 
161
183
  def _add_default_labels(self, labels: Optional[dict[str, str]]) -> dict[str, str]:
162
184
  """Add the default labels to logged artifacts labels"""
@@ -166,7 +188,7 @@ class MonitoringApplicationContext:
166
188
  def sample_df(self) -> pd.DataFrame:
167
189
  if self._sample_df is None:
168
190
  feature_set = fstore.get_feature_set(
169
- self.model_endpoint.status.monitoring_feature_set_uri
191
+ self.model_endpoint.spec.monitoring_feature_set_uri
170
192
  )
171
193
  features = [f"{feature_set.metadata.name}.*"]
172
194
  vector = fstore.FeatureVector(
@@ -188,16 +210,18 @@ class MonitoringApplicationContext:
188
210
  @property
189
211
  def model_endpoint(self) -> ModelEndpoint:
190
212
  if not self._model_endpoint:
191
- self._model_endpoint = ModelEndpoint.from_flat_dict(
192
- get_endpoint_record(self.project_name, self.endpoint_id)
213
+ self._model_endpoint = mlrun.db.get_run_db().get_model_endpoint(
214
+ name=self.endpoint_name,
215
+ project=self.project_name,
216
+ endpoint_id=self.endpoint_id,
217
+ feature_analysis=True,
193
218
  )
194
219
  return self._model_endpoint
195
220
 
196
221
  @property
197
222
  def feature_stats(self) -> FeatureStats:
198
223
  if not self._feature_stats:
199
- self._feature_stats = json.loads(self.model_endpoint.status.feature_stats)
200
- pad_features_hist(self._feature_stats)
224
+ self._feature_stats = self.model_endpoint.spec.feature_stats
201
225
  return self._feature_stats
202
226
 
203
227
  @property
@@ -212,18 +236,12 @@ class MonitoringApplicationContext:
212
236
  @property
213
237
  def feature_names(self) -> list[str]:
214
238
  """The feature names of the model"""
215
- feature_names = self.model_endpoint.spec.feature_names
216
- return (
217
- feature_names
218
- if isinstance(feature_names, list)
219
- else json.loads(feature_names)
220
- )
239
+ return self.model_endpoint.spec.feature_names
221
240
 
222
241
  @property
223
242
  def label_names(self) -> list[str]:
224
243
  """The label names of the model"""
225
- label_names = self.model_endpoint.spec.label_names
226
- return label_names if isinstance(label_names, list) else json.loads(label_names)
244
+ return self.model_endpoint.spec.label_names
227
245
 
228
246
  @property
229
247
  def model(self) -> tuple[str, ModelArtifact, dict]:
@@ -227,7 +227,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
227
227
  :param metrics: the calculated metrics
228
228
  :param metrics_per_feature: metric calculated per feature
229
229
  :param monitoring_context: context object for current monitoring application
230
- :return: list of mm_results._ModelMonitoringApplicationStats for histogram data drift application
230
+ :returns: list of mm_results._ModelMonitoringApplicationStats for histogram data drift application
231
231
  """
232
232
  stats = []
233
233
  for stats_type in HistogramDataDriftApplication._STATS_TYPES:
@@ -19,7 +19,7 @@ import os
19
19
  from collections.abc import Iterator
20
20
  from contextlib import AbstractContextManager
21
21
  from types import TracebackType
22
- from typing import Any, NamedTuple, Optional, cast
22
+ from typing import NamedTuple, Optional, cast
23
23
 
24
24
  import nuclio_sdk
25
25
 
@@ -27,6 +27,7 @@ import mlrun
27
27
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
28
28
  import mlrun.feature_store as fstore
29
29
  import mlrun.model_monitoring
30
+ from mlrun.common.schemas import EndpointType
30
31
  from mlrun.datastore import get_stream_pusher
31
32
  from mlrun.errors import err_to_str
32
33
  from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
@@ -65,7 +66,7 @@ class _BatchWindow:
65
66
  self._start = self._get_last_analyzed()
66
67
 
67
68
  def _get_saved_last_analyzed(self) -> Optional[int]:
68
- return self._db.get_application_time(self._application)
69
+ return cast(int, self._db.get_application_time(self._application))
69
70
 
70
71
  def _update_last_analyzed(self, last_analyzed: int) -> None:
71
72
  self._db.update_application_time(
@@ -161,18 +162,20 @@ class _BatchWindowGenerator(AbstractContextManager):
161
162
  )
162
163
 
163
164
  @classmethod
164
- def _get_last_updated_time(cls, last_request: str, has_stream: bool) -> int:
165
+ def _get_last_updated_time(
166
+ cls, last_request: datetime.datetime, not_batch_endpoint: bool
167
+ ) -> int:
165
168
  """
166
169
  Get the last updated time of a model endpoint.
167
170
  """
168
171
  last_updated = int(
169
- cls._date_string2timestamp(last_request)
172
+ last_request.timestamp()
170
173
  - cast(
171
174
  float,
172
175
  mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
173
176
  )
174
177
  )
175
- if not has_stream:
178
+ if not not_batch_endpoint:
176
179
  # If the endpoint does not have a stream, `last_updated` should be
177
180
  # the minimum between the current time and the last updated time.
178
181
  # This compensates for the bumping mechanism - see
@@ -183,17 +186,13 @@ class _BatchWindowGenerator(AbstractContextManager):
183
186
  )
184
187
  return last_updated
185
188
 
186
- @staticmethod
187
- def _date_string2timestamp(date_string: str) -> int:
188
- return int(datetime.datetime.fromisoformat(date_string).timestamp())
189
-
190
189
  def get_intervals(
191
190
  self,
192
191
  *,
193
192
  application: str,
194
- first_request: str,
195
- last_request: str,
196
- has_stream: bool,
193
+ first_request: datetime.datetime,
194
+ last_request: datetime.datetime,
195
+ not_batch_endpoint: bool,
197
196
  ) -> Iterator[_Interval]:
198
197
  """
199
198
  Get the batch window for a specific endpoint and application.
@@ -204,8 +203,8 @@ class _BatchWindowGenerator(AbstractContextManager):
204
203
  schedules_file=self._schedules_file,
205
204
  application=application,
206
205
  timedelta_seconds=self._timedelta,
207
- last_updated=self._get_last_updated_time(last_request, has_stream),
208
- first_request=self._date_string2timestamp(first_request),
206
+ last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
207
+ first_request=int(first_request.timestamp()),
209
208
  )
210
209
  yield from batch_window.get_intervals()
211
210
 
@@ -235,8 +234,6 @@ class MonitoringApplicationController:
235
234
 
236
235
  logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
237
236
 
238
- self.db = mlrun.model_monitoring.get_store_object(project=self.project)
239
-
240
237
  self._window_length = _get_window_length()
241
238
 
242
239
  self.model_monitoring_access_key = self._get_model_monitoring_access_key()
@@ -253,19 +250,16 @@ class MonitoringApplicationController:
253
250
  return access_key
254
251
 
255
252
  @staticmethod
256
- def _should_monitor_endpoint(endpoint: dict[str, Any]) -> bool:
253
+ def _should_monitor_endpoint(endpoint: mlrun.common.schemas.ModelEndpoint) -> bool:
257
254
  return (
258
- # Is the model endpoint active?
259
- endpoint[mm_constants.EventFieldType.ACTIVE]
260
255
  # Is the model endpoint monitored?
261
- and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
262
- == mm_constants.ModelMonitoringMode.enabled
256
+ endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
263
257
  # Was the model endpoint called? I.e., are the first and last requests nonempty?
264
- and endpoint[mm_constants.EventFieldType.FIRST_REQUEST]
265
- and endpoint[mm_constants.EventFieldType.LAST_REQUEST]
258
+ and endpoint.status.first_request
259
+ and endpoint.status.last_request
266
260
  # Is the model endpoint not a router endpoint? Router endpoint has no feature stats
267
- and int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
268
- != mm_constants.EndpointType.ROUTER
261
+ and endpoint.metadata.endpoint_type.value
262
+ != mm_constants.EndpointType.ROUTER.value
269
263
  )
270
264
 
271
265
  def run(self) -> None:
@@ -281,7 +275,10 @@ class MonitoringApplicationController:
281
275
  logger.info("Start running monitoring controller")
282
276
  try:
283
277
  applications_names = []
284
- endpoints = self.db.list_model_endpoints(include_stats=True)
278
+ endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
279
+ project=self.project, tsdb_metrics=True
280
+ )
281
+ endpoints = endpoints_list.endpoints
285
282
  if not endpoints:
286
283
  logger.info("No model endpoints found", project=self.project)
287
284
  return
@@ -333,12 +330,19 @@ class MonitoringApplicationController:
333
330
  model_monitoring_access_key=self.model_monitoring_access_key,
334
331
  storage_options=self.storage_options,
335
332
  )
333
+ else:
334
+ logger.debug(
335
+ "Skipping endpoint, not ready or not suitable for monitoring",
336
+ endpoint_id=endpoint.metadata.uid,
337
+ endpoint_name=endpoint.metadata.name,
338
+ )
339
+ logger.info("Finished running monitoring controller")
336
340
 
337
341
  @classmethod
338
342
  def model_endpoint_process(
339
343
  cls,
340
344
  project: str,
341
- endpoint: dict,
345
+ endpoint: mlrun.common.schemas.ModelEndpoint,
342
346
  applications_names: list[str],
343
347
  window_length: int,
344
348
  model_monitoring_access_key: str,
@@ -356,11 +360,11 @@ class MonitoringApplicationController:
356
360
  :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
357
361
  :param storage_options: (dict) Storage options for reading the infer parquet files.
358
362
  """
359
- endpoint_id = endpoint[mm_constants.EventFieldType.UID]
360
- has_stream = endpoint[mm_constants.EventFieldType.STREAM_PATH] != ""
361
- m_fs = fstore.get_feature_set(
362
- endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
363
+ endpoint_id = endpoint.metadata.uid
364
+ not_batch_endpoint = not (
365
+ endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
363
366
  )
367
+ m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
364
368
  try:
365
369
  with _BatchWindowGenerator(
366
370
  project=project, endpoint_id=endpoint_id, window_length=window_length
@@ -371,11 +375,9 @@ class MonitoringApplicationController:
371
375
  end_infer_time,
372
376
  ) in batch_window_generator.get_intervals(
373
377
  application=application,
374
- first_request=endpoint[
375
- mm_constants.EventFieldType.FIRST_REQUEST
376
- ],
377
- last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
378
- has_stream=has_stream,
378
+ first_request=endpoint.status.first_request,
379
+ last_request=endpoint.status.last_request,
380
+ not_batch_endpoint=not_batch_endpoint,
379
381
  ):
380
382
  df = m_fs.to_dataframe(
381
383
  start_time=start_infer_time,
@@ -401,15 +403,17 @@ class MonitoringApplicationController:
401
403
  start_infer_time=start_infer_time,
402
404
  end_infer_time=end_infer_time,
403
405
  endpoint_id=endpoint_id,
406
+ endpoint_name=endpoint.metadata.name,
404
407
  project=project,
405
408
  applications_names=[application],
406
409
  model_monitoring_access_key=model_monitoring_access_key,
407
410
  )
411
+ logger.info("Finished processing endpoint", endpoint_id=endpoint_id)
408
412
 
409
413
  except Exception:
410
414
  logger.exception(
411
415
  "Encountered an exception",
412
- endpoint_id=endpoint[mm_constants.EventFieldType.UID],
416
+ endpoint_id=endpoint.metadata.uid,
413
417
  )
414
418
 
415
419
  @staticmethod
@@ -417,6 +421,7 @@ class MonitoringApplicationController:
417
421
  start_infer_time: datetime.datetime,
418
422
  end_infer_time: datetime.datetime,
419
423
  endpoint_id: str,
424
+ endpoint_name: str,
420
425
  project: str,
421
426
  applications_names: list[str],
422
427
  model_monitoring_access_key: str,
@@ -440,6 +445,7 @@ class MonitoringApplicationController:
440
445
  sep=" ", timespec="microseconds"
441
446
  ),
442
447
  mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
448
+ mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
443
449
  mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
444
450
  project=project,
445
451
  function_name=mm_constants.MonitoringFunctionNames.WRITER,
@@ -12,7 +12,5 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from .stores import ObjectStoreFactory, get_store_object
16
- from .stores.base import StoreBase
17
15
  from .tsdb import get_tsdb_connector
18
16
  from .tsdb.base import TSDBConnector
@@ -47,7 +47,7 @@ class TSDBConnector(ABC):
47
47
  self.project = project
48
48
 
49
49
  @abstractmethod
50
- def apply_monitoring_stream_steps(self, graph) -> None:
50
+ def apply_monitoring_stream_steps(self, graph, **kwargs) -> None:
51
51
  """
52
52
  Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
53
53
  different key metric dictionaries. This data is being used by the monitoring dashboards in
@@ -294,6 +294,7 @@ class TSDBConnector(ABC):
294
294
  ) -> pd.DataFrame:
295
295
  """
296
296
  Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint
297
+ in the provided time range, which by default is the last 24 hours.
297
298
 
298
299
  :param endpoint_ids: A list of model endpoint identifiers.
299
300
  :param start: The start time for the query.
@@ -164,7 +164,7 @@ class TDEngineConnector(TSDBConnector):
164
164
  def _convert_to_datetime(val: typing.Union[str, datetime]) -> datetime:
165
165
  return datetime.fromisoformat(val) if isinstance(val, str) else val
166
166
 
167
- def apply_monitoring_stream_steps(self, graph):
167
+ def apply_monitoring_stream_steps(self, graph, **kwarg):
168
168
  """
169
169
  Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
170
170
  different key metric dictionaries. This data is being used by the monitoring dashboards in
@@ -701,6 +701,7 @@ class TDEngineConnector(TSDBConnector):
701
701
  endpoint_ids = (
702
702
  endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
703
703
  )
704
+ start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
704
705
  start, end = self._get_start_end(start, end)
705
706
  df = self._get_records(
706
707
  table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
@@ -168,6 +168,9 @@ class V3IOTSDBConnector(TSDBConnector):
168
168
  tsdb_batching_max_events: int = 1000,
169
169
  tsdb_batching_timeout_secs: int = 30,
170
170
  sample_window: int = 10,
171
+ aggregate_windows: Optional[list[str]] = None,
172
+ aggregate_period: str = "1m",
173
+ **kwarg,
171
174
  ):
172
175
  """
173
176
  Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
@@ -178,7 +181,40 @@ class V3IOTSDBConnector(TSDBConnector):
178
181
  - endpoint_features (Prediction and feature names and values)
179
182
  - custom_metrics (user-defined metrics)
180
183
  """
184
+ aggregate_windows = aggregate_windows or ["5m", "1h"]
181
185
 
186
+ # Calculate number of predictions and average latency
187
+ def apply_storey_aggregations():
188
+ # Calculate number of predictions for each window (5 min and 1 hour by default)
189
+ graph.add_step(
190
+ class_name="storey.AggregateByKey",
191
+ aggregates=[
192
+ {
193
+ "name": EventFieldType.LATENCY,
194
+ "column": EventFieldType.LATENCY,
195
+ "operations": ["count", "avg"],
196
+ "windows": aggregate_windows,
197
+ "period": aggregate_period,
198
+ }
199
+ ],
200
+ name=EventFieldType.LATENCY,
201
+ after="MapFeatureNames",
202
+ step_name="Aggregates",
203
+ table=".",
204
+ key_field=EventFieldType.ENDPOINT_ID,
205
+ )
206
+ # Calculate average latency time for each window (5 min and 1 hour by default)
207
+ graph.add_step(
208
+ class_name="storey.Rename",
209
+ mapping={
210
+ "latency_count_5m": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_5M,
211
+ "latency_count_1h": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_1H,
212
+ },
213
+ name="Rename",
214
+ after=EventFieldType.LATENCY,
215
+ )
216
+
217
+ apply_storey_aggregations()
182
218
  # Write latency per prediction, labeled by endpoint ID only
183
219
  graph.add_step(
184
220
  "storey.TSDBTarget",
@@ -853,6 +889,7 @@ class V3IOTSDBConnector(TSDBConnector):
853
889
  endpoint_ids = (
854
890
  endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
855
891
  )
892
+ start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
856
893
  start, end = self._get_start_end(start, end)
857
894
  df = self._get_records(
858
895
  table=mm_schemas.FileTargetKind.PREDICTIONS,
@@ -864,4 +901,10 @@ class V3IOTSDBConnector(TSDBConnector):
864
901
  )
865
902
  if not df.empty:
866
903
  df.dropna(inplace=True)
904
+ df.rename(
905
+ columns={
906
+ f"avg({mm_schemas.EventFieldType.LATENCY})": f"avg_{mm_schemas.EventFieldType.LATENCY}"
907
+ },
908
+ inplace=True,
909
+ )
867
910
  return df.reset_index(drop=True)