mlrun 1.8.0rc5__py3-none-any.whl → 1.8.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mlrun might be problematic.

Files changed (60)
  1. mlrun/artifacts/__init__.py +1 -1
  2. mlrun/artifacts/base.py +12 -1
  3. mlrun/artifacts/document.py +59 -38
  4. mlrun/common/model_monitoring/__init__.py +0 -2
  5. mlrun/common/model_monitoring/helpers.py +0 -28
  6. mlrun/common/schemas/__init__.py +1 -4
  7. mlrun/common/schemas/client_spec.py +0 -1
  8. mlrun/common/schemas/model_monitoring/__init__.py +0 -6
  9. mlrun/common/schemas/model_monitoring/constants.py +11 -9
  10. mlrun/common/schemas/model_monitoring/model_endpoints.py +77 -149
  11. mlrun/common/schemas/notification.py +6 -0
  12. mlrun/config.py +0 -2
  13. mlrun/datastore/datastore_profile.py +57 -17
  14. mlrun/datastore/vectorstore.py +67 -59
  15. mlrun/db/base.py +22 -18
  16. mlrun/db/httpdb.py +116 -148
  17. mlrun/db/nopdb.py +33 -17
  18. mlrun/execution.py +11 -4
  19. mlrun/model.py +3 -0
  20. mlrun/model_monitoring/__init__.py +3 -2
  21. mlrun/model_monitoring/api.py +40 -43
  22. mlrun/model_monitoring/applications/_application_steps.py +3 -1
  23. mlrun/model_monitoring/applications/context.py +15 -17
  24. mlrun/model_monitoring/controller.py +43 -37
  25. mlrun/model_monitoring/db/__init__.py +0 -2
  26. mlrun/model_monitoring/db/tsdb/base.py +2 -1
  27. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +2 -1
  28. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +43 -0
  29. mlrun/model_monitoring/helpers.py +12 -66
  30. mlrun/model_monitoring/stream_processing.py +83 -270
  31. mlrun/model_monitoring/writer.py +1 -10
  32. mlrun/projects/project.py +63 -55
  33. mlrun/runtimes/nuclio/function.py +7 -6
  34. mlrun/runtimes/nuclio/serving.py +7 -1
  35. mlrun/serving/routers.py +158 -145
  36. mlrun/serving/server.py +6 -0
  37. mlrun/serving/states.py +2 -0
  38. mlrun/serving/v2_serving.py +69 -60
  39. mlrun/utils/helpers.py +14 -30
  40. mlrun/utils/notifications/notification/mail.py +17 -6
  41. mlrun/utils/version/version.json +2 -2
  42. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/METADATA +1 -1
  43. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/RECORD +47 -60
  44. mlrun/common/schemas/model_monitoring/model_endpoint_v2.py +0 -149
  45. mlrun/model_monitoring/db/stores/__init__.py +0 -136
  46. mlrun/model_monitoring/db/stores/base/__init__.py +0 -15
  47. mlrun/model_monitoring/db/stores/base/store.py +0 -154
  48. mlrun/model_monitoring/db/stores/sqldb/__init__.py +0 -13
  49. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -46
  50. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -93
  51. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -47
  52. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -25
  53. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -408
  54. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +0 -13
  55. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -464
  56. mlrun/model_monitoring/model_endpoint.py +0 -120
  57. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/LICENSE +0 -0
  58. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/WHEEL +0 -0
  59. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/entry_points.txt +0 -0
  60. {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/api.py

@@ -26,11 +26,14 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.feature_store
 import mlrun.model_monitoring.applications as mm_app
 import mlrun.serving
+from mlrun.common.schemas import ModelEndpoint
+from mlrun.common.schemas.model_monitoring import (
+    FunctionURI,
+)
 from mlrun.data_types.infer import InferOptions, get_df_stats
 from mlrun.utils import datetime_now, logger
 
 from .helpers import update_model_endpoint_last_request
-from .model_endpoint import ModelEndpoint
 
 # A union of all supported dataset types:
 DatasetType = typing.Union[
@@ -46,8 +49,6 @@ def get_or_create_model_endpoint(
     function_name: str = "",
     context: mlrun.MLClientCtx = None,
     sample_set_statistics: typing.Optional[dict[str, typing.Any]] = None,
-    drift_threshold: typing.Optional[float] = None,
-    possible_drift_threshold: typing.Optional[float] = None,
     monitoring_mode: mm_constants.ModelMonitoringMode = mm_constants.ModelMonitoringMode.disabled,
     db_session=None,
 ) -> ModelEndpoint:
@@ -68,10 +69,6 @@ def get_or_create_model_endpoint(
                               full function hash.
     :param sample_set_statistics: Dictionary of sample set statistics that will be used as a reference data for
                               the new model endpoint (applicable only to new endpoint_id).
-    :param drift_threshold: (deprecated) The threshold of which to mark drifts (applicable only to new
-                              endpoint_id).
-    :param possible_drift_threshold: (deprecated) The threshold of which to mark possible drifts (applicable only to new
-                              endpoint_id).
     :param monitoring_mode: If enabled, apply model monitoring features on the provided endpoint id
                               (applicable only to new endpoint_id).
     :param db_session: A runtime session that manages the current dialog with the database.
@@ -79,18 +76,15 @@ def get_or_create_model_endpoint(
     :return: A ModelEndpoint object
     """
 
-    if not endpoint_id:
-        # Generate a new model endpoint id based on the project name and model name
-        endpoint_id = hashlib.sha1(
-            f"{project}_{model_endpoint_name}".encode()
-        ).hexdigest()
-
     if not db_session:
         # Generate a runtime database
         db_session = mlrun.get_run_db()
     try:
         model_endpoint = db_session.get_model_endpoint(
-            project=project, endpoint_id=endpoint_id
+            project=project,
+            name=model_endpoint_name,
+            endpoint_id=endpoint_id,
+            function_name=function_name,
         )
         # If other fields provided, validate that they are correspond to the existing model endpoint data
         _model_endpoint_validations(
@@ -104,7 +98,6 @@ def get_or_create_model_endpoint(
         model_endpoint = _generate_model_endpoint(
             project=project,
             db_session=db_session,
-            endpoint_id=endpoint_id,
             model_path=model_path,
             model_endpoint_name=model_endpoint_name,
             function_name=function_name,
@@ -208,13 +201,13 @@ def record_results(
         monitoring_mode=monitoring_mode,
         db_session=db,
     )
-    logger.debug("Model endpoint", endpoint=model_endpoint.to_dict())
+    logger.debug("Model endpoint", endpoint=model_endpoint)
 
     timestamp = datetime_now()
     if infer_results_df is not None:
         # Write the monitoring parquet to the relevant model endpoint context
         write_monitoring_df(
-            feature_set_uri=model_endpoint.status.monitoring_feature_set_uri,
+            feature_set_uri=model_endpoint.spec.monitoring_feature_set_uri,
            infer_datetime=timestamp,
            endpoint_id=model_endpoint.metadata.uid,
            infer_results_df=infer_results_df,
@@ -278,7 +271,7 @@ def _model_endpoint_validations(
     # Feature stats
     if (
         sample_set_statistics
-        and sample_set_statistics != model_endpoint.status.feature_stats
+        and sample_set_statistics != model_endpoint.spec.feature_stats
     ):
         logger.warning(
             "Provided sample set statistics is different from the registered statistics. "
@@ -330,7 +323,6 @@ def write_monitoring_df(
 def _generate_model_endpoint(
     project: str,
     db_session,
-    endpoint_id: str,
     model_path: str,
     model_endpoint_name: str,
     function_name: str,
@@ -344,7 +336,6 @@ def _generate_model_endpoint(
     :param project: Project name.
 
     :param db_session: A session that manages the current dialog with the database.
-    :param endpoint_id: Model endpoint unique ID.
     :param model_path: The model Store path.
     :param model_endpoint_name: Model endpoint name will be presented under the new model endpoint.
     :param function_name: If a new model endpoint is created, use this function name for generating the
@@ -357,32 +348,38 @@ def _generate_model_endpoint(
 
     :return `mlrun.model_monitoring.model_endpoint.ModelEndpoint` object.
     """
-    model_endpoint = ModelEndpoint()
-    model_endpoint.metadata.project = project
-    model_endpoint.metadata.uid = endpoint_id
-    if function_name:
-        model_endpoint.spec.function_uri = project + "/" + function_name
-    elif not context:
-        raise mlrun.errors.MLRunInvalidArgumentError(
-            "Please provide either a function name or a valid MLRun context"
+    if not function_name and context:
+        function_name = FunctionURI.from_string(
+            context.to_dict()["spec"]["function"]
+        ).function
+    model_obj = None
+    if model_path:
+        model_obj: mlrun.artifacts.ModelArtifact = (
+            mlrun.datastore.store_resources.get_store_resource(
+                model_path, db=db_session
+            )
         )
-    else:
-        model_endpoint.spec.function_uri = context.to_dict()["spec"]["function"]
-    model_endpoint.spec.model_uri = model_path
-    model_endpoint.spec.model = model_endpoint_name
-    model_endpoint.spec.model_class = "drift-analysis"
-    model_endpoint.spec.monitoring_mode = monitoring_mode
-    model_endpoint.status.first_request = model_endpoint.status.last_request = (
-        datetime_now().isoformat()
-    )
-    if sample_set_statistics:
-        model_endpoint.status.feature_stats = sample_set_statistics
-
-    db_session.create_model_endpoint(
-        project=project, endpoint_id=endpoint_id, model_endpoint=model_endpoint
+    current_time = datetime_now()
+    model_endpoint = mlrun.common.schemas.ModelEndpoint(
+        metadata=mlrun.common.schemas.ModelEndpointMetadata(
+            project=project,
+            name=model_endpoint_name,
+            endpoint_type=mlrun.common.schemas.model_monitoring.EndpointType.BATCH_EP,
+        ),
+        spec=mlrun.common.schemas.ModelEndpointSpec(
+            function_name=function_name,
+            model_name=model_obj.metadata.key if model_path else None,
+            model_uid=model_obj.metadata.uid if model_path else None,
+            model_class="drift-analysis",
+        ),
+        status=mlrun.common.schemas.ModelEndpointStatus(
+            monitoring_mode=monitoring_mode,
+            first_request=current_time,
+            last_request=current_time,
+        ),
     )
 
-    return db_session.get_model_endpoint(project=project, endpoint_id=endpoint_id)
+    return db_session.create_model_endpoint(model_endpoint=model_endpoint)
 
 
 def get_sample_set_statistics(
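
For context, a minimal sketch (not part of the diff) of how the rc6 flow above is meant to be used: endpoints are now looked up by project/name/function instead of a SHA-1 of "<project>_<model_name>", and new records are built from the mlrun.common.schemas model-endpoint classes. Only calls that appear in the diff are used; the project, endpoint, and function names are placeholders, and the exception type caught here is an assumption about what get_model_endpoint raises when no endpoint exists.

import mlrun
import mlrun.common.schemas as schemas
import mlrun.common.schemas.model_monitoring.constants as mm_constants
from mlrun.utils import datetime_now

db = mlrun.get_run_db()
try:
    # rc6: lookup by project/name/function rather than a hashed endpoint_id
    endpoint = db.get_model_endpoint(
        project="my-project",          # placeholder
        name="my-batch-endpoint",      # placeholder
        function_name="batch-infer",   # placeholder
    )
except mlrun.errors.MLRunNotFoundError:  # assumed not-found error type
    now = datetime_now()
    endpoint = db.create_model_endpoint(
        model_endpoint=schemas.ModelEndpoint(
            metadata=schemas.ModelEndpointMetadata(
                project="my-project",
                name="my-batch-endpoint",
                endpoint_type=schemas.EndpointType.BATCH_EP,
            ),
            spec=schemas.ModelEndpointSpec(
                function_name="batch-infer",
                model_class="drift-analysis",
            ),
            status=schemas.ModelEndpointStatus(
                monitoring_mode=mm_constants.ModelMonitoringMode.enabled,
                first_request=now,
                last_request=now,
            ),
        )
    )
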
mlrun/model_monitoring/applications/_application_steps.py

@@ -16,6 +16,7 @@ import json
 import traceback
 from typing import Any, Optional, Union
 
+import mlrun.common.schemas
 import mlrun.common.schemas.alert as alert_objects
 import mlrun.common.schemas.model_monitoring.constants as mm_constant
 import mlrun.datastore
@@ -81,6 +82,7 @@ class _PushToMonitoringWriter(StepToDict):
         self._lazy_init()
         application_results, application_context = event
         writer_event = {
+            mm_constant.WriterEvent.ENDPOINT_NAME: application_context.endpoint_name,
             mm_constant.WriterEvent.APPLICATION_NAME: application_context.application_name,
             mm_constant.WriterEvent.ENDPOINT_ID: application_context.endpoint_id,
             mm_constant.WriterEvent.START_INFER_TIME: application_context.start_infer_time.isoformat(
@@ -125,7 +127,7 @@ class _PrepareMonitoringEvent(StepToDict):
         """
         self.graph_context = context
         self.application_name = application_name
-        self.model_endpoints: dict[str, mlrun.model_monitoring.ModelEndpoint] = {}
+        self.model_endpoints: dict[str, mlrun.common.schemas.ModelEndpoint] = {}
 
     def do(self, event: dict[str, Any]) -> MonitoringApplicationContext:
         """
mlrun/model_monitoring/applications/context.py

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
 import socket
 from typing import Any, Optional, Protocol, cast
 
@@ -28,12 +27,11 @@ import mlrun.features
 import mlrun.serving
 import mlrun.utils
 from mlrun.artifacts import Artifact, DatasetArtifact, ModelArtifact, get_model
-from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_hist
+from mlrun.common.model_monitoring.helpers import FeatureStats
+from mlrun.common.schemas import ModelEndpoint
 from mlrun.model_monitoring.helpers import (
     calculate_inputs_statistics,
-    get_endpoint_record,
 )
-from mlrun.model_monitoring.model_endpoint import ModelEndpoint
 
 
 class _ArtifactsLogger(Protocol):
@@ -64,6 +62,7 @@ class MonitoringApplicationContext:
     :param end_infer_time: (pd.Timestamp) End time of the monitoring schedule.
     :param latest_request: (pd.Timestamp) Timestamp of the latest request on this endpoint_id.
     :param endpoint_id: (str) ID of the monitored model endpoint
+    :param endpoint_name: (str) Name of the monitored model endpoint
     :param output_stream_uri: (str) URI of the output stream for results
     :param model_endpoint: (ModelEndpoint) The model endpoint object.
     :param feature_names: (list[str]) List of models feature names.
@@ -134,6 +133,9 @@ class MonitoringApplicationContext:
         self.endpoint_id = cast(
             str, event.get(mm_constants.ApplicationEvent.ENDPOINT_ID)
         )
+        self.endpoint_name = cast(
+            str, event.get(mm_constants.ApplicationEvent.ENDPOINT_NAME)
+        )
         self.output_stream_uri = cast(
             str, event.get(mm_constants.ApplicationEvent.OUTPUT_STREAM_URI)
         )
@@ -166,7 +168,7 @@
     def sample_df(self) -> pd.DataFrame:
         if self._sample_df is None:
             feature_set = fstore.get_feature_set(
-                self.model_endpoint.status.monitoring_feature_set_uri
+                self.model_endpoint.spec.monitoring_feature_set_uri
             )
             features = [f"{feature_set.metadata.name}.*"]
             vector = fstore.FeatureVector(
@@ -188,16 +190,18 @@
     @property
     def model_endpoint(self) -> ModelEndpoint:
         if not self._model_endpoint:
-            self._model_endpoint = ModelEndpoint.from_flat_dict(
-                get_endpoint_record(self.project_name, self.endpoint_id)
+            self._model_endpoint = mlrun.db.get_run_db().get_model_endpoint(
+                name=self.endpoint_name,
+                project=self.project_name,
+                endpoint_id=self.endpoint_id,
+                feature_analysis=True,
             )
         return self._model_endpoint
 
     @property
     def feature_stats(self) -> FeatureStats:
         if not self._feature_stats:
-            self._feature_stats = json.loads(self.model_endpoint.status.feature_stats)
-            pad_features_hist(self._feature_stats)
+            self._feature_stats = self.model_endpoint.spec.feature_stats
         return self._feature_stats
 
     @property
@@ -212,18 +216,12 @@
     @property
     def feature_names(self) -> list[str]:
         """The feature names of the model"""
-        feature_names = self.model_endpoint.spec.feature_names
-        return (
-            feature_names
-            if isinstance(feature_names, list)
-            else json.loads(feature_names)
-        )
+        return self.model_endpoint.spec.feature_names
 
     @property
     def label_names(self) -> list[str]:
         """The label names of the model"""
-        label_names = self.model_endpoint.spec.label_names
-        return label_names if isinstance(label_names, list) else json.loads(label_names)
+        return self.model_endpoint.spec.label_names
 
     @property
     def model(self) -> tuple[str, ModelArtifact, dict]:
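
A short, hedged illustration of what the model_endpoint property above now resolves to: the context fetches the endpoint through the run DB client, and the returned schema object already carries structured feature_names/label_names/feature_stats, so the json.loads and pad_features_hist handling from rc5 is gone. Identifiers below are placeholders; only calls shown in the diff are used.

import mlrun.db

db = mlrun.db.get_run_db()
endpoint = db.get_model_endpoint(
    name="my-endpoint",        # placeholder values
    project="my-project",
    endpoint_id="1234abcd",
    feature_analysis=True,     # mirrors the property above, to include feature stats
)

# rc6: these are native Python objects on the spec, no json.loads required
feature_names: list[str] = endpoint.spec.feature_names
label_names: list[str] = endpoint.spec.label_names
feature_stats = endpoint.spec.feature_stats
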
mlrun/model_monitoring/controller.py

@@ -19,7 +19,7 @@ import os
 from collections.abc import Iterator
 from contextlib import AbstractContextManager
 from types import TracebackType
-from typing import Any, NamedTuple, Optional, cast
+from typing import NamedTuple, Optional, cast
 
 import nuclio_sdk
 
@@ -27,6 +27,7 @@ import mlrun
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.feature_store as fstore
 import mlrun.model_monitoring
+from mlrun.common.schemas import EndpointType
 from mlrun.datastore import get_stream_pusher
 from mlrun.errors import err_to_str
 from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
@@ -65,7 +66,7 @@ class _BatchWindow:
         self._start = self._get_last_analyzed()
 
     def _get_saved_last_analyzed(self) -> Optional[int]:
-        return self._db.get_application_time(self._application)
+        return cast(int, self._db.get_application_time(self._application))
 
     def _update_last_analyzed(self, last_analyzed: int) -> None:
         self._db.update_application_time(
@@ -161,18 +162,20 @@ class _BatchWindowGenerator(AbstractContextManager):
         )
 
     @classmethod
-    def _get_last_updated_time(cls, last_request: str, has_stream: bool) -> int:
+    def _get_last_updated_time(
+        cls, last_request: datetime.datetime, not_batch_endpoint: bool
+    ) -> int:
         """
         Get the last updated time of a model endpoint.
         """
         last_updated = int(
-            cls._date_string2timestamp(last_request)
+            last_request.timestamp()
             - cast(
                 float,
                 mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
             )
         )
-        if not has_stream:
+        if not not_batch_endpoint:
             # If the endpoint does not have a stream, `last_updated` should be
             # the minimum between the current time and the last updated time.
             # This compensates for the bumping mechanism - see
@@ -183,17 +186,13 @@ class _BatchWindowGenerator(AbstractContextManager):
         )
         return last_updated
 
-    @staticmethod
-    def _date_string2timestamp(date_string: str) -> int:
-        return int(datetime.datetime.fromisoformat(date_string).timestamp())
-
     def get_intervals(
         self,
         *,
         application: str,
-        first_request: str,
-        last_request: str,
-        has_stream: bool,
+        first_request: datetime.datetime,
+        last_request: datetime.datetime,
+        not_batch_endpoint: bool,
     ) -> Iterator[_Interval]:
         """
         Get the batch window for a specific endpoint and application.
@@ -204,8 +203,8 @@
             schedules_file=self._schedules_file,
             application=application,
             timedelta_seconds=self._timedelta,
-            last_updated=self._get_last_updated_time(last_request, has_stream),
-            first_request=self._date_string2timestamp(first_request),
+            last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
+            first_request=int(first_request.timestamp()),
         )
         yield from batch_window.get_intervals()
 
@@ -235,8 +234,6 @@ class MonitoringApplicationController:
 
         logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
 
-        self.db = mlrun.model_monitoring.get_store_object(project=self.project)
-
         self._window_length = _get_window_length()
 
         self.model_monitoring_access_key = self._get_model_monitoring_access_key()
@@ -253,19 +250,16 @@ class MonitoringApplicationController:
         return access_key
 
     @staticmethod
-    def _should_monitor_endpoint(endpoint: dict[str, Any]) -> bool:
+    def _should_monitor_endpoint(endpoint: mlrun.common.schemas.ModelEndpoint) -> bool:
         return (
-            # Is the model endpoint active?
-            endpoint[mm_constants.EventFieldType.ACTIVE]
             # Is the model endpoint monitored?
-            and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
-            == mm_constants.ModelMonitoringMode.enabled
+            endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
             # Was the model endpoint called? I.e., are the first and last requests nonempty?
-            and endpoint[mm_constants.EventFieldType.FIRST_REQUEST]
-            and endpoint[mm_constants.EventFieldType.LAST_REQUEST]
+            and endpoint.status.first_request
+            and endpoint.status.last_request
             # Is the model endpoint not a router endpoint? Router endpoint has no feature stats
-            and int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
-            != mm_constants.EndpointType.ROUTER
+            and endpoint.metadata.endpoint_type.value
+            != mm_constants.EndpointType.ROUTER.value
         )
 
     def run(self) -> None:
@@ -281,7 +275,10 @@
         logger.info("Start running monitoring controller")
         try:
             applications_names = []
-            endpoints = self.db.list_model_endpoints(include_stats=True)
+            endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
+                project=self.project, tsdb_metrics=True
+            )
+            endpoints = endpoints_list.endpoints
             if not endpoints:
                 logger.info("No model endpoints found", project=self.project)
                 return
@@ -333,12 +330,19 @@
                         model_monitoring_access_key=self.model_monitoring_access_key,
                        storage_options=self.storage_options,
                    )
+                else:
+                    logger.debug(
+                        "Skipping endpoint, not ready or not suitable for monitoring",
+                        endpoint_id=endpoint.metadata.uid,
+                        endpoint_name=endpoint.metadata.name,
+                    )
+        logger.info("Finished running monitoring controller")
 
     @classmethod
     def model_endpoint_process(
         cls,
         project: str,
-        endpoint: dict,
+        endpoint: mlrun.common.schemas.ModelEndpoint,
         applications_names: list[str],
         window_length: int,
         model_monitoring_access_key: str,
@@ -356,11 +360,11 @@
        :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
        :param storage_options: (dict) Storage options for reading the infer parquet files.
        """
-        endpoint_id = endpoint[mm_constants.EventFieldType.UID]
-        has_stream = endpoint[mm_constants.EventFieldType.STREAM_PATH] != ""
-        m_fs = fstore.get_feature_set(
-            endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
+        endpoint_id = endpoint.metadata.uid
+        not_batch_endpoint = not (
+            endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
         )
+        m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
         try:
             with _BatchWindowGenerator(
                 project=project, endpoint_id=endpoint_id, window_length=window_length
@@ -371,11 +375,9 @@
                         end_infer_time,
                     ) in batch_window_generator.get_intervals(
                         application=application,
-                        first_request=endpoint[
-                            mm_constants.EventFieldType.FIRST_REQUEST
-                        ],
-                        last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
-                        has_stream=has_stream,
+                        first_request=endpoint.status.first_request,
+                        last_request=endpoint.status.last_request,
+                        not_batch_endpoint=not_batch_endpoint,
                     ):
                         df = m_fs.to_dataframe(
                             start_time=start_infer_time,
@@ -401,15 +403,17 @@
                            start_infer_time=start_infer_time,
                            end_infer_time=end_infer_time,
                            endpoint_id=endpoint_id,
+                            endpoint_name=endpoint.metadata.name,
                            project=project,
                            applications_names=[application],
                            model_monitoring_access_key=model_monitoring_access_key,
                        )
+            logger.info("Finished processing endpoint", endpoint_id=endpoint_id)
 
         except Exception:
             logger.exception(
                 "Encountered an exception",
-                endpoint_id=endpoint[mm_constants.EventFieldType.UID],
+                endpoint_id=endpoint.metadata.uid,
             )
 
     @staticmethod
@@ -417,6 +421,7 @@
        start_infer_time: datetime.datetime,
        end_infer_time: datetime.datetime,
        endpoint_id: str,
+        endpoint_name: str,
        project: str,
        applications_names: list[str],
        model_monitoring_access_key: str,
@@ -440,6 +445,7 @@
                sep=" ", timespec="microseconds"
            ),
            mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
+            mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
            mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
                project=project,
                function_name=mm_constants.MonitoringFunctionNames.WRITER,
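
The timestamp handling above can be summarized with a small stand-alone sketch: first_request/last_request now arrive as datetime.datetime objects from the ModelEndpoint schema, so the old _date_string2timestamp helper is dropped and the window math works on .timestamp() directly. The timeout value below is a stand-in for mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs; the dates are illustrative.

import datetime

parquet_batching_timeout_secs = 30 * 60  # stand-in for the mlconf setting

first_request = datetime.datetime(2024, 1, 1, 9, 0, tzinfo=datetime.timezone.utc)
last_request = datetime.datetime(2024, 1, 1, 12, 0, tzinfo=datetime.timezone.utc)

# rc6 equivalents of the _get_last_updated_time / get_intervals inputs
last_updated = int(last_request.timestamp() - parquet_batching_timeout_secs)
first_request_ts = int(first_request.timestamp())

# For batch endpoints (which have no stream), the controller additionally caps
# last_updated at the current time to compensate for the last-request bumping mechanism.
print(first_request_ts, last_updated)
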
mlrun/model_monitoring/db/__init__.py

@@ -12,7 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .stores import ObjectStoreFactory, get_store_object
-from .stores.base import StoreBase
 from .tsdb import get_tsdb_connector
 from .tsdb.base import TSDBConnector
mlrun/model_monitoring/db/tsdb/base.py

@@ -47,7 +47,7 @@ class TSDBConnector(ABC):
         self.project = project
 
     @abstractmethod
-    def apply_monitoring_stream_steps(self, graph) -> None:
+    def apply_monitoring_stream_steps(self, graph, **kwargs) -> None:
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
         different key metric dictionaries. This data is being used by the monitoring dashboards in
@@ -294,6 +294,7 @@
     ) -> pd.DataFrame:
         """
         Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint
+        in the provided time range, which by default is the last 24 hours.
 
         :param endpoint_ids: A list of model endpoint identifiers.
         :param start: The start time for the query.
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py

@@ -164,7 +164,7 @@ class TDEngineConnector(TSDBConnector):
     def _convert_to_datetime(val: typing.Union[str, datetime]) -> datetime:
         return datetime.fromisoformat(val) if isinstance(val, str) else val
 
-    def apply_monitoring_stream_steps(self, graph):
+    def apply_monitoring_stream_steps(self, graph, **kwarg):
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
         different key metric dictionaries. This data is being used by the monitoring dashboards in
@@ -701,6 +701,7 @@
         endpoint_ids = (
             endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
         )
+        start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
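
Both connectors now fall back to a 24-hour lookback when no start time is passed to the predictions query. A tiny stand-alone illustration of that guard (the real code uses mlrun.utils.datetime_now); the helper name is purely illustrative:

from datetime import datetime, timedelta, timezone


def resolve_start(start=None):
    # mirror of the added line: default to "now - 24h" when start is not given
    return start or (datetime.now(timezone.utc) - timedelta(hours=24))


print(resolve_start())                                            # roughly 24 hours ago
print(resolve_start(datetime(2024, 1, 1, tzinfo=timezone.utc)))   # explicit start wins
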
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py

@@ -168,6 +168,9 @@ class V3IOTSDBConnector(TSDBConnector):
         tsdb_batching_max_events: int = 1000,
         tsdb_batching_timeout_secs: int = 30,
         sample_window: int = 10,
+        aggregate_windows: Optional[list[str]] = None,
+        aggregate_period: str = "1m",
+        **kwarg,
     ):
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
@@ -178,7 +181,40 @@
         - endpoint_features (Prediction and feature names and values)
         - custom_metrics (user-defined metrics)
         """
+        aggregate_windows = aggregate_windows or ["5m", "1h"]
 
+        # Calculate number of predictions and average latency
+        def apply_storey_aggregations():
+            # Calculate number of predictions for each window (5 min and 1 hour by default)
+            graph.add_step(
+                class_name="storey.AggregateByKey",
+                aggregates=[
+                    {
+                        "name": EventFieldType.LATENCY,
+                        "column": EventFieldType.LATENCY,
+                        "operations": ["count", "avg"],
+                        "windows": aggregate_windows,
+                        "period": aggregate_period,
+                    }
+                ],
+                name=EventFieldType.LATENCY,
+                after="MapFeatureNames",
+                step_name="Aggregates",
+                table=".",
+                key_field=EventFieldType.ENDPOINT_ID,
+            )
+            # Calculate average latency time for each window (5 min and 1 hour by default)
+            graph.add_step(
+                class_name="storey.Rename",
+                mapping={
+                    "latency_count_5m": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_5M,
+                    "latency_count_1h": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_1H,
+                },
+                name="Rename",
+                after=EventFieldType.LATENCY,
+            )
+
+        apply_storey_aggregations()
         # Write latency per prediction, labeled by endpoint ID only
         graph.add_step(
             "storey.TSDBTarget",
@@ -853,6 +889,7 @@
         endpoint_ids = (
             endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
         )
+        start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=mm_schemas.FileTargetKind.PREDICTIONS,
@@ -864,4 +901,10 @@
         )
         if not df.empty:
             df.dropna(inplace=True)
+            df.rename(
+                columns={
+                    f"avg({mm_schemas.EventFieldType.LATENCY})": f"avg_{mm_schemas.EventFieldType.LATENCY}"
+                },
+                inplace=True,
+            )
         return df.reset_index(drop=True)
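
Downstream consumers of the V3IO predictions frame now see a normalized column name. A purely illustrative pandas example of the rename applied above (column values and the extra endpoint_id column are made up):

import pandas as pd

# shape of the frame returned by the predictions query, with illustrative values
df = pd.DataFrame({"endpoint_id": ["ep-1"], "avg(latency)": [12.5]})
df.rename(columns={"avg(latency)": "avg_latency"}, inplace=True)
print(list(df.columns))  # ['endpoint_id', 'avg_latency']
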