mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (167)
  1. mlrun/__init__.py +24 -3
  2. mlrun/__main__.py +0 -4
  3. mlrun/artifacts/dataset.py +2 -2
  4. mlrun/artifacts/document.py +6 -1
  5. mlrun/artifacts/llm_prompt.py +21 -15
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/artifacts/plots.py +1 -1
  8. mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
  9. mlrun/auth/nuclio.py +89 -0
  10. mlrun/auth/providers.py +429 -0
  11. mlrun/auth/utils.py +415 -0
  12. mlrun/common/constants.py +14 -0
  13. mlrun/common/model_monitoring/helpers.py +123 -0
  14. mlrun/common/runtimes/constants.py +28 -0
  15. mlrun/common/schemas/__init__.py +14 -3
  16. mlrun/common/schemas/alert.py +2 -2
  17. mlrun/common/schemas/api_gateway.py +3 -0
  18. mlrun/common/schemas/auth.py +12 -10
  19. mlrun/common/schemas/client_spec.py +4 -0
  20. mlrun/common/schemas/constants.py +25 -0
  21. mlrun/common/schemas/frontend_spec.py +1 -8
  22. mlrun/common/schemas/function.py +34 -0
  23. mlrun/common/schemas/hub.py +33 -20
  24. mlrun/common/schemas/model_monitoring/__init__.py +2 -1
  25. mlrun/common/schemas/model_monitoring/constants.py +12 -15
  26. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  27. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  28. mlrun/common/schemas/pipeline.py +1 -1
  29. mlrun/common/schemas/secret.py +17 -2
  30. mlrun/common/secrets.py +95 -1
  31. mlrun/common/types.py +10 -10
  32. mlrun/config.py +69 -19
  33. mlrun/data_types/infer.py +2 -2
  34. mlrun/datastore/__init__.py +12 -5
  35. mlrun/datastore/azure_blob.py +162 -47
  36. mlrun/datastore/base.py +274 -10
  37. mlrun/datastore/datastore.py +7 -2
  38. mlrun/datastore/datastore_profile.py +84 -22
  39. mlrun/datastore/model_provider/huggingface_provider.py +225 -41
  40. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  41. mlrun/datastore/model_provider/model_provider.py +206 -74
  42. mlrun/datastore/model_provider/openai_provider.py +226 -66
  43. mlrun/datastore/s3.py +39 -18
  44. mlrun/datastore/sources.py +1 -1
  45. mlrun/datastore/store_resources.py +4 -4
  46. mlrun/datastore/storeytargets.py +17 -12
  47. mlrun/datastore/targets.py +1 -1
  48. mlrun/datastore/utils.py +25 -6
  49. mlrun/datastore/v3io.py +1 -1
  50. mlrun/db/base.py +63 -32
  51. mlrun/db/httpdb.py +373 -153
  52. mlrun/db/nopdb.py +54 -21
  53. mlrun/errors.py +4 -2
  54. mlrun/execution.py +66 -25
  55. mlrun/feature_store/api.py +1 -1
  56. mlrun/feature_store/common.py +1 -1
  57. mlrun/feature_store/feature_vector_utils.py +1 -1
  58. mlrun/feature_store/steps.py +8 -6
  59. mlrun/frameworks/_common/utils.py +3 -3
  60. mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
  61. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
  62. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
  63. mlrun/frameworks/_ml_common/utils.py +2 -1
  64. mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
  65. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
  66. mlrun/frameworks/onnx/dataset.py +2 -1
  67. mlrun/frameworks/onnx/mlrun_interface.py +2 -1
  68. mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
  69. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
  70. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
  71. mlrun/frameworks/pytorch/utils.py +2 -1
  72. mlrun/frameworks/sklearn/metric.py +2 -1
  73. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
  74. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
  75. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
  76. mlrun/hub/__init__.py +52 -0
  77. mlrun/hub/base.py +142 -0
  78. mlrun/hub/module.py +172 -0
  79. mlrun/hub/step.py +113 -0
  80. mlrun/k8s_utils.py +105 -16
  81. mlrun/launcher/base.py +15 -7
  82. mlrun/launcher/local.py +4 -1
  83. mlrun/model.py +14 -4
  84. mlrun/model_monitoring/__init__.py +0 -1
  85. mlrun/model_monitoring/api.py +65 -28
  86. mlrun/model_monitoring/applications/__init__.py +1 -1
  87. mlrun/model_monitoring/applications/base.py +299 -128
  88. mlrun/model_monitoring/applications/context.py +2 -4
  89. mlrun/model_monitoring/controller.py +132 -58
  90. mlrun/model_monitoring/db/_schedules.py +38 -29
  91. mlrun/model_monitoring/db/_stats.py +6 -16
  92. mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
  93. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  94. mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
  95. mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
  96. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
  97. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
  98. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
  99. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
  100. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
  101. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
  102. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
  103. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
  104. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
  105. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
  106. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
  107. mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
  108. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
  109. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
  110. mlrun/model_monitoring/features_drift_table.py +2 -1
  111. mlrun/model_monitoring/helpers.py +30 -6
  112. mlrun/model_monitoring/stream_processing.py +34 -28
  113. mlrun/model_monitoring/writer.py +224 -4
  114. mlrun/package/__init__.py +2 -1
  115. mlrun/platforms/__init__.py +0 -43
  116. mlrun/platforms/iguazio.py +8 -4
  117. mlrun/projects/operations.py +17 -11
  118. mlrun/projects/pipelines.py +2 -2
  119. mlrun/projects/project.py +187 -123
  120. mlrun/run.py +95 -21
  121. mlrun/runtimes/__init__.py +2 -186
  122. mlrun/runtimes/base.py +103 -25
  123. mlrun/runtimes/constants.py +225 -0
  124. mlrun/runtimes/daskjob.py +5 -2
  125. mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
  126. mlrun/runtimes/local.py +5 -2
  127. mlrun/runtimes/mounts.py +20 -2
  128. mlrun/runtimes/nuclio/__init__.py +12 -7
  129. mlrun/runtimes/nuclio/api_gateway.py +36 -6
  130. mlrun/runtimes/nuclio/application/application.py +339 -40
  131. mlrun/runtimes/nuclio/function.py +222 -72
  132. mlrun/runtimes/nuclio/serving.py +132 -42
  133. mlrun/runtimes/pod.py +213 -21
  134. mlrun/runtimes/utils.py +49 -9
  135. mlrun/secrets.py +99 -14
  136. mlrun/serving/__init__.py +2 -0
  137. mlrun/serving/remote.py +84 -11
  138. mlrun/serving/routers.py +26 -44
  139. mlrun/serving/server.py +138 -51
  140. mlrun/serving/serving_wrapper.py +6 -2
  141. mlrun/serving/states.py +997 -283
  142. mlrun/serving/steps.py +62 -0
  143. mlrun/serving/system_steps.py +149 -95
  144. mlrun/serving/v2_serving.py +9 -10
  145. mlrun/track/trackers/mlflow_tracker.py +29 -31
  146. mlrun/utils/helpers.py +292 -94
  147. mlrun/utils/http.py +9 -2
  148. mlrun/utils/notifications/notification/base.py +18 -0
  149. mlrun/utils/notifications/notification/git.py +3 -5
  150. mlrun/utils/notifications/notification/mail.py +39 -16
  151. mlrun/utils/notifications/notification/slack.py +2 -4
  152. mlrun/utils/notifications/notification/webhook.py +2 -5
  153. mlrun/utils/notifications/notification_pusher.py +3 -3
  154. mlrun/utils/version/version.json +2 -2
  155. mlrun/utils/version/version.py +3 -4
  156. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
  157. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
  158. mlrun/api/schemas/__init__.py +0 -259
  159. mlrun/db/auth_utils.py +0 -152
  160. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
  161. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
  162. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
  163. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
  164. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
  165. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
  166. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
  167. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
@@ -24,15 +24,12 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
24
24
  import mlrun.errors
25
25
  import mlrun.feature_store as fstore
26
26
  import mlrun.feature_store.feature_set as fs
27
- import mlrun.features
28
27
  import mlrun.serving
29
28
  import mlrun.utils
30
29
  from mlrun.artifacts import Artifact, DatasetArtifact, ModelArtifact, get_model
31
30
  from mlrun.common.model_monitoring.helpers import FeatureStats
32
31
  from mlrun.common.schemas import ModelEndpoint
33
- from mlrun.model_monitoring.helpers import (
34
- calculate_inputs_statistics,
35
- )
32
+ from mlrun.model_monitoring.helpers import calculate_inputs_statistics
36
33
 
37
34
 
38
35
  class _ArtifactsLogger(Protocol):
@@ -252,6 +249,7 @@ class MonitoringApplicationContext:
252
249
  project=self.project_name,
253
250
  endpoint_id=self.endpoint_id,
254
251
  feature_analysis=True,
252
+ tsdb_metrics=False,
255
253
  )
256
254
  return self._model_endpoint
257
255
 
@@ -11,33 +11,37 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
14
+ import collections
15
15
  import concurrent.futures
16
16
  import datetime
17
17
  import json
18
18
  import os
19
19
  import traceback
20
+ import warnings
20
21
  from collections.abc import Iterator
21
22
  from contextlib import AbstractContextManager
22
23
  from types import TracebackType
23
- from typing import Any, NamedTuple, Optional, Union, cast
24
+ from typing import Any, Final, NamedTuple, Optional, Union, cast
24
25
 
25
26
  import nuclio_sdk
27
+ import numpy as np
26
28
  import pandas as pd
27
29
 
28
30
  import mlrun
29
31
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
32
+ import mlrun.feature_store as fstore
30
33
  import mlrun.model_monitoring
31
34
  import mlrun.model_monitoring.db._schedules as schedules
32
35
  import mlrun.model_monitoring.helpers
33
36
  import mlrun.platforms.iguazio
37
+ from mlrun.common.schemas import EndpointType
34
38
  from mlrun.common.schemas.model_monitoring.constants import (
35
39
  ControllerEvent,
36
40
  ControllerEventEndpointPolicy,
37
41
  )
38
42
  from mlrun.errors import err_to_str
39
43
  from mlrun.model_monitoring.helpers import batch_dict2timedelta
40
- from mlrun.utils import logger
44
+ from mlrun.utils import datetime_now, logger
41
45
 
42
46
  _SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
43
47
  _SECONDS_IN_MINUTE = 60
@@ -49,14 +53,16 @@ class _Interval(NamedTuple):
49
53
 
50
54
 
51
55
  class _BatchWindow:
56
+ TIMESTAMP_RESOLUTION_MICRO: Final = 1e-6 # 0.000001 seconds or 1 microsecond
57
+
52
58
  def __init__(
53
59
  self,
54
60
  *,
55
61
  schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
56
62
  application: str,
57
63
  timedelta_seconds: int,
58
- last_updated: int,
59
- first_request: int,
64
+ last_updated: float,
65
+ first_request: float,
60
66
  endpoint_mode: mm_constants.EndpointMode = mm_constants.EndpointMode.REAL_TIME,
61
67
  ) -> None:
62
68
  """
@@ -73,15 +79,17 @@ class _BatchWindow:
73
79
  self._endpoint_mode = endpoint_mode
74
80
  self._start = self._get_last_analyzed()
75
81
 
76
- def _get_saved_last_analyzed(self) -> Optional[int]:
77
- return cast(int, self._db.get_application_time(self._application))
82
+ def _get_saved_last_analyzed(
83
+ self,
84
+ ) -> Optional[float]:
85
+ return self._db.get_application_time(self._application)
78
86
 
79
- def _update_last_analyzed(self, last_analyzed: int) -> None:
87
+ def _update_last_analyzed(self, last_analyzed: float) -> None:
80
88
  self._db.update_application_time(
81
89
  application=self._application, timestamp=last_analyzed
82
90
  )
83
91
 
84
- def _get_initial_last_analyzed(self) -> int:
92
+ def _get_initial_last_analyzed(self) -> float:
85
93
  if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
86
94
  logger.info(
87
95
  "No last analyzed time was found for this endpoint and application, as this is "
@@ -107,7 +115,7 @@ class _BatchWindow:
107
115
  self._stop - first_period_in_seconds,
108
116
  )
109
117
 
110
- def _get_last_analyzed(self) -> int:
118
+ def _get_last_analyzed(self) -> float:
111
119
  saved_last_analyzed = self._get_saved_last_analyzed()
112
120
  if saved_last_analyzed is not None:
113
121
  if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
@@ -127,13 +135,14 @@ class _BatchWindow:
127
135
  # Iterate timestamp from start until timestamp <= stop - step
128
136
  # so that the last interval will end at (timestamp + step) <= stop.
129
137
  # Add 1 to stop - step to get <= and not <.
130
- for timestamp in range(self._start, self._stop - self._step + 1, self._step):
138
+ for timestamp in np.arange(
139
+ self._start, self._stop - self._step + 1, self._step
140
+ ):
131
141
  entered = True
132
- start_time = datetime.datetime.fromtimestamp(
133
- timestamp, tz=datetime.timezone.utc
134
- )
142
+ start_time = datetime.datetime.fromtimestamp(timestamp, tz=datetime.UTC)
135
143
  end_time = datetime.datetime.fromtimestamp(
136
- timestamp + self._step, tz=datetime.timezone.utc
144
+ timestamp - self.TIMESTAMP_RESOLUTION_MICRO + self._step,
145
+ tz=datetime.UTC,
137
146
  )
138
147
  yield _Interval(start_time, end_time)
139
148
 
@@ -149,27 +158,19 @@ class _BatchWindow:
149
158
  # If the endpoint is a batch endpoint, we need to update the last analyzed time
150
159
  # to the end of the batch time.
151
160
  if last_analyzed:
152
- if last_analyzed < self._stop:
161
+ if last_analyzed - self.TIMESTAMP_RESOLUTION_MICRO < self._stop:
153
162
  # If the last analyzed time is earlier than the stop time,
154
163
  # yield the final partial interval from last_analyzed to stop
155
164
  yield _Interval(
156
- datetime.datetime.fromtimestamp(
157
- last_analyzed, tz=datetime.timezone.utc
158
- ),
159
- datetime.datetime.fromtimestamp(
160
- self._stop, tz=datetime.timezone.utc
161
- ),
165
+ datetime.datetime.fromtimestamp(last_analyzed, tz=datetime.UTC),
166
+ datetime.datetime.fromtimestamp(self._stop, tz=datetime.UTC),
162
167
  )
163
168
  else:
164
169
  # The time span between the start and end of the batch is shorter than the step,
165
170
  # so we need to yield a partial interval covering that range.
166
171
  yield _Interval(
167
- datetime.datetime.fromtimestamp(
168
- self._start, tz=datetime.timezone.utc
169
- ),
170
- datetime.datetime.fromtimestamp(
171
- self._stop, tz=datetime.timezone.utc
172
- ),
172
+ datetime.datetime.fromtimestamp(self._start, tz=datetime.UTC),
173
+ datetime.datetime.fromtimestamp(self._stop, tz=datetime.UTC),
173
174
  )
174
175
 
175
176
  self._update_last_analyzed(last_analyzed=self._stop)
@@ -223,7 +224,7 @@ class _BatchWindowGenerator(AbstractContextManager):
223
224
  def get_application_list(self) -> set[str]:
224
225
  return self._schedules_file.get_application_list()
225
226
 
226
- def get_min_last_analyzed(self) -> Optional[int]:
227
+ def get_min_last_analyzed(self) -> Optional[float]:
227
228
  return self._schedules_file.get_min_timestamp()
228
229
 
229
230
  @classmethod
@@ -231,22 +232,29 @@ class _BatchWindowGenerator(AbstractContextManager):
231
232
  cls,
232
233
  last_request: datetime.datetime,
233
234
  endpoint_mode: mm_constants.EndpointMode,
234
- ) -> int:
235
+ not_old_batch_endpoint: bool,
236
+ ) -> float:
235
237
  """
236
238
  Get the last updated time of a model endpoint.
237
239
  """
238
240
 
239
241
  if endpoint_mode == mm_constants.EndpointMode.REAL_TIME:
240
- last_updated = int(
241
- last_request.timestamp()
242
- - cast(
243
- float,
244
- mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
245
- )
242
+ last_updated = last_request.timestamp() - cast(
243
+ float,
244
+ mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
246
245
  )
246
+ if not not_old_batch_endpoint:
247
+ # If the endpoint does not have a stream, `last_updated` should be
248
+ # the minimum between the current time and the last updated time.
249
+ # This compensates for the bumping mechanism - see
250
+ # `update_model_endpoint_last_request`.
251
+ last_updated = min(datetime_now().timestamp(), last_updated)
252
+ logger.debug(
253
+ "The endpoint does not have a stream", last_updated=last_updated
254
+ )
247
255
 
248
256
  return last_updated
249
- return int(last_request.timestamp())
257
+ return last_request.timestamp()
250
258
 
251
259
  def get_intervals(
252
260
  self,
@@ -255,6 +263,7 @@ class _BatchWindowGenerator(AbstractContextManager):
255
263
  first_request: datetime.datetime,
256
264
  last_request: datetime.datetime,
257
265
  endpoint_mode: mm_constants.EndpointMode,
266
+ not_old_batch_endpoint: bool,
258
267
  ) -> Iterator[_Interval]:
259
268
  """
260
269
  Get the batch window for a specific endpoint and application.
@@ -266,8 +275,10 @@ class _BatchWindowGenerator(AbstractContextManager):
266
275
  schedules_file=self._schedules_file,
267
276
  application=application,
268
277
  timedelta_seconds=self._timedelta,
269
- last_updated=self._get_last_updated_time(last_request, endpoint_mode),
270
- first_request=int(first_request.timestamp()),
278
+ last_updated=self._get_last_updated_time(
279
+ last_request, endpoint_mode, not_old_batch_endpoint
280
+ ),
281
+ first_request=first_request.timestamp(),
271
282
  endpoint_mode=endpoint_mode,
272
283
  )
273
284
  yield from self.batch_window.get_intervals()
@@ -291,6 +302,8 @@ class MonitoringApplicationController:
291
302
  Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
292
303
  """
293
304
 
305
+ _MAX_FEATURE_SET_PER_WORKER = 1000
306
+
294
307
  def __init__(self) -> None:
295
308
  """Initialize Monitoring Application Controller"""
296
309
  self.project = cast(str, mlrun.mlconf.active_project)
@@ -324,6 +337,9 @@ class MonitoringApplicationController:
324
337
  mlrun.platforms.iguazio.KafkaOutputStream,
325
338
  ],
326
339
  ] = {}
340
+ self.feature_sets: collections.OrderedDict[
341
+ str, mlrun.feature_store.FeatureSet
342
+ ] = collections.OrderedDict()
327
343
  self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
328
344
  project=self.project
329
345
  )
@@ -433,15 +449,14 @@ class MonitoringApplicationController:
433
449
  base_period_minutes, current_min_last_analyzed, current_time
434
450
  )
435
451
  and (
436
- int(endpoint.status.last_request.timestamp())
437
- != last_timestamp_sent
452
+ endpoint.status.last_request.timestamp() != last_timestamp_sent
438
453
  or current_min_last_analyzed != last_analyzed_sent
439
454
  )
440
455
  ):
441
456
  # Write to schedule chief file the last_request, min_last_analyzed we pushed event to stream
442
457
  schedules_file.update_endpoint_timestamps(
443
458
  endpoint_uid=endpoint.metadata.uid,
444
- last_request=int(endpoint.status.last_request.timestamp()),
459
+ last_request=endpoint.status.last_request.timestamp(),
445
460
  last_analyzed=current_min_last_analyzed,
446
461
  )
447
462
  return True
@@ -460,13 +475,14 @@ class MonitoringApplicationController:
460
475
  last_request=endpoint.status.last_request,
461
476
  first_request=endpoint.status.first_request,
462
477
  endpoint_type=endpoint.metadata.endpoint_type,
478
+ feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
463
479
  )
464
480
  return False
465
481
 
466
482
  @staticmethod
467
483
  def _should_send_nop_event(
468
484
  base_period_minutes: int,
469
- min_last_analyzed: int,
485
+ min_last_analyzed: float,
470
486
  current_time: datetime.datetime,
471
487
  ):
472
488
  if min_last_analyzed:
@@ -515,7 +531,7 @@ class MonitoringApplicationController:
515
531
  try:
516
532
  project_name = event[ControllerEvent.PROJECT]
517
533
  endpoint_id = event[ControllerEvent.ENDPOINT_ID]
518
-
534
+ not_old_batch_endpoint = True
519
535
  if (
520
536
  event[ControllerEvent.KIND]
521
537
  == mm_constants.ControllerEventKind.BATCH_COMPLETE
@@ -572,6 +588,10 @@ class MonitoringApplicationController:
572
588
 
573
589
  endpoint_mode = mm_constants.EndpointMode.REAL_TIME
574
590
 
591
+ not_old_batch_endpoint = (
592
+ event[ControllerEvent.ENDPOINT_TYPE] != EndpointType.BATCH_EP
593
+ )
594
+
575
595
  logger.info(
576
596
  "Starting to analyze", timestamp=last_stream_timestamp.isoformat()
577
597
  )
@@ -590,16 +610,49 @@ class MonitoringApplicationController:
590
610
  first_request=first_request,
591
611
  last_request=last_stream_timestamp,
592
612
  endpoint_mode=endpoint_mode,
613
+ not_old_batch_endpoint=not_old_batch_endpoint,
593
614
  ):
594
615
  data_in_window = False
595
- # Serving endpoint - get the relevant window data from the TSDB
596
- prediction_metric = self.tsdb_connector.read_predictions(
597
- start=start_infer_time,
598
- end=end_infer_time,
599
- endpoint_id=endpoint_id,
600
- )
601
- if prediction_metric.data:
602
- data_in_window = True
616
+ if not_old_batch_endpoint:
617
+ # Serving endpoint - get the relevant window data from the TSDB
618
+ prediction_metric = self.tsdb_connector.read_predictions(
619
+ start=start_infer_time,
620
+ end=end_infer_time,
621
+ endpoint_id=endpoint_id,
622
+ )
623
+ if prediction_metric.data:
624
+ data_in_window = True
625
+ else:
626
+ # Old batch endpoint - get the relevant window data from the parquet target
627
+ warnings.warn(
628
+ "Analyzing batch model endpoints with real time processing events is "
629
+ "deprecated in 1.10.0 and will be removed in 1.12.0. "
630
+ "Instead, use job-based serving to invoke and analyze offline batch model"
631
+ "endpoints.",
632
+ # TODO: Remove this in 1.12.0
633
+ FutureWarning,
634
+ )
635
+
636
+ if endpoint_id not in self.feature_sets:
637
+ self.feature_sets[endpoint_id] = fstore.get_feature_set(
638
+ event[ControllerEvent.FEATURE_SET_URI]
639
+ )
640
+ self.feature_sets.move_to_end(endpoint_id, last=False)
641
+ if (
642
+ len(self.feature_sets)
643
+ > self._MAX_FEATURE_SET_PER_WORKER
644
+ ):
645
+ self.feature_sets.popitem(last=True)
646
+ m_fs = self.feature_sets.get(endpoint_id)
647
+
648
+ df = m_fs.to_dataframe(
649
+ start_time=start_infer_time,
650
+ end_time=end_infer_time,
651
+ time_column=mm_constants.EventFieldType.TIMESTAMP,
652
+ storage_options=self.storage_options,
653
+ )
654
+ if len(df) > 0:
655
+ data_in_window = True
603
656
 
604
657
  if not data_in_window:
605
658
  logger.info(
@@ -616,7 +669,10 @@ class MonitoringApplicationController:
616
669
  endpoint_id=endpoint_id,
617
670
  )
618
671
  self._push_to_applications(
619
- start_infer_time=start_infer_time,
672
+ start_infer_time=start_infer_time
673
+ - datetime.timedelta(
674
+ batch_window_generator.batch_window.TIMESTAMP_RESOLUTION_MICRO
675
+ ), # We subtract a microsecond to ensure that the apps will retrieve start time data.
620
676
  end_infer_time=end_infer_time,
621
677
  endpoint_id=endpoint_id,
622
678
  endpoint_name=endpoint_name,
@@ -653,6 +709,9 @@ class MonitoringApplicationController:
653
709
  ControllerEvent.ENDPOINT_TYPE: event[
654
710
  ControllerEvent.ENDPOINT_TYPE
655
711
  ],
712
+ ControllerEvent.FEATURE_SET_URI: event[
713
+ ControllerEvent.FEATURE_SET_URI
714
+ ],
656
715
  ControllerEvent.FIRST_REQUEST: event[
657
716
  ControllerEvent.FIRST_REQUEST
658
717
  ],
@@ -732,8 +791,17 @@ class MonitoringApplicationController:
732
791
  logger.info("Starting monitoring controller chief")
733
792
  applications_names = []
734
793
  endpoints = self.project_obj.list_model_endpoints(
735
- tsdb_metrics=False, mode=mm_constants.EndpointMode.REAL_TIME
794
+ tsdb_metrics=False,
795
+ modes=[
796
+ mm_constants.EndpointMode.REAL_TIME,
797
+ mm_constants.EndpointMode.BATCH_LEGACY,
798
+ ],
736
799
  ).endpoints
800
+
801
+ if not endpoints:
802
+ logger.info("No model endpoints found", project=self.project)
803
+ return
804
+
737
805
  last_request_dict = self.tsdb_connector.get_last_request(
738
806
  endpoint_ids=[mep.metadata.uid for mep in endpoints]
739
807
  )
@@ -742,9 +810,6 @@ class MonitoringApplicationController:
742
810
  mm_constants.EventFieldType.ENDPOINT_ID
743
811
  )[mm_constants.ModelEndpointSchema.LAST_REQUEST].to_dict()
744
812
 
745
- if not endpoints:
746
- logger.info("No model endpoints found", project=self.project)
747
- return
748
813
  monitoring_functions = self.project_obj.list_model_monitoring_functions()
749
814
  if monitoring_functions:
750
815
  # if monitoring_functions: - TODO : ML-7700
@@ -790,7 +855,11 @@ class MonitoringApplicationController:
790
855
  for endpoint in endpoints:
791
856
  last_request = last_request_dict.get(endpoint.metadata.uid, None)
792
857
  if isinstance(last_request, float):
793
- last_request = pd.to_datetime(last_request, unit="s", utc=True)
858
+ last_request = datetime.datetime.fromtimestamp(
859
+ last_request, tz=datetime.UTC
860
+ )
861
+ elif isinstance(last_request, pd.Timestamp):
862
+ last_request = last_request.to_pydatetime()
794
863
  endpoint.status.last_request = (
795
864
  last_request or endpoint.status.last_request
796
865
  )
@@ -842,6 +911,7 @@ class MonitoringApplicationController:
842
911
  sep=" ", timespec="microseconds"
843
912
  ),
844
913
  endpoint_type=endpoint.metadata.endpoint_type,
914
+ feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
845
915
  endpoint_policy=json.dumps(policy),
846
916
  )
847
917
  policy[ControllerEventEndpointPolicy.ENDPOINT_UPDATED] = (
@@ -859,6 +929,7 @@ class MonitoringApplicationController:
859
929
  sep=" ", timespec="microseconds"
860
930
  ),
861
931
  endpoint_type=endpoint.metadata.endpoint_type.value,
932
+ feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
862
933
  endpoint_policy=policy,
863
934
  )
864
935
 
@@ -871,6 +942,7 @@ class MonitoringApplicationController:
871
942
  timestamp: str,
872
943
  first_request: str,
873
944
  endpoint_type: int,
945
+ feature_set_uri: str,
874
946
  endpoint_policy: dict[str, Any],
875
947
  ) -> None:
876
948
  """
@@ -883,6 +955,7 @@ class MonitoringApplicationController:
883
955
  :param endpoint_id: endpoint id string
884
956
  :param endpoint_name: the endpoint name string
885
957
  :param endpoint_type: Enum of the endpoint type
958
+ :param feature_set_uri: the feature set uri string
886
959
  """
887
960
  event = {
888
961
  ControllerEvent.KIND.value: kind,
@@ -892,6 +965,7 @@ class MonitoringApplicationController:
892
965
  ControllerEvent.TIMESTAMP.value: timestamp,
893
966
  ControllerEvent.FIRST_REQUEST.value: first_request,
894
967
  ControllerEvent.ENDPOINT_TYPE.value: endpoint_type,
968
+ ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
895
969
  ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
896
970
  }
897
971
  logger.info(
@@ -13,15 +13,12 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import json
16
- import sys
17
16
  from abc import ABC, abstractmethod
18
17
  from contextlib import AbstractContextManager
19
- from datetime import datetime, timezone
18
+ from datetime import datetime
20
19
  from types import TracebackType
21
20
  from typing import TYPE_CHECKING, Final, Optional
22
21
 
23
- import botocore.exceptions
24
-
25
22
  import mlrun
26
23
  import mlrun.common.schemas as schemas
27
24
  import mlrun.errors
@@ -30,10 +27,7 @@ import mlrun.utils.helpers
30
27
  from mlrun.utils import logger
31
28
 
32
29
  if TYPE_CHECKING:
33
- if sys.version_info >= (3, 11):
34
- from typing import Self
35
- else:
36
- from typing_extensions import Self
30
+ from typing import Self
37
31
 
38
32
 
39
33
  class ModelMonitoringSchedulesFileBase(AbstractContextManager, ABC):
@@ -88,16 +82,8 @@ class ModelMonitoringSchedulesFileBase(AbstractContextManager, ABC):
88
82
  except (
89
83
  mlrun.errors.MLRunNotFoundError,
90
84
  # Different errors are raised for S3 or local storage, see ML-8042
91
- botocore.exceptions.ClientError,
92
85
  FileNotFoundError,
93
- ) as err:
94
- if (
95
- isinstance(err, botocore.exceptions.ClientError)
96
- # Add a log only to "NoSuchKey" errors codes - equivalent to `FileNotFoundError`
97
- and err.response["Error"]["Code"] != "NoSuchKey"
98
- ):
99
- raise
100
-
86
+ ):
101
87
  logger.exception(
102
88
  "The schedules file was not found. It should have been created "
103
89
  "as a part of the model endpoint's creation",
@@ -162,19 +148,29 @@ class ModelMonitoringSchedulesFileEndpoint(ModelMonitoringSchedulesFileBase):
162
148
  endpoint_id=model_endpoint.metadata.uid,
163
149
  )
164
150
 
165
- def get_application_time(self, application: str) -> Optional[int]:
151
+ def get_application_time(self, application: str) -> Optional[float]:
166
152
  self._check_open_schedules()
167
153
  return self._schedules.get(application)
168
154
 
169
- def update_application_time(self, application: str, timestamp: int) -> None:
155
+ def update_application_time(self, application: str, timestamp: float) -> None:
156
+ self._check_open_schedules()
157
+ self._schedules[application] = float(timestamp)
158
+
159
+ def delete_application_time(self, application: str) -> None:
170
160
  self._check_open_schedules()
171
- self._schedules[application] = timestamp
161
+ if application in self._schedules:
162
+ logger.debug(
163
+ "Deleting application time from schedules",
164
+ application=application,
165
+ endpoint_id=self._endpoint_id,
166
+ )
167
+ del self._schedules[application]
172
168
 
173
169
  def get_application_list(self) -> set[str]:
174
170
  self._check_open_schedules()
175
171
  return set(self._schedules.keys())
176
172
 
177
- def get_min_timestamp(self) -> Optional[int]:
173
+ def get_min_timestamp(self) -> Optional[float]:
178
174
  self._check_open_schedules()
179
175
  return min(self._schedules.values(), default=None)
180
176
 
@@ -198,7 +194,7 @@ class ModelMonitoringSchedulesFileChief(ModelMonitoringSchedulesFileBase):
198
194
  project=self._project
199
195
  )
200
196
 
201
- def get_endpoint_last_request(self, endpoint_uid: str) -> Optional[int]:
197
+ def get_endpoint_last_request(self, endpoint_uid: str) -> Optional[float]:
202
198
  self._check_open_schedules()
203
199
  if endpoint_uid in self._schedules:
204
200
  return self._schedules[endpoint_uid].get(
@@ -208,15 +204,19 @@ class ModelMonitoringSchedulesFileChief(ModelMonitoringSchedulesFileBase):
208
204
  return None
209
205
 
210
206
  def update_endpoint_timestamps(
211
- self, endpoint_uid: str, last_request: int, last_analyzed: int
207
+ self, endpoint_uid: str, last_request: float, last_analyzed: float
212
208
  ) -> None:
213
209
  self._check_open_schedules()
214
210
  self._schedules[endpoint_uid] = {
215
- schemas.model_monitoring.constants.ScheduleChiefFields.LAST_REQUEST: last_request,
216
- schemas.model_monitoring.constants.ScheduleChiefFields.LAST_ANALYZED: last_analyzed,
211
+ schemas.model_monitoring.constants.ScheduleChiefFields.LAST_REQUEST: float(
212
+ last_request
213
+ ),
214
+ schemas.model_monitoring.constants.ScheduleChiefFields.LAST_ANALYZED: float(
215
+ last_analyzed
216
+ ),
217
217
  }
218
218
 
219
- def get_endpoint_last_analyzed(self, endpoint_uid: str) -> Optional[int]:
219
+ def get_endpoint_last_analyzed(self, endpoint_uid: str) -> Optional[float]:
220
220
  self._check_open_schedules()
221
221
  if endpoint_uid in self._schedules:
222
222
  return self._schedules[endpoint_uid].get(
@@ -267,9 +267,18 @@ class ModelMonitoringSchedulesFileApplication(ModelMonitoringSchedulesFileBase):
267
267
  self, endpoint_uid: str, last_analyzed: datetime
268
268
  ) -> None:
269
269
  self._check_open_schedules()
270
- self._schedules[endpoint_uid] = last_analyzed.astimezone(
271
- timezone.utc
272
- ).isoformat()
270
+ self._schedules[endpoint_uid] = last_analyzed.isoformat()
271
+
272
+ def delete_endpoints_last_analyzed(self, endpoint_uids: list[str]) -> None:
273
+ self._check_open_schedules()
274
+ for endpoint_uid in endpoint_uids:
275
+ if endpoint_uid in self._schedules:
276
+ logger.debug(
277
+ "Deleting endpoint last analyzed from schedules",
278
+ endpoint_uid=endpoint_uid,
279
+ application=self._application,
280
+ )
281
+ del self._schedules[endpoint_uid]
273
282
 
274
283
 
275
284
  def _delete_folder(folder: str) -> None:
@@ -13,11 +13,11 @@
13
13
  # limitations under the License.
14
14
  import abc
15
15
  import json
16
+ import typing
16
17
  from abc import abstractmethod
17
- from datetime import datetime, timezone
18
+ from datetime import UTC, datetime
18
19
  from typing import cast
19
20
 
20
- import botocore.exceptions
21
21
  import fsspec
22
22
 
23
23
  import mlrun.datastore.base
@@ -73,7 +73,7 @@ class ModelMonitoringStatsFile(abc.ABC):
73
73
  path=self._item.url,
74
74
  )
75
75
 
76
- def read(self) -> tuple[dict, datetime]:
76
+ def read(self) -> tuple[dict, typing.Optional[datetime]]:
77
77
  """
78
78
  Read the stats data and timestamp saved in file
79
79
  :return: tuple[dict, str] dictionary with stats data and timestamp saved in file
@@ -82,30 +82,20 @@ class ModelMonitoringStatsFile(abc.ABC):
82
82
  content = json.loads(self._item.get().decode())
83
83
  timestamp = content.get("timestamp")
84
84
  if timestamp is not None:
85
- timestamp = datetime.fromisoformat(timestamp).astimezone(
86
- tz=timezone.utc
87
- )
85
+ timestamp = datetime.fromisoformat(timestamp).astimezone(tz=UTC)
88
86
  return content.get("data"), timestamp
89
87
  except (
90
88
  mlrun.errors.MLRunNotFoundError,
91
89
  # Different errors are raised for S3 or local storage, see ML-8042
92
- botocore.exceptions.ClientError,
93
90
  FileNotFoundError,
94
91
  ) as err:
95
- if (
96
- isinstance(err, botocore.exceptions.ClientError)
97
- # Add a log only to "NoSuchKey" errors codes - equivalent to `FileNotFoundError`
98
- and err.response["Error"]["Code"] != "NoSuchKey"
99
- ):
100
- raise
101
-
102
- logger.exception(
92
+ logger.warning(
103
93
  "The Stats file was not found. It should have been created "
104
94
  "as a part of the model endpoint's creation",
105
95
  path=self._path,
106
96
  error=err,
107
97
  )
108
- raise
98
+ return {}, None
109
99
 
110
100
  def write(self, stats: dict, timestamp: datetime) -> None:
111
101
  """