mlrun-1.6.0rc21-py3-none-any.whl → mlrun-1.6.0rc22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (45)
  1. mlrun/artifacts/base.py +6 -6
  2. mlrun/artifacts/dataset.py +15 -8
  3. mlrun/artifacts/manager.py +1 -1
  4. mlrun/artifacts/model.py +2 -2
  5. mlrun/artifacts/plots.py +8 -8
  6. mlrun/datastore/azure_blob.py +9 -14
  7. mlrun/datastore/base.py +21 -7
  8. mlrun/datastore/dbfs_store.py +10 -10
  9. mlrun/datastore/filestore.py +2 -1
  10. mlrun/datastore/google_cloud_storage.py +9 -8
  11. mlrun/datastore/redis.py +2 -1
  12. mlrun/datastore/s3.py +3 -6
  13. mlrun/datastore/sources.py +2 -12
  14. mlrun/datastore/targets.py +2 -13
  15. mlrun/datastore/v3io.py +16 -19
  16. mlrun/db/httpdb.py +8 -1
  17. mlrun/execution.py +14 -5
  18. mlrun/feature_store/api.py +3 -4
  19. mlrun/launcher/base.py +4 -4
  20. mlrun/lists.py +0 -6
  21. mlrun/model.py +8 -1
  22. mlrun/model_monitoring/api.py +9 -31
  23. mlrun/model_monitoring/batch.py +14 -13
  24. mlrun/model_monitoring/controller.py +91 -69
  25. mlrun/model_monitoring/controller_handler.py +1 -3
  26. mlrun/model_monitoring/helpers.py +19 -8
  27. mlrun/model_monitoring/stream_processing.py +0 -3
  28. mlrun/projects/operations.py +1 -1
  29. mlrun/projects/project.py +5 -4
  30. mlrun/runtimes/base.py +6 -1
  31. mlrun/runtimes/constants.py +11 -0
  32. mlrun/runtimes/kubejob.py +1 -1
  33. mlrun/runtimes/local.py +64 -53
  34. mlrun/serving/routers.py +7 -20
  35. mlrun/serving/server.py +4 -14
  36. mlrun/serving/utils.py +0 -3
  37. mlrun/utils/helpers.py +5 -2
  38. mlrun/utils/logger.py +5 -5
  39. mlrun/utils/version/version.json +2 -2
  40. {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/METADATA +3 -1
  41. {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/RECORD +45 -45
  42. {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/LICENSE +0 -0
  43. {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/WHEEL +0 -0
  44. {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/entry_points.txt +0 -0
  45. {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/top_level.txt +0 -0
mlrun/db/httpdb.py CHANGED
@@ -707,7 +707,7 @@ class HTTPRunDB(RunDBInterface):
         :param state: List only runs whose state is specified.
         :param sort: Whether to sort the result according to their start time. Otherwise, results will be
             returned by their internal order in the DB (order will not be guaranteed).
-        :param last: Deprecated - currently not used.
+        :param last: Deprecated - currently not used (will be removed in 1.8.0).
         :param iter: If ``True`` return runs from all iterations. Otherwise, return only runs whose ``iter`` is 0.
         :param start_time_from: Filter by run start time in ``[start_time_from, start_time_to]``.
         :param start_time_to: Filter by run start time in ``[start_time_from, start_time_to]``.
@@ -733,6 +733,13 @@ class HTTPRunDB(RunDBInterface):
                 "using the `with_notifications` flag."
             )
 
+        if last:
+            # TODO: Remove this in 1.8.0
+            warnings.warn(
+                "'last' is deprecated and will be removed in 1.8.0.",
+                FutureWarning,
+            )
+
         if (
             not name
             and not uid
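Callers that still pass `last` will now get a `FutureWarning` before the parameter is dropped in 1.8.0. A minimal standalone sketch of how this surfaces to callers (illustration only, not mlrun code):

```python
import warnings

def list_runs(last: int = 0):
    # Mirrors the deprecation gate added in the hunk above.
    if last:
        warnings.warn(
            "'last' is deprecated and will be removed in 1.8.0.",
            FutureWarning,
        )

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    list_runs(last=5)
    assert caught[0].category is FutureWarning
```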
mlrun/execution.py CHANGED
@@ -393,7 +393,7 @@ class MLClientCtx(object):
             if v:
                 self._set_input(k, v)
 
-        if host and not is_api and self.is_logging_worker():
+        if host and not is_api:
             self.set_label("host", host)
 
         start = get_in(attrs, "status.start_time")
@@ -411,7 +411,7 @@ class MLClientCtx(object):
                 self._artifacts_manager.artifacts[key] = artifact_obj
         self._state = status.get("state", self._state)
 
-        # Do not store run if not logging worker to avoid conflicts like host label
+        # No need to store the run for every worker
         if store_run and self.is_logging_worker():
             self.store_run()
         return self
@@ -434,6 +434,12 @@ class MLClientCtx(object):
             context.set_label("framework", "sklearn")
 
         """
+        if not self.is_logging_worker():
+            logger.warning(
+                "Setting labels is only supported in the logging worker, ignoring"
+            )
+            return
+
         if replace or not self._labels.get(key):
             self._labels[key] = str(value)
 
@@ -974,10 +980,11 @@ class MLClientCtx(object):
         """
         # If it's a OpenMPI job, get the global rank and compare to the logging rank (worker) set in MLRun's
         # configuration:
-        if self.labels.get("kind", "job") == "mpijob":
+        labels = self.labels
+        if "host" in labels and labels.get("kind", "job") == "mpijob":
             # The host (pod name) of each worker is created by k8s, and by default it uses the rank number as the id in
             # the following template: ...-worker-<rank>
-            rank = int(self.labels["host"].rsplit("-", 1)[1])
+            rank = int(labels["host"].rsplit("-", 1)[1])
             return rank == mlrun.mlconf.packagers.logging_worker
 
         # Single worker is always the logging worker:
@@ -1004,7 +1011,6 @@ class MLClientCtx(object):
             _struct[key] = val
 
         struct = {
-            "metadata.labels": self._labels,
             "metadata.annotations": self._annotations,
             "spec.parameters": self._parameters,
             "spec.outputs": self._outputs,
@@ -1019,6 +1025,9 @@ class MLClientCtx(object):
         if self._state != "completed":
             struct["status.state"] = self._state
 
+        if self.is_logging_worker():
+            struct["metadata.labels"] = self._labels
+
         set_if_not_none(struct, "status.error", self._error)
         set_if_not_none(struct, "status.commit", self._commit)
         set_if_not_none(struct, "status.iterations", self._iteration_results)
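The `is_logging_worker` check above parses the OpenMPI worker rank out of the pod name. A self-contained sketch of that logic (hypothetical helper, assuming the k8s `...-worker-<rank>` naming template shown in the code comments):

```python
def is_logging_worker(host: str, logging_worker_rank: int = 0) -> bool:
    # k8s names each OpenMPI worker pod "...-worker-<rank>";
    # the rank is the suffix after the last dash.
    rank = int(host.rsplit("-", 1)[1])
    return rank == logging_worker_rank

assert is_logging_worker("train-abc-worker-0")
assert not is_logging_worker("train-abc-worker-3")
```

The `"host" in labels` guard added in this release matters because, per the first hunk, the host label is now set on every worker but stored only by the logging worker, so a worker may evaluate this check before the label exists.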
mlrun/feature_store/api.py CHANGED
@@ -933,7 +933,7 @@ def _deploy_ingestion_service_v2(
             source = HTTPSource()
             func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
             config = RunConfig(function=func)
-            fstore.deploy_ingestion_service(my_set, source, run_config=config)
+            my_set.deploy_ingestion_service(source, run_config=config)
 
     :param featureset: feature set object or uri
     :param source: data source object describing the online or offline source
@@ -1025,7 +1025,7 @@ def deploy_ingestion_service(
             source = HTTPSource()
             func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
             config = RunConfig(function=func)
-            fstore.deploy_ingestion_service(my_set, source, run_config=config)
+            my_set.deploy_ingestion_service(source, run_config=config)
 
     :param featureset: feature set object or uri
     :param source: data source object describing the online or offline source
@@ -1036,8 +1036,7 @@ def deploy_ingestion_service(
 
     :return: URL to access the deployed ingestion service
     """
-    endpoint, _ = deploy_ingestion_service_v2(
-        featureset=featureset,
+    endpoint, _ = featureset.deploy_ingestion_service(
         source=source,
         targets=targets,
         name=name,
mlrun/launcher/base.py CHANGED
@@ -396,10 +396,10 @@ class BaseLauncher(abc.ABC):
                 status=run.status.state,
                 name=run.metadata.name,
             )
-            if run.status.state in [
-                mlrun.runtimes.constants.RunStates.error,
-                mlrun.runtimes.constants.RunStates.aborted,
-            ]:
+            if (
+                run.status.state
+                in mlrun.runtimes.constants.RunStates.error_and_abortion_states()
+            ):
                 if runtime._is_remote and not runtime.is_child:
                     logger.error(
                         "Run did not finish successfully",
mlrun/lists.py CHANGED
@@ -227,9 +227,3 @@ class ArtifactList(list):
             if artifact:
                 dataitems.append(mlrun.get_dataitem(artifact))
         return dataitems
-
-
-class FunctionList(list):
-    def __init__(self):
-        pass
-        # TODO
mlrun/model.py CHANGED
@@ -1259,8 +1259,15 @@ class RunObject(RunTemplate):
         """error string if failed"""
         if self.status:
             unknown_error = ""
-            if self.status.state in mlrun.runtimes.constants.RunStates.error_states():
+            if (
+                self.status.state
+                in mlrun.runtimes.constants.RunStates.abortion_states()
+            ):
+                unknown_error = "Run was aborted"
+
+            elif self.status.state in mlrun.runtimes.constants.RunStates.error_states():
                 unknown_error = "Unknown error"
+
             return (
                 self.status.error
                 or self.status.reason
mlrun/model_monitoring/api.py CHANGED
@@ -132,7 +132,6 @@ def record_results(
     drift_threshold: typing.Optional[float] = None,
     possible_drift_threshold: typing.Optional[float] = None,
     trigger_monitoring_job: bool = False,
-    last_in_batch_set: typing.Optional[bool] = True,
     artifacts_tag: str = "",
     default_batch_image="mlrun/mlrun",
 ) -> ModelEndpoint:
@@ -165,14 +164,6 @@ def record_results(
     :param possible_drift_threshold: The threshold of which to mark possible drifts.
     :param trigger_monitoring_job: If true, run the batch drift job. If not exists, the monitoring batch function
                                    will be registered through MLRun API with the provided image.
-    :param last_in_batch_set: This flag can (and should only) be used when the model endpoint does not have
-                              model-monitoring set.
-                              If set to `True` (the default), this flag marks the current monitoring window
-                              (on this monitoring endpoint) is completed - the data inferred so far is assumed
-                              to be the total data for this monitoring window.
-                              You may want to set this flag to `False` if you want to record multiple results in
-                              close time proximity ("batch set"). In this case, set this flag to `False` on all
-                              but the last batch in the set.
     :param artifacts_tag: Tag to use for all the artifacts resulted from the function. Will be relevant
                           only if the monitoring batch job has been triggered.
 
@@ -206,25 +197,14 @@ def record_results(
     )
 
     if model_endpoint.spec.stream_path == "":
-        if last_in_batch_set:
-            logger.info(
-                "Updating the last request time to mark the current monitoring window as completed",
-                project=project,
-                endpoint_id=model_endpoint.metadata.uid,
-            )
-            bump_model_endpoint_last_request(
-                project=project, model_endpoint=model_endpoint, db=db
-            )
-        else:
-            if last_in_batch_set is not None:
-                logger.warning(
-                    "`last_in_batch_set` is not `None`, but the model endpoint has a stream path. "
-                    "Ignoring `last_in_batch_set`, as it is relevant only when the model "
-                    "endpoint does not have a model monitoring infrastructure in place (i.e. stream path is "
-                    " empty). Set `last_in_batch_set` to `None` to resolve this warning.",
-                    project=project,
-                    endpoint_id=model_endpoint.metadata.uid,
-                )
+        logger.info(
+            "Updating the last request time to mark the current monitoring window as completed",
+            project=project,
+            endpoint_id=model_endpoint.metadata.uid,
+        )
+        bump_model_endpoint_last_request(
+            project=project, model_endpoint=model_endpoint, db=db
+        )
 
     if trigger_monitoring_job:
         # Run the monitoring batch drift job
@@ -612,9 +592,7 @@ def read_dataset_as_dataframe(
         if label_columns is None:
            label_columns = dataset.status.label_column
         # Get the features and parse to DataFrame:
-        dataset = mlrun.feature_store.get_offline_features(
-            dataset.uri, drop_columns=drop_columns
-        ).to_dataframe()
+        dataset = dataset.get_offline_features(drop_columns=drop_columns).to_dataframe()
 
     elif isinstance(dataset, (list, np.ndarray)):
         if not feature_columns:
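This hunk, like the `deploy_ingestion_service` and controller changes elsewhere in this release, moves `get_offline_features` from a module-level function to a method on the feature-set/vector object. A minimal usage sketch under that assumption (the function name and column list are illustrative):

```python
import mlrun.feature_store as fstore

def load_features_df(vector: fstore.FeatureVector, drop_columns: list[str]):
    # New style per this diff: call get_offline_features on the object
    # rather than fstore.get_offline_features(vector, ...).
    return vector.get_offline_features(drop_columns=drop_columns).to_dataframe()
```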
mlrun/model_monitoring/batch.py CHANGED
@@ -117,20 +117,21 @@ class KullbackLeiblerDivergence(HistogramDistanceMetric, metric_name="kld"):
     def _calc_kl_div(
         actual_dist: np.array, expected_dist: np.array, kld_scaling: float
     ) -> float:
-        """Return the assymetric KL divergence"""
+        """Return the asymmetric KL divergence"""
+        # We take 0*log(0) == 0 for this calculation
+        mask = actual_dist != 0
+        actual_dist = actual_dist[mask]
+        expected_dist = expected_dist[mask]
         return np.sum(
-            np.where(
-                actual_dist != 0,
-                (actual_dist)
-                * np.log(
-                    actual_dist
-                    / np.where(expected_dist != 0, expected_dist, kld_scaling)
-                ),
-                0,
-            )
+            actual_dist
+            * np.log(
+                actual_dist / np.where(expected_dist != 0, expected_dist, kld_scaling)
+            ),
         )
 
-    def compute(self, capping: float = None, kld_scaling: float = 1e-4) -> float:
+    def compute(
+        self, capping: Optional[float] = None, kld_scaling: float = 1e-4
+    ) -> float:
         """
         :param capping: A bounded value for the KL Divergence. For infinite distance, the result is replaced with
                         the capping value which indicates a huge differences between the distributions.
@@ -141,8 +142,8 @@ class KullbackLeiblerDivergence(HistogramDistanceMetric, metric_name="kld"):
         t_u = self._calc_kl_div(self.distrib_t, self.distrib_u, kld_scaling)
         u_t = self._calc_kl_div(self.distrib_u, self.distrib_t, kld_scaling)
         result = t_u + u_t
-        if capping:
-            return capping if result == float("inf") else result
+        if capping and result == float("inf"):
+            return capping
         return result
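A quick standalone check of the masked computation above (an illustrative re-implementation, not the mlrun class itself):

```python
import numpy as np

def kl_div(actual: np.ndarray, expected: np.ndarray, kld_scaling: float = 1e-4) -> float:
    # Take 0 * log(0) == 0 by dropping bins where the actual distribution is 0.
    mask = actual != 0
    actual, expected = actual[mask], expected[mask]
    # Substitute kld_scaling where the expected distribution is 0 to avoid log of infinity.
    return float(np.sum(actual * np.log(actual / np.where(expected != 0, expected, kld_scaling))))

t = np.array([0.5, 0.5, 0.0])
u = np.array([0.25, 0.75, 0.0])
# Symmetrized, as in compute(): KL(t||u) + KL(u||t)
print(round(kl_div(t, u) + kl_div(u, t), 4))  # 0.2747
```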
mlrun/model_monitoring/controller.py CHANGED
@@ -17,7 +17,7 @@ import datetime
 import json
 import os
 import re
-from typing import Any, Iterator, Optional, Tuple, Union, cast
+from typing import Any, Iterator, NamedTuple, Optional, Union, cast
 
 from v3io.dataplane.response import HttpResponseError
 
@@ -35,10 +35,15 @@ from mlrun.model_monitoring.helpers import (
     get_monitoring_parquet_path,
     get_stream_path,
 )
-from mlrun.utils import logger
+from mlrun.utils import create_logger, datetime_now, logger
 from mlrun.utils.v3io_clients import get_v3io_client
 
 
+class _Interval(NamedTuple):
+    start: datetime.datetime
+    end: datetime.datetime
+
+
 class _BatchWindow:
     V3IO_CONTAINER_FORMAT = "users/pipelines/{project}/monitoring-schedules/functions"
 
@@ -60,7 +65,11 @@ class _BatchWindow:
         self._endpoint = endpoint
         self._application = application
         self._first_request = first_request
-        self._kv_storage = get_v3io_client(endpoint=mlrun.mlconf.v3io_api).kv
+        self._kv_storage = get_v3io_client(
+            endpoint=mlrun.mlconf.v3io_api,
+            # Avoid noisy warning logs before the KV table is created
+            logger=create_logger(name="v3io_client", level="error"),
+        ).kv
         self._v3io_container = self.V3IO_CONTAINER_FORMAT.format(project=project)
         self._stop = last_updated
         self._step = timedelta_seconds
@@ -75,24 +84,26 @@ class _BatchWindow:
             )
         except HttpResponseError as err:
             logger.info(
-                "Failed to get the last analyzed time for this endpoint and application, "
-                "as this is probably the first time this application is running. ",
-                "Using the latest between first request time or last update time minus one day instead.",
+                "No last analyzed time was found for this endpoint and "
+                "application, as this is probably the first time this "
+                "application is running. Using the latest between first "
+                "request time or last update time minus one day instead",
                 endpoint=self._endpoint,
                 application=self._application,
                 first_request=self._first_request,
-                last_update=self._stop,
-                error=err,
-            )
-
-            # TODO : Change the timedelta according to the policy.
-            first_period_in_seconds = max(
-                int(datetime.timedelta(days=1).total_seconds()), self._step
-            )  # max between one day and the base period
-            return max(
-                self._first_request,
-                self._stop - first_period_in_seconds,
+                last_updated=self._stop,
             )
+            logger.debug("Error while getting last analyzed time", err=err)
+            if self._first_request and self._stop:
+                # TODO : Change the timedelta according to the policy.
+                first_period_in_seconds = max(
+                    int(datetime.timedelta(days=1).total_seconds()), self._step
+                )  # max between one day and the base period
+                return max(
+                    self._first_request,
+                    self._stop - first_period_in_seconds,
+                )
+            return self._first_request
 
         last_analyzed = data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
         logger.info(
@@ -119,20 +130,29 @@ class _BatchWindow:
 
     def get_intervals(
         self,
-    ) -> Iterator[Tuple[datetime.datetime, datetime.datetime]]:
+    ) -> Iterator[_Interval]:
         """Generate the batch interval time ranges."""
         if self._start is not None and self._stop is not None:
             entered = False
-            for timestamp in range(self._start, self._stop, self._step):
+            # Iterate timestamp from start until timestamp <= stop - step
+            # so that the last interval will end at (timestamp + step) <= stop.
+            # Add 1 to stop - step to get <= and not <.
+            for timestamp in range(
+                self._start, self._stop - self._step + 1, self._step
+            ):
                 entered = True
-                start_time = datetime.datetime.utcfromtimestamp(timestamp)
-                end_time = datetime.datetime.utcfromtimestamp(timestamp + self._step)
-                yield start_time, end_time
+                start_time = datetime.datetime.fromtimestamp(
+                    timestamp, tz=datetime.timezone.utc
+                )
+                end_time = datetime.datetime.fromtimestamp(
+                    timestamp + self._step, tz=datetime.timezone.utc
+                )
+                yield _Interval(start_time, end_time)
                 self._update_last_analyzed(timestamp + self._step)
             if not entered:
                 logger.info(
                     "All the data is set, but no complete intervals were found. "
-                    "Wait for last_updated to be updated.",
+                    "Wait for last_updated to be updated",
                     endpoint=self._endpoint,
                     application=self._application,
                     start=self._start,
@@ -141,8 +161,8 @@ class _BatchWindow:
                 )
         else:
             logger.warn(
-                "The first request time is not not found for this endpoint. "
-                "No intervals will be generated.",
+                "The first request time is not found for this endpoint. "
+                "No intervals will be generated",
                 endpoint=self._endpoint,
                 application=self._application,
                 start=self._start,
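The new loop bound guarantees that only complete windows are yielded: the last interval ends at or before `stop`. A standalone sketch of the same arithmetic (illustrative, outside the mlrun class):

```python
import datetime

def intervals(start: int, stop: int, step: int):
    # Same bound as above: the last interval ends at (timestamp + step) <= stop.
    for ts in range(start, stop - step + 1, step):
        yield (
            datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc),
            datetime.datetime.fromtimestamp(ts + step, tz=datetime.timezone.utc),
        )

# With start=0, stop=250, step=100 only two full windows fit: [0, 100) and [100, 200).
# The old range(start, stop, step) would also have yielded the incomplete [200, 300).
print(len(list(intervals(0, 250, 100))))  # 2
```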
@@ -185,26 +205,38 @@ class _BatchWindowGenerator:
         )
 
     @classmethod
-    def _get_last_updated_time(cls, last_request: Optional[str]) -> Optional[int]:
+    def _get_last_updated_time(
+        cls, last_request: Optional[str], has_stream: bool
+    ) -> Optional[int]:
         """
         Get the last updated time of a model endpoint.
         """
         if not last_request:
             return None
-        return int(
+        last_updated = int(
             cls._date_string2timestamp(last_request)
             - cast(
                 float,
                 mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
             )
         )
+        if not has_stream:
+            # If the endpoint does not have a stream, `last_updated` should be
+            # the minimum between the current time and the last updated time.
+            # This compensates for the bumping mechanism - see
+            # `bump_model_endpoint_last_request`.
+            last_updated = min(int(datetime_now().timestamp()), last_updated)
+            logger.debug(
+                "The endpoint does not have a stream", last_updated=last_updated
+            )
+        return last_updated
 
     @classmethod
     def _normalize_first_request(
         cls, first_request: Optional[str], endpoint: str
     ) -> Optional[int]:
         if not first_request:
-            logger.warn(
+            logger.debug(
                 "There is no first request time for this endpoint.",
                 endpoint=endpoint,
                 first_request=first_request,
@@ -223,6 +255,7 @@ class _BatchWindowGenerator:
         application: str,
         first_request: Optional[str],
         last_request: Optional[str],
+        has_stream: bool,
     ) -> _BatchWindow:
         """
         Get the batch window for a specific endpoint and application.
@@ -234,7 +267,7 @@ class _BatchWindowGenerator:
             endpoint=endpoint,
             application=application,
             timedelta_seconds=self._timedelta,
-            last_updated=self._get_last_updated_time(last_request),
+            last_updated=self._get_last_updated_time(last_request, has_stream),
             first_request=self._normalize_first_request(first_request, endpoint),
         )
@@ -259,20 +292,12 @@ class MonitoringApplicationController:
         """
         self.context = context
         self.project = project
+        self.project_obj = mlrun.get_or_create_project(project)
 
-        logger.info(
-            "Initializing MonitoringApplicationController",
-            project=project,
-        )
-
-        # Get a runtime database
+        context.logger.debug(f"Initializing {self.__class__.__name__}", project=project)
 
         self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
 
-        # If an error occurs, it will be raised using the following argument
-        self.endpoints_exceptions = {}
-
-        # The batch window
         self._batch_window_generator = _BatchWindowGenerator(
             batch_dict=context.parameters[
                 mm_constants.EventFieldType.BATCH_INTERVALS_DICT
@@ -285,7 +310,7 @@ class MonitoringApplicationController:
         )
         self.model_monitoring_access_key = self._get_model_monitoring_access_key()
         self.parquet_directory = get_monitoring_parquet_path(
-            project=project,
+            self.project_obj,
             kind=mm_constants.FileTargetKind.APPS_PARQUET,
         )
         self.storage_options = None
@@ -311,21 +336,23 @@ class MonitoringApplicationController:
 
     def run(self):
         """
-        Main method for run all the relevant monitoring application on each endpoint
+        Main method for run all the relevant monitoring applications on each endpoint
         """
         try:
             endpoints = self.db.list_model_endpoints(uids=self.model_endpoints)
-            application = mlrun.get_or_create_project(
-                self.project
-            ).list_model_monitoring_functions()
-            if application:
-                applications_names = list({app.metadata.name for app in application})
+            monitoring_functions = self.project_obj.list_model_monitoring_functions()
+            if monitoring_functions:
+                applications_names = list(
+                    {app.metadata.name for app in monitoring_functions}
+                )
             else:
-                logger.info("There are no monitoring application found in this project")
+                self.context.logger.info(
+                    "No monitoring functions found", project=self.project
+                )
                 applications_names = []
 
         except Exception as e:
-            logger.error("Failed to list endpoints", exc=e)
+            self.context.logger.error("Failed to list endpoints", exc=e)
             return
         if endpoints and applications_names:
             # Initialize a process pool that will be used to run each endpoint applications on a dedicated process
@@ -362,9 +389,7 @@ class MonitoringApplicationController:
                     futures.append(future)
 
             for future in concurrent.futures.as_completed(futures):
-                res = future.result()
-                if res:
-                    self.endpoints_exceptions[res[0]] = res[1]
+                future.result()
 
             self._delete_old_parquet(endpoints=endpoints)
 
@@ -378,7 +403,7 @@ class MonitoringApplicationController:
         parquet_directory: str,
         storage_options: dict,
         model_monitoring_access_key: str,
-    ) -> Optional[Tuple[str, Exception]]:
+    ) -> None:
         """
         Process a model endpoint and trigger the monitoring applications. This function running on different process
         for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
@@ -413,6 +438,7 @@ class MonitoringApplicationController:
             application=application,
             first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
             last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
+            has_stream=endpoint[mm_constants.EventFieldType.STREAM_PATH] != "",
        )
 
        for start_infer_time, end_infer_time in batch_window.get_intervals():
@@ -432,22 +458,18 @@ class MonitoringApplicationController:
                 parquet_target_path = offline_response.vector.get_target_path()
 
                 if len(df) == 0:
-                    logger.warn(
-                        "Not enough model events since the beginning of the batch interval",
-                        featureset_name=m_fs.metadata.name,
+                    logger.info(
+                        "During this time window, the endpoint has not received any data",
                         endpoint=endpoint[mm_constants.EventFieldType.UID],
-                        min_required_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
                         start_time=start_infer_time,
                         end_time=end_infer_time,
                     )
                     continue
 
-            # Continue if not enough events provided since the deployment of the model endpoint
             except FileNotFoundError:
                 logger.warn(
-                    "Parquet not found, probably due to not enough model events",
+                    "No parquets were written yet",
                     endpoint=endpoint[mm_constants.EventFieldType.UID],
-                    min_required_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
                 )
                 continue
 
@@ -481,12 +503,11 @@ class MonitoringApplicationController:
                     model_monitoring_access_key=model_monitoring_access_key,
                     parquet_target_path=parquet_target_path,
                 )
-            except Exception as e:
-                logger.error(
+            except Exception:
+                logger.exception(
                     "Encountered an exception",
                     endpoint_id=endpoint[mm_constants.EventFieldType.UID],
                 )
-                return endpoint_id, e
 
     def _delete_old_parquet(self, endpoints: list[dict[str, Any]], days: int = 1):
         """
@@ -500,12 +521,14 @@ class MonitoringApplicationController:
             self.parquet_directory,
             {"V3IO_ACCESS_KEY": self.model_monitoring_access_key},
         )
-        fs = store.get_filesystem()
+        fs = store.filesystem
 
         # calculate time threshold (keep only files from the last 24 hours)
-        time_to_keep = float(
-            (datetime.datetime.now() - datetime.timedelta(days=days)).strftime("%s")
-        )
+        time_to_keep = (
+            datetime.datetime.now(tz=datetime.timezone.utc)
+            - datetime.timedelta(days=days)
+        ).timestamp()
+
        for endpoint in endpoints:
            try:
                apps_parquet_directories = fs.listdir(
@@ -619,14 +642,13 @@ class MonitoringApplicationController:
 
         # get offline features based on application start and end time.
         # store the result parquet by partitioning by controller end processing time
-        offline_response = fstore.get_offline_features(
-            feature_vector=vector,
+        offline_response = vector.get_offline_features(
             start_time=start_infer_time,
             end_time=end_infer_time,
             timestamp_for_filtering=mm_constants.EventFieldType.TIMESTAMP,
             target=ParquetTarget(
                 path=parquet_directory
-                + f"/key={endpoint_id}/{start_infer_time.strftime('%s')}/{application_name}.parquet",
+                + f"/key={endpoint_id}/{int(start_infer_time.timestamp())}/{application_name}.parquet",
                 storage_options=storage_options,
             ),
         )
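Two of the hunks above replace `strftime('%s')` with `.timestamp()`. The `%s` directive is a non-portable glibc extension (it is not supported on Windows) and, on timezone-aware datetimes, it formats using the local timezone rather than the attached tzinfo, whereas `.timestamp()` is well-defined everywhere. A small illustration:

```python
import datetime

dt = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)
# Portable, timezone-correct epoch seconds:
print(int(dt.timestamp()))  # 1704067200
# dt.strftime("%s") is platform-dependent and may silently use the local
# timezone instead of UTC, shifting partition keys and retention cutoffs.
```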
mlrun/model_monitoring/controller_handler.py CHANGED
@@ -16,7 +16,7 @@ import mlrun
 from mlrun.model_monitoring.controller import MonitoringApplicationController
 
 
-def handler(context: mlrun.run.MLClientCtx):
+def handler(context: mlrun.run.MLClientCtx) -> None:
     """
     Run model monitoring application processor
 
@@ -27,5 +27,3 @@ def handler(context: mlrun.run.MLClientCtx):
         project=context.project,
     )
     monitor_app_controller.run()
-    if monitor_app_controller.endpoints_exceptions:
-        context.logger.error(monitor_app_controller.endpoints_exceptions)