mlrun 1.8.0rc44__py3-none-any.whl → 1.8.0rc46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

@@ -289,6 +289,11 @@ class ModelMonitoringMode(StrEnum):
     disabled = "disabled"
 
 
+class ScheduleChiefFields(StrEnum):
+    LAST_REQUEST = "last_request"
+    LAST_ANALYZED = "last_analyzed"
+
+
 class EndpointType(IntEnum):
     NODE_EP = 1  # end point that is not a child of a router
     ROUTER = 2  # endpoint that is router
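The new `ScheduleChiefFields` members are plain strings at runtime thanks to the StrEnum base, which makes them convenient as keys in the chief schedules file. A minimal standalone sketch (using Python 3.11's `enum.StrEnum` for illustration; the record layout is illustrative, not taken from this diff):

from enum import StrEnum  # stdlib in Python 3.11+

class ScheduleChiefFields(StrEnum):
    LAST_REQUEST = "last_request"
    LAST_ANALYZED = "last_analyzed"

# StrEnum members hash and compare like their string values,
# so they can key JSON-style dicts directly
record = {
    ScheduleChiefFields.LAST_REQUEST: 1700000000,
    ScheduleChiefFields.LAST_ANALYZED: 1699990000,
}
assert record["last_request"] == 1700000000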
mlrun/config.py CHANGED
@@ -549,6 +549,10 @@ default_config = {
         },
     },
     "model_endpoint_monitoring": {
+        # Scaling Rule
+        # The fundamental scaling rule to maintain is: Shards/Partitions = Replicas * Workers
+        # In other words, the number of shards (V3IO) or partitions (Kafka) must be equal to the
+        # total number of worker processes across all pods.
        "serving_stream": {
            "v3io": {
                "shard_count": 2,
@@ -822,6 +826,8 @@ default_config = {
         # maximum allowed alert config cache size in alert's CRUD
         # for the best performance, it is recommended to set this value to the maximum number of alerts
         "max_allowed_cache_size": 20000,
+        # default limit for listing alert configs
+        "default_list_alert_configs_limit": 2000,
     },
     "auth_with_client_id": {
         "enabled": False,
@@ -27,8 +27,12 @@ class BaseDataInfer:
     get_stats = None
 
 
+def is_spark_dataframe(df) -> bool:
+    return "rdd" in dir(df)
+
+
 def get_infer_interface(df) -> BaseDataInfer:
-    if hasattr(df, "rdd"):
+    if is_spark_dataframe(df):
         from .spark import SparkDataInfer
 
         return SparkDataInfer
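The move from `hasattr(df, "rdd")` to `"rdd" in dir(df)` is more than style: `hasattr` actually invokes the attribute, and on a PySpark DataFrame `rdd` is a property whose getter does real work, while a `dir()` lookup only inspects the namespace. A small illustration (the `Lazy` class is a stand-in, not PySpark):

class Lazy:
    @property
    def rdd(self):
        print("expensive conversion triggered")  # side effect we want to avoid
        return object()

df = Lazy()
print("rdd" in dir(df))    # True -- the property getter is NOT invoked
print(hasattr(df, "rdd"))  # True -- but the getter runs as a side effect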
@@ -40,7 +40,7 @@ from mlrun.utils.helpers import to_parquet
 from mlrun.utils.v3io_clients import get_frames_client
 
 from .. import errors
-from ..data_types import ValueType
+from ..data_types import ValueType, is_spark_dataframe
 from ..platforms.iguazio import parse_path, split_path
 from .datastore_profile import datastore_profile_read
 from .spark_utils import spark_session_update_hadoop_options
@@ -86,8 +86,10 @@ def generate_target_run_id():
 
 
 def write_spark_dataframe_with_options(spark_options, df, mode, write_format=None):
+    # TODO: Replace with just df.sparkSession when Spark 3.2 support is dropped
+    spark_session = getattr(df, "sparkSession", None) or df.sql_ctx.sparkSession
     non_hadoop_spark_options = spark_session_update_hadoop_options(
-        df.sql_ctx.sparkSession, spark_options
+        spark_session, spark_options
     )
     if write_format:
         df.write.format(write_format).mode(mode).save(**non_hadoop_spark_options)
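`DataFrame.sparkSession` only became a public attribute in later Spark releases (hence the TODO's reference to Spark 3.2), so the new line probes for it and falls back to the legacy `sql_ctx` path. The same pattern in isolation, as a sketch:

def get_spark_session(df):
    # newer Spark exposes df.sparkSession; older versions reach the
    # session through the deprecated df.sql_ctx
    return getattr(df, "sparkSession", None) or df.sql_ctx.sparkSession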
@@ -510,7 +512,7 @@ class BaseStoreTarget(DataTargetBase):
         chunk_id=0,
         **kwargs,
     ) -> Optional[int]:
-        if hasattr(df, "rdd"):
+        if is_spark_dataframe(df):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df, key_column, timestamp_key, options)
@@ -1376,7 +1378,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
-        if hasattr(df, "rdd"):
+        if is_spark_dataframe(df):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df)
@@ -2108,7 +2110,7 @@ class SQLTarget(BaseStoreTarget):
 
         self._create_sql_table()
 
-        if hasattr(df, "rdd"):
+        if is_spark_dataframe(df):
             raise ValueError("Spark is not supported")
         else:
             (
mlrun/db/base.py CHANGED
@@ -889,7 +889,9 @@ class RunDBInterface(ABC):
         pass
 
     @abstractmethod
-    def list_alerts_configs(self, project=""):
+    def list_alerts_configs(
+        self, project="", limit: Optional[int] = None, offset: Optional[int] = None
+    ):
         pass
 
     @abstractmethod
@@ -1105,12 +1107,6 @@ class RunDBInterface(ABC):
     ) -> bool:
         pass
 
-    @abstractmethod
-    def deploy_histogram_data_drift_app(
-        self, project: str, image: str = "mlrun/mlrun"
-    ) -> None:
-        pass
-
     @abstractmethod
     def set_model_monitoring_credentials(
         self,
mlrun/db/httpdb.py CHANGED
@@ -4080,21 +4080,6 @@ class HTTPRunDB(RunDBInterface):
             deletion_failed = True
         return not deletion_failed
 
-    def deploy_histogram_data_drift_app(
-        self, project: str, image: str = "mlrun/mlrun"
-    ) -> None:
-        """
-        Deploy the histogram data drift application.
-
-        :param project: Project name.
-        :param image:   The image on which the application will run.
-        """
-        self.api_call(
-            method=mlrun.common.types.HTTPMethod.PUT,
-            path=f"projects/{project}/model-monitoring/histogram-data-drift-app",
-            params={"image": image},
-        )
-
     def set_model_monitoring_credentials(
         self,
         project: str,
@@ -4818,20 +4803,33 @@ class HTTPRunDB(RunDBInterface):
         response = self.api_call("GET", endpoint_path, error_message)
         return AlertConfig.from_dict(response.json())
 
-    def list_alerts_configs(self, project="") -> list[AlertConfig]:
+    def list_alerts_configs(
+        self, project="", limit: Optional[int] = None, offset: Optional[int] = None
+    ) -> list[AlertConfig]:
         """
         Retrieve list of alerts of a project.
 
         :param project: The project name.
+        :param limit:   The maximum number of alerts to return.
+                        Defaults to `mlconf.alerts.default_list_alert_configs_limit` if not provided.
+        :param offset:  The number of alerts to skip.
 
         :returns: All the alerts objects of the project.
         """
         project = project or config.default_project
         endpoint_path = f"projects/{project}/alerts"
         error_message = f"get alerts {project}/alerts"
-        response = self.api_call("GET", endpoint_path, error_message).json()
+        params = {}
+        # TODO: Deprecate limit and offset when pagination is implemented
+        if limit:
+            params["page-size"] = limit
+        if offset:
+            params["offset"] = offset
+        response = self.api_call(
+            "GET", endpoint_path, error_message, params=params
+        ).json()
         results = []
-        for item in response:
+        for item in response.get("alerts", []):
             results.append(AlertConfig(**item))
         return results
 
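A usage sketch of the paginated listing (the project name is a placeholder). Note that because the diff guards with truthiness (`if limit:`), passing `limit=0` or `offset=0` behaves the same as leaving them unset:

import mlrun

db = mlrun.get_run_db()
first_page = db.list_alerts_configs(project="my-project", limit=100)
next_page = db.list_alerts_configs(project="my-project", limit=100, offset=100)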
mlrun/db/nopdb.py CHANGED
@@ -883,11 +883,6 @@ class NopDB(RunDBInterface):
     ) -> bool:
         pass
 
-    def deploy_histogram_data_drift_app(
-        self, project: str, image: str = "mlrun/mlrun"
-    ) -> None:
-        pass
-
     def set_model_monitoring_credentials(
         self,
         project: str,
@@ -50,8 +50,8 @@ DatasetType = typing.Union[
 
 def get_or_create_model_endpoint(
     project: str,
+    model_endpoint_name: str,
     model_path: str = "",
-    model_endpoint_name: str = "",
     endpoint_id: str = "",
     function_name: str = "",
     function_tag: str = "latest",
@@ -59,6 +59,7 @@ def get_or_create_model_endpoint(
     sample_set_statistics: typing.Optional[dict[str, typing.Any]] = None,
     monitoring_mode: mm_constants.ModelMonitoringMode = mm_constants.ModelMonitoringMode.enabled,
     db_session=None,
+    feature_analysis: bool = False,
 ) -> ModelEndpoint:
     """
     Get a single model endpoint object. If not exist, generate a new model endpoint with the provided parameters. Note
@@ -66,9 +67,9 @@ def get_or_create_model_endpoint(
     features, set `monitoring_mode=enabled`.
 
     :param project:             Project name.
-    :param model_path:          The model store path (applicable only to new endpoint_id).
     :param model_endpoint_name: If a new model endpoint is created, the model endpoint name will be presented
                                 under this endpoint (applicable only to new endpoint_id).
+    :param model_path:          The model store path (applicable only to new endpoint_id).
     :param endpoint_id:         Model endpoint unique ID. If not exist in DB, will generate a new record based
                                 on the provided `endpoint_id`.
     :param function_name:       If a new model endpoint is created, use this function name.
@@ -80,6 +81,7 @@ def get_or_create_model_endpoint(
     :param monitoring_mode:     If enabled, apply model monitoring features on the provided endpoint id
                                 (applicable only to new endpoint_id).
     :param db_session:          A runtime session that manages the current dialog with the database.
+    :param feature_analysis:    If True, the model endpoint will be retrieved with the feature analysis mode.
 
     :return: A ModelEndpoint object
     """
@@ -99,6 +101,7 @@ def get_or_create_model_endpoint(
         endpoint_id=endpoint_id,
         function_name=function_name,
         function_tag=function_tag or "latest",
+        feature_analysis=feature_analysis,
     )
     # If other fields provided, validate that they correspond to the existing model endpoint data
     _model_endpoint_validations(
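Taken together, these hunks make `model_endpoint_name` a required parameter (it moved ahead of `model_path` and lost its default) and thread the new `feature_analysis` flag through to the retrieval call. A hedged call sketch (assuming the function lives in `mlrun.model_monitoring.api`, as the `DatasetType` context suggests; names are placeholders):

from mlrun.model_monitoring.api import get_or_create_model_endpoint

endpoint = get_or_create_model_endpoint(
    project="my-project",
    model_endpoint_name="my-model-endpoint",  # now required, no default
    model_path="store://models/my-project/my-model:latest",
    feature_analysis=True,  # new: retrieve the endpoint with feature analysis mode
)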
@@ -76,7 +76,6 @@ class MonitoringApplicationContext:
     :param sample_df:        (pd.DataFrame) The new sample DataFrame.
     :param start_infer_time: (pd.Timestamp) Start time of the monitoring schedule.
     :param end_infer_time:   (pd.Timestamp) End time of the monitoring schedule.
-    :param latest_request:   (pd.Timestamp) Timestamp of the latest request on this endpoint_id.
     :param endpoint_id:      (str) ID of the monitored model endpoint
     :param feature_set:      (FeatureSet) the model endpoint feature set
     :param endpoint_name:    (str) Name of the monitored model endpoint
@@ -208,6 +207,20 @@ class MonitoringApplicationContext:
     @property
     def sample_df(self) -> pd.DataFrame:
         if self._sample_df is None:
+            if (
+                self.endpoint_name is None
+                or self.endpoint_id is None
+                or pd.isnull(self.start_infer_time)
+                or pd.isnull(self.end_infer_time)
+            ):
+                raise mlrun.errors.MLRunValueError(
+                    "You have tried to access `monitoring_context.sample_df`, but have not provided it directly "
+                    "through `sample_data`, nor have you provided the model endpoint's name, ID, and the start and "
+                    f"end times: `endpoint_name`={self.endpoint_name}, `endpoint_uid`={self.endpoint_id}, "
+                    f"`start`={self.start_infer_time}, and `end`={self.end_infer_time}. "
+                    "You can either provide the sample dataframe directly, the model endpoint's details and times, "
+                    "or adapt the application's logic to not access the sample dataframe."
+                )
             feature_set = self.feature_set
             features = [f"{feature_set.metadata.name}.*"]
             vector = fstore.FeatureVector(
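For applications that may run without a sample window, the new guard turns a confusing downstream failure into an explicit `MLRunValueError`. A defensive-access sketch inside an application (the class and fallback are illustrative; `do_tracking` and the import path follow the mlrun 1.7+ application API):

import pandas as pd

import mlrun.errors
from mlrun.model_monitoring.applications import ModelMonitoringApplicationBase

class MyApp(ModelMonitoringApplicationBase):
    def do_tracking(self, monitoring_context):
        try:
            df = monitoring_context.sample_df
        except mlrun.errors.MLRunValueError:
            df = pd.DataFrame()  # no sample data available; skip or fall back
        ...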
@@ -107,16 +107,14 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
     * JSON with the general drift value per feature, produced by default.
     * Plotly table with the various metrics and histograms per feature (disabled by default due to performance issues).
 
-    This application is deployed by default when calling:
-
-    .. code-block:: python
-
-        project.enable_model_monitoring()
-
+    This application is deployed by default when calling
+    :py:func:`~mlrun.projects.MlrunProject.enable_model_monitoring`.
     To avoid it, pass :code:`deploy_histogram_data_drift_app=False`.
 
     If you want to change the application defaults, such as the classifier or which artifacts to produce, you
     need to inherit from this class and deploy it as any other model monitoring application.
+    Please make sure to keep the default application name. This ensures that the full functionality of the application,
+    including the statistics view in the UI, is available.
     """
 
     NAME: Final[str] = HistogramDataDriftApplicationConstants.NAME
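Per the new docstring note, a subclass should customize behavior without renaming the application; a sketch (the module path is an assumption based on the class name):

from mlrun.model_monitoring.applications.histogram_data_drift import (
    HistogramDataDriftApplication,
)

class MyDriftApp(HistogramDataDriftApplication):
    def __init__(self) -> None:
        # enable the Plotly table that is off by default
        super().__init__(produce_plotly_artifact=True)
    # NAME is inherited unchanged so the UI statistics view keeps working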
@@ -140,8 +138,8 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         produce_plotly_artifact: bool = False,
     ) -> None:
         """
-        :param value_classifier: Classifier object that adheres to the `ValueClassifier` protocol.
-                                 If not provided, the default `DataDriftClassifier()` is used.
+        :param value_classifier: Classifier object that adheres to the :py:class:`~ValueClassifier` protocol.
+                                 If not provided, the default :py:class:`~DataDriftClassifier` is used.
         """
         self._value_classifier = value_classifier or DataDriftClassifier()
         assert self._REQUIRED_METRICS <= set(
@@ -181,10 +179,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         return metrics_per_feature
 
     def _get_general_drift_result(
-        self,
-        metrics: list[mm_results.ModelMonitoringApplicationMetric],
-        monitoring_context: mm_context.MonitoringApplicationContext,
-        metrics_per_feature: DataFrame,
+        self, metrics: list[mm_results.ModelMonitoringApplicationMetric]
     ) -> mm_results.ModelMonitoringApplicationResult:
         """Get the general drift result from the metrics list"""
         value = cast(
@@ -237,7 +232,8 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         monitoring_context: mm_context.MonitoringApplicationContext,
     ) -> list[mm_results._ModelMonitoringApplicationStats]:
         """
-        list the application calculated stats
+        Return a list of the statistics.
+
         :param metrics:             the calculated metrics
         :param metrics_per_feature: metric calculated per feature
         :param monitoring_context:  context object for current monitoring application
@@ -376,11 +372,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
         )
         monitoring_context.logger.debug("Computing average per metric")
         metrics = self._get_metrics(metrics_per_feature)
-        result = self._get_general_drift_result(
-            metrics=metrics,
-            monitoring_context=monitoring_context,
-            metrics_per_feature=metrics_per_feature,
-        )
+        result = self._get_general_drift_result(metrics=metrics)
         stats = self._get_stats(
             metrics=metrics,
             monitoring_context=monitoring_context,
@@ -28,6 +28,7 @@ import mlrun
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.feature_store as fstore
 import mlrun.model_monitoring
+import mlrun.model_monitoring.db._schedules as schedules
 import mlrun.model_monitoring.helpers
 from mlrun.common.schemas import EndpointType
 from mlrun.common.schemas.model_monitoring.constants import (
@@ -36,7 +37,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
     ControllerEventKind,
 )
 from mlrun.errors import err_to_str
-from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
 from mlrun.model_monitoring.helpers import batch_dict2timedelta
 from mlrun.utils import datetime_now, logger
 
@@ -53,7 +53,7 @@ class _BatchWindow:
     def __init__(
         self,
         *,
-        schedules_file: ModelMonitoringSchedulesFile,
+        schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
         application: str,
         timedelta_seconds: int,
         last_updated: int,
@@ -153,7 +153,7 @@ class _BatchWindowGenerator(AbstractContextManager):
         self._project = project
         self._endpoint_id = endpoint_id
         self._timedelta = window_length
-        self._schedules_file = ModelMonitoringSchedulesFile(
+        self._schedules_file = schedules.ModelMonitoringSchedulesFileEndpoint(
             project=project, endpoint_id=endpoint_id
         )
 
@@ -273,6 +273,7 @@ class MonitoringApplicationController:
         endpoint: mlrun.common.schemas.ModelEndpoint,
         application_names: set,
         base_period_minutes: int,
+        schedules_file: schedules.ModelMonitoringSchedulesFileChief,
     ) -> bool:
         """
         checks if there is a need to monitor the given endpoint; we should monitor the endpoint if it stands in the
@@ -281,11 +282,23 @@ class MonitoringApplicationController:
        2. first request exists
        3. last request exists
        4. endpoint_type is not ROUTER
-        if the four above conditions apply we require one of the three conditions to monitor:
+        if the four conditions above apply, we require one of the following two conditions to monitor:
        1. never monitored one of the endpoint applications, meaning min_last_analyzed is None
-        2. last request has a higher timestamp than the min_last_analyzed timestamp
-        3. We didn't analyze one of the applications for more than _MAX_OPEN_WINDOWS_ALLOWED windows
+        2. min_last_analyzed meets the condition for sending a NOP event, and this is the first time a regular
+        event is sent for the current combination of last_request and last_analyzed for this endpoint.
        """
+        last_timestamp_sent = schedules_file.get_endpoint_last_request(
+            endpoint.metadata.uid
+        )
+        last_analyzed_sent = schedules_file.get_endpoint_last_analyzed(
+            endpoint.metadata.uid
+        )
+        logger.debug(
+            "Chief should monitor endpoint check",
+            last_timestamp_sent=last_timestamp_sent,
+            last_analyzed_sent=last_analyzed_sent,
+            uid=endpoint.metadata.uid,
+        )
        if (
            # Is the model endpoint monitored?
            endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
@@ -300,26 +313,43 @@ class MonitoringApplicationController:
                 project=endpoint.metadata.project,
                 endpoint_id=endpoint.metadata.uid,
             ) as batch_window_generator:
-                base_period_seconds = base_period_minutes * _SECONDS_IN_MINUTE
-                if application_names != batch_window_generator.get_application_list():
+                current_time = mlrun.utils.datetime_now()
+                current_min_last_analyzed = (
+                    batch_window_generator.get_min_last_analyzed()
+                )
+                if (
+                    # Different application names, or last analyzed never updated while there are applications to monitor
+                    application_names
+                    and (
+                        application_names
+                        != batch_window_generator.get_application_list()
+                        or not current_min_last_analyzed
+                    )
+                ):
                     return True
                 elif (
-                    not batch_window_generator.get_min_last_analyzed()
-                    or batch_window_generator.get_min_last_analyzed()
-                    <= int(endpoint.status.last_request.timestamp())
-                    or mlrun.utils.datetime_now().timestamp()
-                    - batch_window_generator.get_min_last_analyzed()
-                    >= self._MAX_OPEN_WINDOWS_ALLOWED * base_period_seconds
+                    # Will a NOP event be sent to close the relevant window?
+                    self._should_send_nop_event(
+                        base_period_minutes, current_min_last_analyzed, current_time
+                    )
+                    and (
+                        int(endpoint.status.last_request.timestamp())
+                        != last_timestamp_sent
+                        or current_min_last_analyzed != last_analyzed_sent
+                    )
                 ):
+                    # Record in the chief schedules file the last_request and min_last_analyzed for which an event was pushed to the stream
+                    schedules_file.update_endpoint_timestamps(
+                        endpoint_uid=endpoint.metadata.uid,
+                        last_request=int(endpoint.status.last_request.timestamp()),
+                        last_analyzed=current_min_last_analyzed,
+                    )
                     return True
                 else:
                     logger.info(
                         "All the possible intervals were already analyzed, didn't push regular event",
                         endpoint_id=endpoint.metadata.uid,
-                        last_analyzed=datetime.datetime.fromtimestamp(
-                            batch_window_generator.get_min_last_analyzed(),
-                            tz=datetime.timezone.utc,
-                        ),
+                        last_analyzed=current_min_last_analyzed,
                         last_request=endpoint.status.last_request,
                     )
         else:
@@ -334,6 +364,21 @@ class MonitoringApplicationController:
             )
             return False
 
+    @staticmethod
+    def _should_send_nop_event(
+        base_period_minutes: int,
+        min_last_analyzed: int,
+        current_time: datetime.datetime,
+    ):
+        if min_last_analyzed:
+            return (
+                current_time.timestamp() - min_last_analyzed
+                >= datetime.timedelta(minutes=base_period_minutes).total_seconds()
+                + mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
+            )
+        else:
+            return True
+
     def run(self, event: nuclio_sdk.Event) -> None:
         """
         Main method for controller chief, runs all the relevant monitoring applications for a single endpoint.
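Numerically, `_should_send_nop_event` fires once the newest analyzed timestamp is older than one base period plus the parquet batching timeout, and always fires when nothing was analyzed yet. A standalone re-implementation with an assumed timeout of 300 seconds (the real value comes from mlconf):

import datetime

PARQUET_BATCHING_TIMEOUT_SECS = 300  # assumed; mlconf supplies the real default

def should_send_nop_event(base_period_minutes, min_last_analyzed, current_time):
    if min_last_analyzed:
        return (
            current_time.timestamp() - min_last_analyzed
            >= datetime.timedelta(minutes=base_period_minutes).total_seconds()
            + PARQUET_BATCHING_TIMEOUT_SECS
        )
    return True

now = datetime.datetime(2025, 1, 1, 12, 0, tzinfo=datetime.timezone.utc)
# threshold for a 10-minute base period: 600 + 300 = 900 seconds
assert should_send_nop_event(10, int(now.timestamp()) - 901, now) is True
assert should_send_nop_event(10, int(now.timestamp()) - 300, now) is False
assert should_send_nop_event(10, None, now) is True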
@@ -441,9 +486,11 @@ class MonitoringApplicationController:
             ]
             current_time = mlrun.utils.datetime_now()
             if (
-                current_time.timestamp()
-                - batch_window_generator.get_min_last_analyzed()
-                >= datetime.timedelta(minutes=base_period).total_seconds()
+                self._should_send_nop_event(
+                    base_period,
+                    batch_window_generator.get_min_last_analyzed(),
+                    current_time,
+                )
                 and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
             ):
                 event = {
@@ -581,29 +628,33 @@ class MonitoringApplicationController:
         with concurrent.futures.ThreadPoolExecutor(
             max_workers=min(len(endpoints), 10)
         ) as pool:
-            futures = {
-                pool.submit(
-                    self.endpoint_to_regular_event,
-                    endpoint,
-                    policy,
-                    set(applications_names),
-                    self.v3io_access_key,
-                ): endpoint
-                for endpoint in endpoints
-            }
-            for future in concurrent.futures.as_completed(futures):
-                if future.exception():
-                    exception = future.exception()
-                    error = (
-                        f"Failed to push event. Endpoint name: {futures[future].metadata.name}, "
-                        f"endpoint uid: {futures[future].metadata.uid}, traceback:\n"
-                    )
-                    error += "".join(
-                        traceback.format_exception(
-                            None, exception, exception.__traceback__
+            with schedules.ModelMonitoringSchedulesFileChief(
+                self.project
+            ) as schedule_file:
+                futures = {
+                    pool.submit(
+                        self.endpoint_to_regular_event,
+                        endpoint,
+                        policy,
+                        set(applications_names),
+                        self.v3io_access_key,
+                        schedule_file,
+                    ): endpoint
+                    for endpoint in endpoints
+                }
+                for future in concurrent.futures.as_completed(futures):
+                    if future.exception():
+                        exception = future.exception()
+                        error = (
+                            f"Failed to push event. Endpoint name: {futures[future].metadata.name}, "
+                            f"endpoint uid: {futures[future].metadata.uid}, traceback:\n"
                         )
-                    )
-                    logger.error(error)
+                        error += "".join(
+                            traceback.format_exception(
+                                None, exception, exception.__traceback__
+                            )
+                        )
+                        logger.error(error)
         logger.info("Finishing monitoring controller chief")
 
     def endpoint_to_regular_event(
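Note the nesting order in the hunk above: the chief schedules file is opened inside the thread pool's `with` block, so the single `schedule_file` handle is shared by every submitted `endpoint_to_regular_event` task and stays open until all futures complete; presumably its accumulated `update_endpoint_timestamps` writes are flushed when the context manager exits (the persistence behavior itself is not shown in this diff).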
@@ -612,14 +663,16 @@ class MonitoringApplicationController:
         policy: dict,
         applications_names: set,
         v3io_access_key: str,
+        schedule_file: schedules.ModelMonitoringSchedulesFileChief,
     ) -> None:
         if self._should_monitor_endpoint(
             endpoint,
             set(applications_names),
             policy.get(ControllerEventEndpointPolicy.BASE_PERIOD, 10),
+            schedule_file,
         ):
-            logger.info(
-                "Regular event is being pushed to controller stream for model endpoint",
+            logger.debug(
+                "Endpoint data is being prepared for regular event",
                 endpoint_id=endpoint.metadata.uid,
                 endpoint_name=endpoint.metadata.name,
                 timestamp=endpoint.status.last_request.isoformat(