mlrun 1.8.0rc45__py3-none-any.whl → 1.8.0rc47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

@@ -12,15 +12,17 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ import collections
  import concurrent.futures
  import datetime
  import json
  import os
  import traceback
+ from collections import OrderedDict
  from collections.abc import Iterator
  from contextlib import AbstractContextManager
  from types import TracebackType
- from typing import Any, NamedTuple, Optional, cast
+ from typing import Any, NamedTuple, Optional, Union, cast

  import nuclio_sdk

@@ -28,7 +30,9 @@ import mlrun
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
  import mlrun.feature_store as fstore
  import mlrun.model_monitoring
+ import mlrun.model_monitoring.db._schedules as schedules
  import mlrun.model_monitoring.helpers
+ import mlrun.platforms.iguazio
  from mlrun.common.schemas import EndpointType
  from mlrun.common.schemas.model_monitoring.constants import (
      ControllerEvent,
@@ -36,7 +40,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
      ControllerEventKind,
  )
  from mlrun.errors import err_to_str
- from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
  from mlrun.model_monitoring.helpers import batch_dict2timedelta
  from mlrun.utils import datetime_now, logger

@@ -53,7 +56,7 @@ class _BatchWindow:
      def __init__(
          self,
          *,
-         schedules_file: ModelMonitoringSchedulesFile,
+         schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
          application: str,
          timedelta_seconds: int,
          last_updated: int,
@@ -153,7 +156,7 @@ class _BatchWindowGenerator(AbstractContextManager):
          self._project = project
          self._endpoint_id = endpoint_id
          self._timedelta = window_length
-         self._schedules_file = ModelMonitoringSchedulesFile(
+         self._schedules_file = schedules.ModelMonitoringSchedulesFileEndpoint(
              project=project, endpoint_id=endpoint_id
          )

@@ -243,7 +246,7 @@ class MonitoringApplicationController:
      Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
      """

-     _MAX_OPEN_WINDOWS_ALLOWED = 5
+     _MAX_FEATURE_SET_PER_WORKER = 1000

      def __init__(self) -> None:
          """Initialize Monitoring Application Controller"""
@@ -259,6 +262,61 @@ class MonitoringApplicationController:
              mlrun.mlconf.artifact_path
          )
          self.storage_options = store.get_storage_options()
+         self._controller_stream: Optional[
+             Union[
+                 mlrun.platforms.iguazio.OutputStream,
+                 mlrun.platforms.iguazio.KafkaOutputStream,
+             ]
+         ] = None
+         self._model_monitoring_stream: Optional[
+             Union[
+                 mlrun.platforms.iguazio.OutputStream,
+                 mlrun.platforms.iguazio.KafkaOutputStream,
+             ]
+         ] = None
+         self.applications_streams: dict[
+             str,
+             Union[
+                 mlrun.platforms.iguazio.OutputStream,
+                 mlrun.platforms.iguazio.KafkaOutputStream,
+             ],
+         ] = {}
+         self.feature_sets: OrderedDict[str, mlrun.feature_store.FeatureSet] = (
+             collections.OrderedDict()
+         )
+         self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
+             project=self.project
+         )
+
+     @property
+     def controller_stream(
+         self,
+     ) -> Union[
+         mlrun.platforms.iguazio.OutputStream,
+         mlrun.platforms.iguazio.KafkaOutputStream,
+     ]:
+         if self._controller_stream is None:
+             self._controller_stream = mlrun.model_monitoring.helpers.get_output_stream(
+                 project=self.project,
+                 function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
+                 v3io_access_key=self.v3io_access_key,
+             )
+         return self._controller_stream
+
+     @property
+     def model_monitoring_stream(
+         self,
+     ) -> Union[
+         mlrun.platforms.iguazio.OutputStream,
+         mlrun.platforms.iguazio.KafkaOutputStream,
+     ]:
+         if self._model_monitoring_stream is None:
+             self._model_monitoring_stream = mlrun.model_monitoring.helpers.get_output_stream(
+                 project=self.project,
+                 function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
+                 v3io_access_key=self.v3io_access_key,
+             )
+         return self._model_monitoring_stream

      @staticmethod
      def _get_model_monitoring_access_key() -> Optional[str]:
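
The hunk above replaces per-call stream creation with stream objects that start out as None in __init__ and are materialized lazily through the new controller_stream / model_monitoring_stream properties. A minimal, self-contained sketch of that lazy-property caching pattern, using a generic factory callable in place of mlrun.model_monitoring.helpers.get_output_stream (the class and factory below are illustrative stand-ins, not mlrun APIs):

    from typing import Callable, Optional


    class LazyStreamHolder:
        """Build the underlying output stream on first use, then reuse it."""

        def __init__(self, factory: Callable[[], object]) -> None:
            self._factory = factory          # e.g. a closure over get_output_stream(...)
            self._stream: Optional[object] = None

        @property
        def stream(self) -> object:
            if self._stream is None:         # only the first access pays the connection cost
                self._stream = self._factory()
            return self._stream


    holder = LazyStreamHolder(factory=object)
    assert holder.stream is holder.stream    # later pushes reuse the same stream object
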
@@ -273,6 +331,7 @@ class MonitoringApplicationController:
          endpoint: mlrun.common.schemas.ModelEndpoint,
          application_names: set,
          base_period_minutes: int,
+         schedules_file: schedules.ModelMonitoringSchedulesFileChief,
      ) -> bool:
          """
          checks if there is a need to monitor the given endpoint, we should monitor endpoint if it stands in the
@@ -281,11 +340,23 @@ class MonitoringApplicationController:
          2. first request exists
          3. last request exists
          4. endpoint_type is not ROUTER
-         if the four above conditions apply we require one of the three conditions to monitor:
+         if the four above conditions apply we require one of the two condition monitor:
          1. never monitored the one of the endpoint applications meaning min_last_analyzed is None
-         2. last request has a higher timestamp than the min_last_analyzed timestamp
-         3. We didn't analyze one of the application for over than _MAX_OPEN_WINDOWS_ALLOWED windows
+         2. min_last_analyzed stands in the condition for sending NOP event and this the first time regular event
+            is sent with the combination of current last_request & current last_analyzed per endpoint.
          """
+         last_timestamp_sent = schedules_file.get_endpoint_last_request(
+             endpoint.metadata.uid
+         )
+         last_analyzed_sent = schedules_file.get_endpoint_last_analyzed(
+             endpoint.metadata.uid
+         )
+         logger.debug(
+             "Chief should monitor endpoint check",
+             last_timestamp_sent=last_timestamp_sent,
+             last_analyzed_sent=last_analyzed_sent,
+             uid=endpoint.metadata.uid,
+         )
          if (
              # Is the model endpoint monitored?
              endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
@@ -300,26 +371,43 @@ class MonitoringApplicationController:
                  project=endpoint.metadata.project,
                  endpoint_id=endpoint.metadata.uid,
              ) as batch_window_generator:
-                 base_period_seconds = base_period_minutes * _SECONDS_IN_MINUTE
-                 if application_names != batch_window_generator.get_application_list():
+                 current_time = mlrun.utils.datetime_now()
+                 current_min_last_analyzed = (
+                     batch_window_generator.get_min_last_analyzed()
+                 )
+                 if (
+                     # Different application names, or last analyzed never updated while there are application to monitor
+                     application_names
+                     and (
+                         application_names
+                         != batch_window_generator.get_application_list()
+                         or not current_min_last_analyzed
+                     )
+                 ):
                      return True
                  elif (
-                     not batch_window_generator.get_min_last_analyzed()
-                     or batch_window_generator.get_min_last_analyzed()
-                     <= int(endpoint.status.last_request.timestamp())
-                     or mlrun.utils.datetime_now().timestamp()
-                     - batch_window_generator.get_min_last_analyzed()
-                     >= self._MAX_OPEN_WINDOWS_ALLOWED * base_period_seconds
+                     # Does nop event will be sent to close the relevant window
+                     self._should_send_nop_event(
+                         base_period_minutes, current_min_last_analyzed, current_time
+                     )
+                     and (
+                         int(endpoint.status.last_request.timestamp())
+                         != last_timestamp_sent
+                         or current_min_last_analyzed != last_analyzed_sent
+                     )
                  ):
+                     # Write to schedule chief file the last_request, min_last_analyzed we pushed event to stream
+                     schedules_file.update_endpoint_timestamps(
+                         endpoint_uid=endpoint.metadata.uid,
+                         last_request=int(endpoint.status.last_request.timestamp()),
+                         last_analyzed=current_min_last_analyzed,
+                     )
                      return True
                  else:
                      logger.info(
                          "All the possible intervals were already analyzed, didn't push regular event",
                          endpoint_id=endpoint.metadata.uid,
-                         last_analyzed=datetime.datetime.fromtimestamp(
-                             batch_window_generator.get_min_last_analyzed(),
-                             tz=datetime.timezone.utc,
-                         ),
+                         last_analyzed=current_min_last_analyzed,
                          last_request=endpoint.status.last_request,
                      )
          else:
@@ -334,6 +422,21 @@ class MonitoringApplicationController:
              )
          return False

+     @staticmethod
+     def _should_send_nop_event(
+         base_period_minutes: int,
+         min_last_analyzed: int,
+         current_time: datetime.datetime,
+     ):
+         if min_last_analyzed:
+             return (
+                 current_time.timestamp() - min_last_analyzed
+                 >= datetime.timedelta(minutes=base_period_minutes).total_seconds()
+                 + mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
+             )
+         else:
+             return True
+
      def run(self, event: nuclio_sdk.Event) -> None:
          """
          Main method for controller chief, runs all the relevant monitoring applications for a single endpoint.
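
For orientation, here is a standalone sketch of the timing rule the new _should_send_nop_event encodes: once base_period_minutes plus the parquet batching timeout have elapsed since min_last_analyzed, a NOP event is due to close the open window; with no min_last_analyzed at all it is always due. The 600-second timeout below is an assumed placeholder, not the package default:

    import datetime

    PARQUET_BATCHING_TIMEOUT_SECS = 600  # assumption for this sketch only


    def should_send_nop_event(
        base_period_minutes: int,
        min_last_analyzed: int,
        current_time: datetime.datetime,
    ) -> bool:
        if not min_last_analyzed:
            # Nothing analyzed yet - the window should be closed right away.
            return True
        threshold = (
            datetime.timedelta(minutes=base_period_minutes).total_seconds()
            + PARQUET_BATCHING_TIMEOUT_SECS
        )
        return current_time.timestamp() - min_last_analyzed >= threshold


    # Example: a 10 minute base period + 600 s timeout => NOP due after 1200 s of silence.
    now = datetime.datetime.now(tz=datetime.timezone.utc)
    print(should_send_nop_event(10, int(now.timestamp()) - 1300, now))  # True
    print(should_send_nop_event(10, int(now.timestamp()) - 100, now))   # False
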
@@ -377,9 +480,9 @@ class MonitoringApplicationController:
          ]

          not_batch_endpoint = (
-             event[ControllerEvent.ENDPOINT_POLICY] != EndpointType.BATCH_EP
+             event[ControllerEvent.ENDPOINT_TYPE] != EndpointType.BATCH_EP
          )
-         m_fs = fstore.get_feature_set(event[ControllerEvent.FEATURE_SET_URI])
+
          logger.info(
              "Starting analyzing for", timestamp=event[ControllerEvent.TIMESTAMP]
          )
@@ -404,13 +507,39 @@ class MonitoringApplicationController:
                      first_request=first_request,
                      last_request=last_stream_timestamp,
                  ):
-                     df = m_fs.to_dataframe(
-                         start_time=start_infer_time,
-                         end_time=end_infer_time,
-                         time_column=mm_constants.EventFieldType.TIMESTAMP,
-                         storage_options=self.storage_options,
-                     )
-                     if len(df) == 0:
+                     data_in_window = False
+                     if not_batch_endpoint:
+                         # Serving endpoint - get the relevant window data from the TSDB
+                         prediction_metric = self.tsdb_connector.read_predictions(
+                             start=start_infer_time,
+                             end=end_infer_time,
+                             endpoint_id=endpoint_id,
+                         )
+                         if prediction_metric.data:
+                             data_in_window = True
+                     else:
+                         if endpoint_id not in self.feature_sets:
+                             self.feature_sets[endpoint_id] = fstore.get_feature_set(
+                                 event[ControllerEvent.FEATURE_SET_URI]
+                             )
+                             self.feature_sets.move_to_end(endpoint_id, last=False)
+                             if (
+                                 len(self.feature_sets)
+                                 > self._MAX_FEATURE_SET_PER_WORKER
+                             ):
+                                 self.feature_sets.popitem(last=True)
+                         m_fs = self.feature_sets.get(endpoint_id)
+
+                         # Batch endpoint - get the relevant window data from the parquet target
+                         df = m_fs.to_dataframe(
+                             start_time=start_infer_time,
+                             end_time=end_infer_time,
+                             time_column=mm_constants.EventFieldType.TIMESTAMP,
+                             storage_options=self.storage_options,
+                         )
+                         if len(df) > 0:
+                             data_in_window = True
+                     if not data_in_window:
                          logger.info(
                              "No data found for the given interval",
                              start=start_infer_time,
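
The feature-set lookup above is now cached per endpoint in an OrderedDict capped at _MAX_FEATURE_SET_PER_WORKER (1000) entries: a newly loaded entry is moved to the front, and the entry at the back is evicted once the cap is exceeded. A small, generic sketch of that bounded-cache idea, with a tiny cap and a stand-in loader instead of fstore.get_feature_set:

    import collections
    from typing import Callable, OrderedDict

    MAX_ENTRIES = 3  # the diff uses 1000 per worker; kept tiny here for demonstration


    def get_cached(
        cache: OrderedDict[str, object], key: str, load: Callable[[str], object]
    ) -> object:
        if key not in cache:
            cache[key] = load(key)               # fetch once, e.g. a feature set by URI
            cache.move_to_end(key, last=False)   # newest entry goes to the front
            if len(cache) > MAX_ENTRIES:
                cache.popitem(last=True)         # drop the entry at the back
        return cache[key]


    cache: OrderedDict[str, object] = collections.OrderedDict()
    for endpoint_id in ["ep-1", "ep-2", "ep-3", "ep-4"]:
        get_cached(cache, endpoint_id, lambda k: f"feature-set-for-{k}")
    print(list(cache))  # ['ep-4', 'ep-3', 'ep-2'] - 'ep-1' was evicted
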
@@ -441,9 +570,11 @@ class MonitoringApplicationController:
              ]
              current_time = mlrun.utils.datetime_now()
              if (
-                 current_time.timestamp()
-                 - batch_window_generator.get_min_last_analyzed()
-                 >= datetime.timedelta(minutes=base_period).total_seconds()
+                 self._should_send_nop_event(
+                     base_period,
+                     batch_window_generator.get_min_last_analyzed(),
+                     current_time,
+                 )
                  and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
              ):
                  event = {
@@ -481,8 +612,8 @@ class MonitoringApplicationController:
              endpoint_id=event[ControllerEvent.ENDPOINT_ID],
          )

-     @staticmethod
      def _push_to_applications(
+         self,
          start_infer_time: datetime.datetime,
          end_infer_time: datetime.datetime,
          endpoint_id: str,
@@ -516,12 +647,15 @@ class MonitoringApplicationController:
          }
          for app_name in applications_names:
              data.update({mm_constants.ApplicationEvent.APPLICATION_NAME: app_name})
-
-             app_stream = mlrun.model_monitoring.helpers.get_output_stream(
-                 project=project,
-                 function_name=app_name,
-                 v3io_access_key=model_monitoring_access_key,
-             )
+             if app_name not in self.applications_streams:
+                 self.applications_streams[app_name] = (
+                     mlrun.model_monitoring.helpers.get_output_stream(
+                         project=project,
+                         function_name=app_name,
+                         v3io_access_key=model_monitoring_access_key,
+                     )
+                 )
+             app_stream = self.applications_streams.get(app_name)

              logger.info(
                  "Pushing data to application stream",
@@ -534,7 +668,6 @@ class MonitoringApplicationController:
      def push_regular_event_to_controller_stream(self) -> None:
          """
          pushes a regular event to the controller stream.
-         :param event: the nuclio trigger event
          """
          logger.info("Starting monitoring controller chief")
          applications_names = []
@@ -581,29 +714,32 @@ class MonitoringApplicationController:
          with concurrent.futures.ThreadPoolExecutor(
              max_workers=min(len(endpoints), 10)
          ) as pool:
-             futures = {
-                 pool.submit(
-                     self.endpoint_to_regular_event,
-                     endpoint,
-                     policy,
-                     set(applications_names),
-                     self.v3io_access_key,
-                 ): endpoint
-                 for endpoint in endpoints
-             }
-             for future in concurrent.futures.as_completed(futures):
-                 if future.exception():
-                     exception = future.exception()
-                     error = (
-                         f"Failed to push event. Endpoint name: {futures[future].metadata.name}, "
-                         f"endpoint uid: {futures[future].metadata.uid}, traceback:\n"
-                     )
-                     error += "".join(
-                         traceback.format_exception(
-                             None, exception, exception.__traceback__
+             with schedules.ModelMonitoringSchedulesFileChief(
+                 self.project
+             ) as schedule_file:
+                 futures = {
+                     pool.submit(
+                         self.endpoint_to_regular_event,
+                         endpoint,
+                         policy,
+                         set(applications_names),
+                         schedule_file,
+                     ): endpoint
+                     for endpoint in endpoints
+                 }
+                 for future in concurrent.futures.as_completed(futures):
+                     if future.exception():
+                         exception = future.exception()
+                         error = (
+                             f"Failed to push event. Endpoint name: {futures[future].metadata.name}, "
+                             f"endpoint uid: {futures[future].metadata.uid}, traceback:\n"
                          )
-                     )
-                     logger.error(error)
+                         error += "".join(
+                             traceback.format_exception(
+                                 None, exception, exception.__traceback__
+                             )
+                         )
+                         logger.error(error)
          logger.info("Finishing monitoring controller chief")

      def endpoint_to_regular_event(
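
The reworked chief loop above opens one ModelMonitoringSchedulesFileChief for the whole batch, hands it to every worker task, and logs (rather than raises) per-endpoint failures. A self-contained sketch of that submit-and-collect pattern; open_shared_schedules and handle_endpoint are illustrative stand-ins, not mlrun APIs:

    import concurrent.futures
    import traceback
    from contextlib import contextmanager


    @contextmanager
    def open_shared_schedules(project: str):
        state: dict[str, str] = {}   # stands in for the shared schedules file
        try:
            yield state
        finally:
            pass                     # a real file would be flushed/closed here


    def handle_endpoint(endpoint: str, schedules: dict[str, str]) -> None:
        if endpoint == "ep-bad":
            raise RuntimeError("simulated failure")
        schedules[endpoint] = "event pushed"


    endpoints = ["ep-1", "ep-2", "ep-bad"]
    with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(endpoints), 10)) as pool:
        with open_shared_schedules("my-project") as schedules:
            futures = {pool.submit(handle_endpoint, ep, schedules): ep for ep in endpoints}
            for future in concurrent.futures.as_completed(futures):
                exc = future.exception()
                if exc:
                    # One failing endpoint is reported without aborting the others.
                    print(f"Failed endpoint {futures[future]}:\n" + "".join(
                        traceback.format_exception(None, exc, exc.__traceback__)))
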
@@ -611,15 +747,16 @@ class MonitoringApplicationController:
          endpoint: mlrun.common.schemas.ModelEndpoint,
          policy: dict,
          applications_names: set,
-         v3io_access_key: str,
+         schedule_file: schedules.ModelMonitoringSchedulesFileChief,
      ) -> None:
          if self._should_monitor_endpoint(
              endpoint,
              set(applications_names),
              policy.get(ControllerEventEndpointPolicy.BASE_PERIOD, 10),
+             schedule_file,
          ):
-             logger.info(
-                 "Regular event is being pushed to controller stream for model endpoint",
+             logger.debug(
+                 "Endpoint data is being prepared for regular event",
                  endpoint_id=endpoint.metadata.uid,
                  endpoint_name=endpoint.metadata.name,
                  timestamp=endpoint.status.last_request.isoformat(
@@ -635,12 +772,11 @@ class MonitoringApplicationController:
              policy[ControllerEventEndpointPolicy.ENDPOINT_UPDATED] = (
                  endpoint.metadata.updated.isoformat()
              )
-             MonitoringApplicationController.push_to_controller_stream(
+             self.push_to_controller_stream(
                  kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
                  project=endpoint.metadata.project,
                  endpoint_id=endpoint.metadata.uid,
                  endpoint_name=endpoint.metadata.name,
-                 stream_access_key=v3io_access_key,
                  timestamp=endpoint.status.last_request.isoformat(
                      sep=" ", timespec="microseconds"
                  ),
@@ -652,13 +788,12 @@ class MonitoringApplicationController:
                  endpoint_policy=policy,
              )

-     @staticmethod
      def push_to_controller_stream(
+         self,
          kind: str,
          project: str,
          endpoint_id: str,
          endpoint_name: str,
-         stream_access_key: str,
          timestamp: str,
          first_request: str,
          endpoint_type: int,
@@ -676,7 +811,6 @@ class MonitoringApplicationController:
          :param endpoint_name: the endpoint name string
          :param endpoint_type: Enum of the endpoint type
          :param feature_set_uri: the feature set uri string
-         :param stream_access_key: access key to apply the model monitoring process.
          """
          event = {
              ControllerEvent.KIND.value: kind,
@@ -689,18 +823,13 @@ class MonitoringApplicationController:
              ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
              ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
          }
-         controller_stream = mlrun.model_monitoring.helpers.get_output_stream(
-             project=project,
-             function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
-             v3io_access_key=stream_access_key,
-         )
          logger.info(
              "Pushing data to controller stream",
              event=event,
              endpoint_id=endpoint_id,
-             controller_stream_type=str(type(controller_stream)),
+             controller_stream_type=str(type(self.controller_stream)),
          )
-         controller_stream.push([event], partition_key=endpoint_id)
+         self.controller_stream.push([event], partition_key=endpoint_id)

      def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
          """
@@ -708,18 +837,13 @@ class MonitoringApplicationController:
          :param event: event dictionary to push to stream
          :param endpoint_id: endpoint id string
          """
-         mm_stream = mlrun.model_monitoring.helpers.get_output_stream(
-             project=event.get(ControllerEvent.PROJECT),
-             function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
-             v3io_access_key=self.v3io_access_key,
-         )
          logger.info(
              "Pushing data to main stream, NOP event is been generated",
              event=json.dumps(event),
              endpoint_id=endpoint_id,
-             mm_stream_type=str(type(mm_stream)),
+             mm_stream_type=str(type(self.model_monitoring_stream)),
          )
-         mm_stream.push([event], partition_key=endpoint_id)
+         self.model_monitoring_stream.push([event], partition_key=endpoint_id)


  def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None: