mlrun 1.10.0rc16__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic; see the release advisory for more details.

Files changed (98)
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/document.py +6 -1
  3. mlrun/artifacts/llm_prompt.py +21 -15
  4. mlrun/artifacts/model.py +3 -3
  5. mlrun/common/constants.py +9 -0
  6. mlrun/common/formatters/artifact.py +1 -0
  7. mlrun/common/model_monitoring/helpers.py +86 -0
  8. mlrun/common/schemas/__init__.py +2 -0
  9. mlrun/common/schemas/auth.py +2 -0
  10. mlrun/common/schemas/function.py +10 -0
  11. mlrun/common/schemas/hub.py +30 -18
  12. mlrun/common/schemas/model_monitoring/__init__.py +2 -0
  13. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  14. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  15. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  16. mlrun/common/schemas/pipeline.py +1 -1
  17. mlrun/common/schemas/serving.py +3 -0
  18. mlrun/common/schemas/workflow.py +1 -0
  19. mlrun/common/secrets.py +22 -1
  20. mlrun/config.py +32 -10
  21. mlrun/datastore/__init__.py +11 -3
  22. mlrun/datastore/azure_blob.py +162 -47
  23. mlrun/datastore/datastore.py +9 -4
  24. mlrun/datastore/datastore_profile.py +61 -5
  25. mlrun/datastore/model_provider/huggingface_provider.py +363 -0
  26. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  27. mlrun/datastore/model_provider/model_provider.py +211 -74
  28. mlrun/datastore/model_provider/openai_provider.py +243 -71
  29. mlrun/datastore/s3.py +24 -2
  30. mlrun/datastore/storeytargets.py +2 -3
  31. mlrun/datastore/utils.py +15 -3
  32. mlrun/db/base.py +27 -19
  33. mlrun/db/httpdb.py +57 -48
  34. mlrun/db/nopdb.py +25 -10
  35. mlrun/execution.py +55 -13
  36. mlrun/hub/__init__.py +15 -0
  37. mlrun/hub/module.py +181 -0
  38. mlrun/k8s_utils.py +105 -16
  39. mlrun/launcher/base.py +13 -6
  40. mlrun/launcher/local.py +2 -0
  41. mlrun/model.py +9 -3
  42. mlrun/model_monitoring/api.py +66 -27
  43. mlrun/model_monitoring/applications/__init__.py +1 -1
  44. mlrun/model_monitoring/applications/base.py +372 -136
  45. mlrun/model_monitoring/applications/context.py +2 -4
  46. mlrun/model_monitoring/applications/results.py +4 -7
  47. mlrun/model_monitoring/controller.py +239 -101
  48. mlrun/model_monitoring/db/_schedules.py +36 -13
  49. mlrun/model_monitoring/db/_stats.py +4 -3
  50. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  51. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
  52. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
  53. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  54. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  55. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
  56. mlrun/model_monitoring/helpers.py +28 -5
  57. mlrun/model_monitoring/stream_processing.py +45 -14
  58. mlrun/model_monitoring/writer.py +220 -1
  59. mlrun/platforms/__init__.py +3 -2
  60. mlrun/platforms/iguazio.py +7 -3
  61. mlrun/projects/operations.py +6 -1
  62. mlrun/projects/pipelines.py +2 -2
  63. mlrun/projects/project.py +128 -45
  64. mlrun/run.py +94 -17
  65. mlrun/runtimes/__init__.py +18 -0
  66. mlrun/runtimes/base.py +14 -6
  67. mlrun/runtimes/daskjob.py +1 -0
  68. mlrun/runtimes/local.py +5 -2
  69. mlrun/runtimes/mounts.py +20 -2
  70. mlrun/runtimes/nuclio/__init__.py +1 -0
  71. mlrun/runtimes/nuclio/application/application.py +147 -17
  72. mlrun/runtimes/nuclio/function.py +70 -27
  73. mlrun/runtimes/nuclio/serving.py +85 -4
  74. mlrun/runtimes/pod.py +213 -21
  75. mlrun/runtimes/utils.py +49 -9
  76. mlrun/secrets.py +54 -13
  77. mlrun/serving/remote.py +79 -6
  78. mlrun/serving/routers.py +23 -41
  79. mlrun/serving/server.py +211 -40
  80. mlrun/serving/states.py +536 -156
  81. mlrun/serving/steps.py +62 -0
  82. mlrun/serving/system_steps.py +136 -81
  83. mlrun/serving/v2_serving.py +9 -10
  84. mlrun/utils/helpers.py +212 -82
  85. mlrun/utils/logger.py +3 -1
  86. mlrun/utils/notifications/notification/base.py +18 -0
  87. mlrun/utils/notifications/notification/git.py +2 -4
  88. mlrun/utils/notifications/notification/slack.py +2 -4
  89. mlrun/utils/notifications/notification/webhook.py +2 -5
  90. mlrun/utils/notifications/notification_pusher.py +1 -1
  91. mlrun/utils/version/version.json +2 -2
  92. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +44 -45
  93. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +97 -92
  94. mlrun/api/schemas/__init__.py +0 -259
  95. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
  96. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
  97. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
  98. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
@@ -18,14 +18,16 @@ from abc import ABC, abstractmethod
18
18
  from collections import defaultdict
19
19
  from collections.abc import Iterator
20
20
  from contextlib import contextmanager, nullcontext
21
- from datetime import datetime, timedelta
21
+ from datetime import datetime, timedelta, timezone
22
22
  from typing import Any, Literal, Optional, Union, cast
23
23
 
24
24
  import pandas as pd
25
25
 
26
26
  import mlrun
27
27
  import mlrun.common.constants as mlrun_constants
28
+ import mlrun.common.helpers
28
29
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
30
+ import mlrun.common.types
29
31
  import mlrun.datastore.datastore_profile as ds_profile
30
32
  import mlrun.errors
31
33
  import mlrun.model_monitoring.api as mm_api
@@ -33,10 +35,17 @@ import mlrun.model_monitoring.applications.context as mm_context
33
35
  import mlrun.model_monitoring.applications.results as mm_results
34
36
  import mlrun.model_monitoring.db._schedules as mm_schedules
35
37
  import mlrun.model_monitoring.helpers as mm_helpers
38
+ import mlrun.utils
36
39
  from mlrun.serving.utils import MonitoringApplicationToDict
37
40
  from mlrun.utils import logger
38
41
 
39
42
 
43
+ class ExistingDataHandling(mlrun.common.types.StrEnum):
44
+ fail_on_overlap = "fail_on_overlap"
45
+ skip_overlap = "skip_overlap"
46
+ delete_all = "delete_all"
47
+
48
+
40
49
  def _serialize_context_and_result(
41
50
  *,
42
51
  context: mm_context.MonitoringApplicationContext,
@@ -194,7 +203,25 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
194
203
  Optional[mm_schedules.ModelMonitoringSchedulesFileApplication],
195
204
  ]
196
205
  ]:
197
- endpoints_output: dict[str, list[tuple]] = defaultdict(list)
206
+ endpoints_output: dict[
207
+ str,
208
+ list[
209
+ tuple[
210
+ mm_context.MonitoringApplicationContext,
211
+ Union[
212
+ mm_results.ModelMonitoringApplicationResult,
213
+ mm_results.ModelMonitoringApplicationMetric,
214
+ list[
215
+ Union[
216
+ mm_results.ModelMonitoringApplicationResult,
217
+ mm_results.ModelMonitoringApplicationMetric,
218
+ mm_results._ModelMonitoringApplicationStats,
219
+ ]
220
+ ],
221
+ ],
222
+ ]
223
+ ],
224
+ ] = defaultdict(list)
198
225
  application_schedules = nullcontext()
199
226
  if write_output:
200
227
  cls._check_writer_is_up(project)
@@ -206,7 +233,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
206
233
  try:
207
234
  yield endpoints_output, application_schedules.__enter__()
208
235
  finally:
209
- if write_output:
236
+ if write_output and any(endpoints_output.values()):
210
237
  logger.debug(
211
238
  "Pushing model monitoring application job data to the writer stream",
212
239
  passed_stream_profile=str(stream_profile),
@@ -220,11 +247,21 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
220
247
  profile=stream_profile,
221
248
  )
222
249
  for endpoint_id, outputs in endpoints_output.items():
250
+ writer_events = []
251
+ for ctx, res in outputs:
252
+ if isinstance(res, list):
253
+ writer_events.extend(
254
+ _serialize_context_and_result(
255
+ context=ctx, result=sub_res
256
+ )
257
+ for sub_res in res
258
+ )
259
+ else:
260
+ writer_events.append(
261
+ _serialize_context_and_result(context=ctx, result=res)
262
+ )
223
263
  writer_stream.push(
224
- [
225
- _serialize_context_and_result(context=ctx, result=res)
226
- for ctx, res in outputs
227
- ],
264
+ writer_events,
228
265
  partition_key=endpoint_id,
229
266
  )
230
267
  logger.debug(
@@ -238,6 +275,14 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
238
275
  )
239
276
  application_schedules.__exit__(None, None, None)
240
277
 
278
+ @classmethod
279
+ def _get_application_name(cls, context: "mlrun.MLClientCtx") -> str:
280
+ """Get the application name from the context via the function URI"""
281
+ _, application_name, _, _ = mlrun.common.helpers.parse_versioned_object_uri(
282
+ context.to_dict().get("spec", {}).get("function", "")
283
+ )
284
+ return application_name
285
+
241
286
  def _handler(
242
287
  self,
243
288
  context: "mlrun.MLClientCtx",
@@ -250,7 +295,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
250
295
  end: Optional[str] = None,
251
296
  base_period: Optional[int] = None,
252
297
  write_output: bool = False,
253
- allow_unordered_data: bool = False,
298
+ existing_data_handling: ExistingDataHandling = ExistingDataHandling.fail_on_overlap,
254
299
  stream_profile: Optional[ds_profile.DatastoreProfile] = None,
255
300
  ):
256
301
  """
@@ -271,7 +316,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
271
316
  "working with endpoints, without any custom data-frame input"
272
317
  )
273
318
 
274
- application_name = self.__class__.__name__
319
+ application_name = self._get_application_name(context)
275
320
 
276
321
  feature_stats = (
277
322
  mm_api.get_sample_set_statistics(reference_data)
@@ -287,21 +332,11 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
287
332
  project=project,
288
333
  ) as (endpoints_output, application_schedules):
289
334
 
290
- def call_do_tracking(event: Optional[dict] = None):
335
+ def call_do_tracking(
336
+ monitoring_context: mm_context.MonitoringApplicationContext,
337
+ ):
291
338
  nonlocal endpoints_output
292
339
 
293
- if event is None:
294
- event = {}
295
- monitoring_context = (
296
- mm_context.MonitoringApplicationContext._from_ml_ctx(
297
- event=event,
298
- application_name=application_name,
299
- context=context,
300
- project=project,
301
- sample_df=sample_data,
302
- feature_stats=feature_stats,
303
- )
304
- )
305
340
  result = self.do_tracking(monitoring_context)
306
341
  endpoints_output[monitoring_context.endpoint_id].append(
307
342
  (monitoring_context, result)
@@ -309,99 +344,184 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
309
344
  return result
310
345
 
311
346
  if endpoints is not None:
312
- resolved_endpoints = self._handle_endpoints_type_evaluate(
347
+ resolved_endpoints = self._normalize_and_validate_endpoints(
313
348
  project=project, endpoints=endpoints
314
349
  )
350
+ if (
351
+ write_output
352
+ and existing_data_handling == ExistingDataHandling.delete_all
353
+ ):
354
+ endpoint_ids = [
355
+ endpoint_id for _, endpoint_id in resolved_endpoints
356
+ ]
357
+ context.logger.info(
358
+ "Deleting all the application data before running the application",
359
+ application_name=application_name,
360
+ endpoint_ids=endpoint_ids,
361
+ )
362
+ self._delete_application_data(
363
+ project_name=project.name,
364
+ application_name=application_name,
365
+ endpoint_ids=endpoint_ids,
366
+ application_schedules=application_schedules,
367
+ )
315
368
  for endpoint_name, endpoint_id in resolved_endpoints:
316
- for window_start, window_end in self._window_generator(
369
+ for monitoring_ctx in self._window_generator(
317
370
  start=start,
318
371
  end=end,
319
372
  base_period=base_period,
320
373
  application_schedules=application_schedules,
321
374
  endpoint_id=endpoint_id,
375
+ endpoint_name=endpoint_name,
322
376
  application_name=application_name,
323
- allow_unordered_data=allow_unordered_data,
377
+ existing_data_handling=existing_data_handling,
378
+ sample_data=sample_data,
379
+ context=context,
380
+ project=project,
324
381
  ):
325
- result = call_do_tracking(
326
- event={
327
- mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
328
- mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
329
- mm_constants.ApplicationEvent.START_INFER_TIME: window_start,
330
- mm_constants.ApplicationEvent.END_INFER_TIME: window_end,
331
- }
332
- )
382
+ result = call_do_tracking(monitoring_ctx)
333
383
  result_key = (
334
- f"{endpoint_name}-{endpoint_id}_{window_start.isoformat()}_{window_end.isoformat()}"
335
- if window_start and window_end
384
+ f"{endpoint_name}-{endpoint_id}_{monitoring_ctx.start_infer_time.isoformat()}_{monitoring_ctx.end_infer_time.isoformat()}"
385
+ if monitoring_ctx.start_infer_time
386
+ and monitoring_ctx.end_infer_time
336
387
  else f"{endpoint_name}-{endpoint_id}"
337
388
  )
338
389
 
339
390
  context.log_result(
340
391
  result_key, self._flatten_data_result(result)
341
392
  )
393
+ # Check if no result was produced for any endpoint (e.g., due to no data in all windows)
394
+ if not any(endpoints_output.values()):
395
+ context.logger.warning(
396
+ "No data was found for any of the specified endpoints. "
397
+ "No results were produced",
398
+ application_name=application_name,
399
+ endpoints=endpoints,
400
+ start=start,
401
+ end=end,
402
+ )
342
403
  else:
343
- return self._flatten_data_result(call_do_tracking())
404
+ result = call_do_tracking(
405
+ mm_context.MonitoringApplicationContext._from_ml_ctx(
406
+ context=context,
407
+ project=project,
408
+ application_name=application_name,
409
+ event={},
410
+ sample_df=sample_data,
411
+ feature_stats=feature_stats,
412
+ )
413
+ )
414
+ return self._flatten_data_result(result)
344
415
 
345
416
  @staticmethod
346
- def _handle_endpoints_type_evaluate(
417
+ def _check_endpoints_first_request(
418
+ endpoints: list[mlrun.common.schemas.ModelEndpoint],
419
+ ) -> None:
420
+ """Make sure that all the endpoints have had at least one request"""
421
+ endpoints_no_requests = [
422
+ (endpoint.metadata.name, endpoint.metadata.uid)
423
+ for endpoint in endpoints
424
+ if not endpoint.status.first_request
425
+ ]
426
+ if endpoints_no_requests:
427
+ raise mlrun.errors.MLRunValueError(
428
+ "The following model endpoints have not had any requests yet and "
429
+ "have no data, cannot run the model monitoring application on them: "
430
+ f"{endpoints_no_requests}"
431
+ )
432
+
433
+ @classmethod
434
+ def _normalize_and_validate_endpoints(
435
+ cls,
347
436
  project: "mlrun.MlrunProject",
348
437
  endpoints: Union[
349
438
  list[tuple[str, str]], list[list[str]], list[str], Literal["all"]
350
439
  ],
351
- ) -> Union[list[tuple[str, str]], list[list[str]]]:
352
- if not endpoints:
353
- raise mlrun.errors.MLRunValueError(
354
- "The endpoints list cannot be empty. If you want to run on all the endpoints, "
355
- 'use `endpoints="all"`.'
356
- )
357
-
358
- if isinstance(endpoints, list) and isinstance(endpoints[0], (tuple, list)):
359
- return endpoints
360
-
361
- if not (isinstance(endpoints, list) and isinstance(endpoints[0], str)):
362
- if isinstance(endpoints, str):
363
- if endpoints != "all":
364
- raise mlrun.errors.MLRunValueError(
365
- 'A string input for `endpoints` can only be "all" for all the model endpoints in '
366
- "the project. If you want to select a single model endpoint with the given name, "
367
- f'use a list: `endpoints=["{endpoints}"]`.'
440
+ ) -> list[tuple[str, str]]:
441
+ if isinstance(endpoints, list):
442
+ if all(
443
+ isinstance(endpoint, (tuple, list)) and len(endpoint) == 2
444
+ for endpoint in endpoints
445
+ ):
446
+ # A list of [(name, uid), ...] / [[name, uid], ...] tuples/lists
447
+ endpoint_uids_to_names = {
448
+ endpoint[1]: endpoint[0] for endpoint in endpoints
449
+ }
450
+ endpoints_list = project.list_model_endpoints(
451
+ uids=list(endpoint_uids_to_names.keys()), latest_only=True
452
+ ).endpoints
453
+
454
+ # Check for missing endpoint uids or name/uid mismatches
455
+ for endpoint in endpoints_list:
456
+ if (
457
+ endpoint_uids_to_names[cast(str, endpoint.metadata.uid)]
458
+ != endpoint.metadata.name
459
+ ):
460
+ raise mlrun.errors.MLRunNotFoundError(
461
+ "Could not find model endpoint with name "
462
+ f"'{endpoint_uids_to_names[cast(str, endpoint.metadata.uid)]}' "
463
+ f"and uid '{endpoint.metadata.uid}'"
464
+ )
465
+ missing = set(endpoint_uids_to_names.keys()) - {
466
+ cast(str, endpoint.metadata.uid) for endpoint in endpoints_list
467
+ }
468
+ if missing:
469
+ raise mlrun.errors.MLRunNotFoundError(
470
+ "Could not find model endpoints with the following uids: "
471
+ f"{missing}"
368
472
  )
369
- else:
370
- raise mlrun.errors.MLRunValueError(
371
- f"Could not resolve endpoints as list of [(name, uid)], {endpoints=}"
372
- )
373
473
 
374
- if endpoints == "all":
375
- endpoint_names = None
376
- else:
377
- endpoint_names = endpoints
378
-
379
- endpoints_list = project.list_model_endpoints(
380
- names=endpoint_names, latest_only=True
381
- ).endpoints
382
- if endpoints_list:
383
- list_endpoints_result = [
384
- (endpoint.metadata.name, endpoint.metadata.uid)
385
- for endpoint in endpoints_list
386
- ]
387
- if endpoints != "all":
474
+ elif all(isinstance(endpoint, str) for endpoint in endpoints):
475
+ # A list of [name, ...] strings
476
+ endpoint_names = cast(list[str], endpoints)
477
+ endpoints_list = project.list_model_endpoints(
478
+ names=endpoint_names, latest_only=True
479
+ ).endpoints
480
+
481
+ # Check for missing endpoint names
388
482
  missing = set(endpoints) - {
389
- endpoint[0] for endpoint in list_endpoints_result
483
+ endpoint.metadata.name for endpoint in endpoints_list
390
484
  }
391
485
  if missing:
392
486
  logger.warning(
393
487
  "Could not list all the required endpoints",
394
- missing_endpoint=missing,
395
- endpoints=list_endpoints_result,
488
+ missing_endpoints=missing,
489
+ endpoints_list=endpoints_list,
396
490
  )
397
- return list_endpoints_result
491
+ else:
492
+ raise mlrun.errors.MLRunValueError(
493
+ "Could not resolve the following list as a list of endpoints:\n"
494
+ f"{endpoints}\n"
495
+ "The list must be either a list of (name, uid) tuples/lists or a list of names."
496
+ )
497
+ elif endpoints == "all":
498
+ endpoints_list = project.list_model_endpoints(latest_only=True).endpoints
499
+ elif isinstance(endpoints, str):
500
+ raise mlrun.errors.MLRunValueError(
501
+ 'A string input for `endpoints` can only be "all" for all the model endpoints in '
502
+ "the project. If you want to select a single model endpoint with the given name, "
503
+ f'use a list: `endpoints=["{endpoints}"]`.'
504
+ )
398
505
  else:
399
- if endpoints != "all":
400
- err_msg_suffix = f" named '{endpoints}'"
506
+ raise mlrun.errors.MLRunValueError(
507
+ "Could not resolve the `endpoints` parameter. The parameter must be either:\n"
508
+ "- a list of (name, uid) tuples/lists\n"
509
+ "- a list of names\n"
510
+ '- the string "all" for all the model endpoints in the project.'
511
+ )
512
+
513
+ if not endpoints_list:
401
514
  raise mlrun.errors.MLRunNotFoundError(
402
- f"Did not find any model endpoints {err_msg_suffix}"
515
+ f"Did not find any model endpoints {endpoints=}"
403
516
  )
404
517
 
518
+ cls._check_endpoints_first_request(endpoints_list)
519
+
520
+ return [
521
+ (endpoint.metadata.name, cast(str, endpoint.metadata.uid))
522
+ for endpoint in endpoints_list
523
+ ]
524
+
405
525
  @staticmethod
406
526
  def _validate_and_get_window_length(
407
527
  *, base_period: int, start_dt: datetime, end_dt: datetime
@@ -443,7 +563,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
443
563
  end_dt: datetime,
444
564
  base_period: Optional[int],
445
565
  application_name: str,
446
- allow_unordered_data: bool,
566
+ existing_data_handling: ExistingDataHandling,
447
567
  ) -> datetime:
448
568
  """Make sure that the (app, endpoint) pair doesn't write output before the last analyzed window"""
449
569
  if application_schedules:
@@ -452,7 +572,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
452
572
  )
453
573
  if last_analyzed:
454
574
  if start_dt < last_analyzed:
455
- if allow_unordered_data:
575
+ if existing_data_handling == ExistingDataHandling.skip_overlap:
456
576
  if last_analyzed < end_dt and base_period is None:
457
577
  logger.warn(
458
578
  "Setting the start time to last_analyzed since the original start time precedes "
@@ -466,15 +586,17 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
466
586
  else:
467
587
  raise mlrun.errors.MLRunValueError(
468
588
  "The start time for the application and endpoint precedes the last analyzed time: "
469
- f"{start_dt=}, {last_analyzed=}, {application_name=}, {endpoint_id=}. "
589
+ f"start_dt='{start_dt}', last_analyzed='{last_analyzed}', {application_name=}, "
590
+ f"{endpoint_id=}. "
470
591
  "Writing data out of order is not supported, and the start time could not be "
471
592
  "dynamically reset, as last_analyzed is later than the given end time or that "
472
- f"base_period was specified ({end_dt=}, {base_period=})."
593
+ f"base_period was specified (end_dt='{end_dt}', {base_period=})."
473
594
  )
474
595
  else:
475
596
  raise mlrun.errors.MLRunValueError(
476
597
  "The start time for the application and endpoint precedes the last analyzed time: "
477
- f"{start_dt=}, {last_analyzed=}, {application_name=}, {endpoint_id=}. "
598
+ f"start_dt='{start_dt}', last_analyzed='{last_analyzed}', {application_name=}, "
599
+ f"{endpoint_id=}. "
478
600
  "Writing data out of order is not supported. You should change the start time to "
479
601
  f"'{last_analyzed}' or later."
480
602
  )
@@ -487,6 +609,25 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
487
609
  )
488
610
  return start_dt
489
611
 
612
+ @staticmethod
613
+ def _delete_application_data(
614
+ project_name: str,
615
+ application_name: str,
616
+ endpoint_ids: list[str],
617
+ application_schedules: Optional[
618
+ mm_schedules.ModelMonitoringSchedulesFileApplication
619
+ ],
620
+ ) -> None:
621
+ mlrun.get_run_db().delete_model_monitoring_metrics(
622
+ project=project_name,
623
+ application_name=application_name,
624
+ endpoint_ids=endpoint_ids,
625
+ )
626
+ if application_schedules:
627
+ application_schedules.delete_endpoints_last_analyzed(
628
+ endpoint_uids=endpoint_ids
629
+ )
630
+
490
631
  @classmethod
491
632
  def _window_generator(
492
633
  cls,
@@ -497,34 +638,79 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
497
638
  application_schedules: Optional[
498
639
  mm_schedules.ModelMonitoringSchedulesFileApplication
499
640
  ],
641
+ endpoint_name: str,
500
642
  endpoint_id: str,
501
643
  application_name: str,
502
- allow_unordered_data: bool,
503
- ) -> Iterator[tuple[Optional[datetime], Optional[datetime]]]:
644
+ existing_data_handling: ExistingDataHandling,
645
+ context: "mlrun.MLClientCtx",
646
+ project: "mlrun.MlrunProject",
647
+ sample_data: Optional[pd.DataFrame],
648
+ ) -> Iterator[mm_context.MonitoringApplicationContext]:
649
+ def yield_monitoring_ctx(
650
+ window_start: Optional[datetime], window_end: Optional[datetime]
651
+ ) -> Iterator[mm_context.MonitoringApplicationContext]:
652
+ ctx = mm_context.MonitoringApplicationContext._from_ml_ctx(
653
+ event={
654
+ mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
655
+ mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
656
+ mm_constants.ApplicationEvent.START_INFER_TIME: window_start,
657
+ mm_constants.ApplicationEvent.END_INFER_TIME: window_end,
658
+ },
659
+ application_name=application_name,
660
+ context=context,
661
+ project=project,
662
+ sample_df=sample_data,
663
+ )
664
+
665
+ if ctx.sample_df.empty:
666
+ # The current sample is empty
667
+ context.logger.debug(
668
+ "No sample data available for tracking",
669
+ application_name=application_name,
670
+ endpoint_id=ctx.endpoint_id,
671
+ start_time=ctx.start_infer_time,
672
+ end_time=ctx.end_infer_time,
673
+ )
674
+ return
675
+
676
+ yield ctx
677
+
678
+ if application_schedules and window_end:
679
+ application_schedules.update_endpoint_last_analyzed(
680
+ endpoint_uid=endpoint_id, last_analyzed=window_end
681
+ )
682
+
504
683
  if start is None or end is None:
505
684
  # A single window based on the `sample_data` input - see `_handler`.
506
- yield None, None
685
+ yield from yield_monitoring_ctx(None, None)
507
686
  return
508
687
 
509
688
  start_dt = datetime.fromisoformat(start)
510
689
  end_dt = datetime.fromisoformat(end)
511
690
 
512
- start_dt = cls._validate_monotonically_increasing_data(
513
- application_schedules=application_schedules,
514
- endpoint_id=endpoint_id,
515
- start_dt=start_dt,
516
- end_dt=end_dt,
517
- base_period=base_period,
518
- application_name=application_name,
519
- allow_unordered_data=allow_unordered_data,
520
- )
691
+ # If `start_dt` and `end_dt` do not include time zone information - change them to UTC
692
+ if (start_dt.tzinfo is None) and (end_dt.tzinfo is None):
693
+ start_dt = start_dt.replace(tzinfo=timezone.utc)
694
+ end_dt = end_dt.replace(tzinfo=timezone.utc)
695
+ elif (start_dt.tzinfo is None) or (end_dt.tzinfo is None):
696
+ raise mlrun.errors.MLRunValueError(
697
+ "The start and end times must either both include time zone information or both be naive (no time "
698
+ f"zone). Asserting the above failed, aborting the evaluate request: start={start}, end={end}."
699
+ )
700
+
701
+ if existing_data_handling != ExistingDataHandling.delete_all:
702
+ start_dt = cls._validate_monotonically_increasing_data(
703
+ application_schedules=application_schedules,
704
+ endpoint_id=endpoint_id,
705
+ start_dt=start_dt,
706
+ end_dt=end_dt,
707
+ base_period=base_period,
708
+ application_name=application_name,
709
+ existing_data_handling=existing_data_handling,
710
+ )
521
711
 
522
712
  if base_period is None:
523
- yield start_dt, end_dt
524
- if application_schedules:
525
- application_schedules.update_endpoint_last_analyzed(
526
- endpoint_uid=endpoint_id, last_analyzed=end_dt
527
- )
713
+ yield from yield_monitoring_ctx(start_dt, end_dt)
528
714
  return
529
715
 
530
716
  window_length = cls._validate_and_get_window_length(
@@ -534,11 +720,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
534
720
  current_start_time = start_dt
535
721
  while current_start_time < end_dt:
536
722
  current_end_time = min(current_start_time + window_length, end_dt)
537
- yield current_start_time, current_end_time
538
- if application_schedules:
539
- application_schedules.update_endpoint_last_analyzed(
540
- endpoint_uid=endpoint_id, last_analyzed=current_end_time
541
- )
723
+ yield from yield_monitoring_ctx(current_start_time, current_end_time)
542
724
  current_start_time = current_end_time
543
725
 
544
726
  @classmethod
@@ -589,6 +771,45 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
589
771
  """
590
772
  return f"{handler_to_class}::{cls._handler.__name__}"
591
773
 
774
+ @classmethod
775
+ def _determine_job_name(
776
+ cls,
777
+ *,
778
+ func_name: Optional[str],
779
+ class_handler: Optional[str],
780
+ handler_to_class: str,
781
+ ) -> str:
782
+ """
783
+ Determine the batch app's job name. This name is used also as the application name,
784
+ which is retrieved in `_get_application_name`.
785
+ """
786
+ if func_name:
787
+ job_name = func_name
788
+ else:
789
+ if not class_handler:
790
+ class_name = cls.__name__
791
+ else:
792
+ class_name = handler_to_class.split(".")[-1].split("::")[0]
793
+
794
+ job_name = mlrun.utils.normalize_name(class_name)
795
+
796
+ if not mm_constants.APP_NAME_REGEX.fullmatch(job_name):
797
+ raise mlrun.errors.MLRunValueError(
798
+ "The function name does not comply with the required pattern "
799
+ f"`{mm_constants.APP_NAME_REGEX.pattern}`. "
800
+ "Please choose another `func_name`."
801
+ )
802
+ job_name, was_renamed, suffix = mlrun.utils.helpers.ensure_batch_job_suffix(
803
+ job_name
804
+ )
805
+ if was_renamed:
806
+ mlrun.utils.logger.info(
807
+ f'Changing function name - adding `"{suffix}"` suffix',
808
+ func_name=job_name,
809
+ )
810
+
811
+ return job_name
812
+
592
813
  @classmethod
593
814
  def to_job(
594
815
  cls,
@@ -628,7 +849,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
628
849
  * ``end``, ``datetime``
629
850
  * ``base_period``, ``int``
630
851
  * ``write_output``, ``bool``
631
- * ``allow_unordered_data``, ``bool``
852
+ * ``existing_data_handling``, ``str``
632
853
 
633
854
  For Git sources, add the source archive to the returned job and change the handler:
634
855
 
@@ -647,7 +868,10 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
647
868
  :py:class:`~mlrun.model_monitoring.applications.ModelMonitoringApplicationBase`,
648
869
  is used.
649
870
  :param func_path: The path to the function. If ``None``, the current notebook is used.
650
- :param func_name: The name of the function. If not ``None``, the class name is used.
871
+ :param func_name: The name of the function. If ``None``, the normalized class name is used
872
+ (:py:meth:`mlrun.utils.helpers.normalize_name`).
873
+ A ``"-batch"`` suffix is guaranteed to be added if not already there.
874
+ The function name is also used as the application name to use for the results.
651
875
  :param tag: Tag for the function.
652
876
  :param image: Docker image to run the job on (when running remotely).
653
877
  :param with_repo: Whether to clone the current repo to the build source.
@@ -668,12 +892,11 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
668
892
  handler_to_class = class_handler or cls.__name__
669
893
  handler = cls.get_job_handler(handler_to_class)
670
894
 
671
- if not class_handler:
672
- class_name = cls.__name__
673
- else:
674
- class_name = handler_to_class.split(".")[-1].split("::")[-1]
675
-
676
- job_name = func_name if func_name else class_name
895
+ job_name = cls._determine_job_name(
896
+ func_name=func_name,
897
+ class_handler=class_handler,
898
+ handler_to_class=handler_to_class,
899
+ )
677
900
 
678
901
  job = cast(
679
902
  mlrun.runtimes.KubejobRuntime,
@@ -712,7 +935,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
712
935
  end: Optional[datetime] = None,
713
936
  base_period: Optional[int] = None,
714
937
  write_output: bool = False,
715
- allow_unordered_data: bool = False,
938
+ existing_data_handling: ExistingDataHandling = ExistingDataHandling.fail_on_overlap,
716
939
  stream_profile: Optional[ds_profile.DatastoreProfile] = None,
717
940
  ) -> "mlrun.RunObject":
718
941
  """
@@ -720,11 +943,14 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
720
943
  :py:meth:`~mlrun.model_monitoring.applications.ModelMonitoringApplicationBase.do_tracking`
721
944
  model monitoring logic as a :py:class:`~mlrun.runtimes.KubejobRuntime`, which is an MLRun function.
722
945
 
723
- This function has default values for all of its arguments. You should be change them when you want to pass
946
+ This function has default values for all of its arguments. You should change them when you want to pass
724
947
  data to the application.
725
948
 
726
949
  :param func_path: The path to the function. If ``None``, the current notebook is used.
727
- :param func_name: The name of the function. If not ``None``, the class name is used.
950
+ :param func_name: The name of the function. If ``None``, the normalized class name is used
951
+ (:py:meth:`mlrun.utils.helpers.normalize_name`).
952
+ A ``"-batch"`` suffix is guaranteed to be added if not already there.
953
+ The function name is also used as the application name for the results.
728
954
  :param tag: Tag for the function.
729
955
  :param run_local: Whether to run the function locally or remotely.
730
956
  :param auto_build: Whether to auto build the function.
@@ -734,6 +960,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
734
960
  :param reference_data: Pandas data-frame or :py:class:`~mlrun.artifacts.dataset.DatasetArtifact` URI as
735
961
  the reference dataset.
736
962
  When set, its statistics override the model endpoint's feature statistics.
963
+ You do not need to have a model endpoint to use this option.
737
964
  :param image: Docker image to run the job on (when running remotely).
738
965
  :param with_repo: Whether to clone the current repo to the build source.
739
966
  :param class_handler: The relative path to the class, useful when using Git sources or code from images.
@@ -754,8 +981,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
754
981
  :param start: The start time of the endpoint's data, not included.
755
982
  If you want the model endpoint's data at ``start`` included, you need to subtract a
756
983
  small ``datetime.timedelta`` from it.
757
- Make sure to include the time zone when constructing `datetime.datetime` objects
758
- manually.
984
+ Make sure to include the time zone when constructing ``datetime.datetime`` objects
985
+ manually. When both ``start`` and ``end`` times do not include a time zone, they will
986
+ be treated as UTC.
759
987
  :param end: The end time of the endpoint's data, included.
760
988
  Please note: when ``start`` and ``end`` are set, they create a left-open time interval
761
989
  ("window") :math:`(\\operatorname{start}, \\operatorname{end}]` that excludes the
@@ -777,11 +1005,18 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
777
1005
  :param write_output: Whether to write the results and metrics to the time-series DB. Can be ``True`` only
778
1006
  if ``endpoints`` are passed.
779
1007
  Note: the model monitoring infrastructure must be up for the writing to work.
780
- :param allow_unordered_data: Relevant only when writing outputs to the database. When ``False``, and the
781
- requested ``start`` time precedes the ``end`` time of a previous run that also
782
- wrote to the database - an error is raised.
783
- If ``True``, when the previously described situation occurs, the relevant time
784
- window is cut so that it starts at the earliest possible time after ``start``.
1008
+ :param existing_data_handling:
1009
+ How to handle the existing application data for the model endpoints when writing
1010
+ new data whose requested ``start`` time precedes the ``end`` time of a previous run
1011
+ that also wrote to the database. Relevant only when ``write_output=True``.
1012
+ The options are:
1013
+
1014
+ - ``"fail_on_overlap"``: Default. An error is raised.
1015
+ - ``"skip_overlap"``: The overlapping data is ignored and the
1016
+ time window is cut so that it starts at the earliest possible time after ``start``.
1017
+ - ``"delete_all"``: Delete all the data that was written by the application to the
1018
+ model endpoints, regardless of the time window, and write the new data.
1019
+
785
1020
  :param stream_profile: The stream datastore profile. It should be provided only when running locally and
786
1021
  writing the outputs to the database (i.e., when both ``run_local`` and
787
1022
  ``write_output`` are set to ``True``).
@@ -820,18 +1055,6 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
820
1055
  )
821
1056
  params["end"] = end.isoformat() if isinstance(end, datetime) else end
822
1057
  params["base_period"] = base_period
823
- params["write_output"] = write_output
824
- params["allow_unordered_data"] = allow_unordered_data
825
- if stream_profile:
826
- if not run_local:
827
- raise mlrun.errors.MLRunValueError(
828
- "Passing a `stream_profile` is relevant only when running locally"
829
- )
830
- if not write_output:
831
- raise mlrun.errors.MLRunValueError(
832
- "Passing a `stream_profile` is relevant only when writing the outputs"
833
- )
834
- params["stream_profile"] = stream_profile
835
1058
  elif start or end or base_period:
836
1059
  raise mlrun.errors.MLRunValueError(
837
1060
  "Custom `start` and `end` times or base_period are supported only with endpoints data"
@@ -841,6 +1064,19 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
841
1064
  "Writing the application output or passing `stream_profile` are supported only with endpoints data"
842
1065
  )
843
1066
 
1067
+ params["write_output"] = write_output
1068
+ params["existing_data_handling"] = existing_data_handling
1069
+ if stream_profile:
1070
+ if not run_local:
1071
+ raise mlrun.errors.MLRunValueError(
1072
+ "Passing a `stream_profile` is relevant only when running locally"
1073
+ )
1074
+ if not write_output:
1075
+ raise mlrun.errors.MLRunValueError(
1076
+ "Passing a `stream_profile` is relevant only when writing the outputs"
1077
+ )
1078
+ params["stream_profile"] = stream_profile
1079
+
844
1080
  inputs: dict[str, str] = {}
845
1081
  for data, identifier in [
846
1082
  (sample_data, "sample_data"),