mlrun 1.10.0rc13__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (107)
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/base.py +0 -31
  3. mlrun/artifacts/document.py +6 -1
  4. mlrun/artifacts/llm_prompt.py +123 -25
  5. mlrun/artifacts/manager.py +0 -5
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/common/constants.py +10 -1
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/common/model_monitoring/helpers.py +86 -0
  10. mlrun/common/schemas/__init__.py +3 -0
  11. mlrun/common/schemas/auth.py +2 -0
  12. mlrun/common/schemas/function.py +10 -0
  13. mlrun/common/schemas/hub.py +30 -18
  14. mlrun/common/schemas/model_monitoring/__init__.py +3 -0
  15. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  16. mlrun/common/schemas/model_monitoring/functions.py +14 -5
  17. mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -0
  18. mlrun/common/schemas/pipeline.py +1 -1
  19. mlrun/common/schemas/serving.py +3 -0
  20. mlrun/common/schemas/workflow.py +3 -1
  21. mlrun/common/secrets.py +22 -1
  22. mlrun/config.py +33 -11
  23. mlrun/datastore/__init__.py +11 -3
  24. mlrun/datastore/azure_blob.py +162 -47
  25. mlrun/datastore/datastore.py +9 -4
  26. mlrun/datastore/datastore_profile.py +61 -5
  27. mlrun/datastore/model_provider/huggingface_provider.py +363 -0
  28. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  29. mlrun/datastore/model_provider/model_provider.py +230 -65
  30. mlrun/datastore/model_provider/openai_provider.py +295 -42
  31. mlrun/datastore/s3.py +24 -2
  32. mlrun/datastore/storeytargets.py +2 -3
  33. mlrun/datastore/utils.py +15 -3
  34. mlrun/db/base.py +47 -19
  35. mlrun/db/httpdb.py +120 -56
  36. mlrun/db/nopdb.py +38 -10
  37. mlrun/execution.py +70 -19
  38. mlrun/hub/__init__.py +15 -0
  39. mlrun/hub/module.py +181 -0
  40. mlrun/k8s_utils.py +105 -16
  41. mlrun/launcher/base.py +13 -6
  42. mlrun/launcher/local.py +15 -0
  43. mlrun/model.py +24 -3
  44. mlrun/model_monitoring/__init__.py +1 -0
  45. mlrun/model_monitoring/api.py +66 -27
  46. mlrun/model_monitoring/applications/__init__.py +1 -1
  47. mlrun/model_monitoring/applications/base.py +509 -117
  48. mlrun/model_monitoring/applications/context.py +2 -4
  49. mlrun/model_monitoring/applications/results.py +4 -7
  50. mlrun/model_monitoring/controller.py +239 -101
  51. mlrun/model_monitoring/db/_schedules.py +116 -33
  52. mlrun/model_monitoring/db/_stats.py +4 -3
  53. mlrun/model_monitoring/db/tsdb/base.py +100 -9
  54. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +11 -6
  55. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +191 -50
  56. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  57. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  58. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +259 -40
  59. mlrun/model_monitoring/helpers.py +54 -9
  60. mlrun/model_monitoring/stream_processing.py +45 -14
  61. mlrun/model_monitoring/writer.py +220 -1
  62. mlrun/platforms/__init__.py +3 -2
  63. mlrun/platforms/iguazio.py +7 -3
  64. mlrun/projects/operations.py +6 -1
  65. mlrun/projects/pipelines.py +46 -26
  66. mlrun/projects/project.py +166 -58
  67. mlrun/run.py +94 -17
  68. mlrun/runtimes/__init__.py +18 -0
  69. mlrun/runtimes/base.py +14 -6
  70. mlrun/runtimes/daskjob.py +7 -0
  71. mlrun/runtimes/local.py +5 -2
  72. mlrun/runtimes/mounts.py +20 -2
  73. mlrun/runtimes/mpijob/abstract.py +6 -0
  74. mlrun/runtimes/mpijob/v1.py +6 -0
  75. mlrun/runtimes/nuclio/__init__.py +1 -0
  76. mlrun/runtimes/nuclio/application/application.py +149 -17
  77. mlrun/runtimes/nuclio/function.py +76 -27
  78. mlrun/runtimes/nuclio/serving.py +97 -15
  79. mlrun/runtimes/pod.py +234 -21
  80. mlrun/runtimes/remotesparkjob.py +6 -0
  81. mlrun/runtimes/sparkjob/spark3job.py +6 -0
  82. mlrun/runtimes/utils.py +49 -11
  83. mlrun/secrets.py +54 -13
  84. mlrun/serving/__init__.py +2 -0
  85. mlrun/serving/remote.py +79 -6
  86. mlrun/serving/routers.py +23 -41
  87. mlrun/serving/server.py +320 -80
  88. mlrun/serving/states.py +725 -157
  89. mlrun/serving/steps.py +62 -0
  90. mlrun/serving/system_steps.py +200 -119
  91. mlrun/serving/v2_serving.py +9 -10
  92. mlrun/utils/helpers.py +288 -88
  93. mlrun/utils/logger.py +3 -1
  94. mlrun/utils/notifications/notification/base.py +18 -0
  95. mlrun/utils/notifications/notification/git.py +2 -4
  96. mlrun/utils/notifications/notification/slack.py +2 -4
  97. mlrun/utils/notifications/notification/webhook.py +2 -5
  98. mlrun/utils/notifications/notification_pusher.py +1 -1
  99. mlrun/utils/retryer.py +15 -2
  100. mlrun/utils/version/version.json +2 -2
  101. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +45 -51
  102. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +106 -101
  103. mlrun/api/schemas/__init__.py +0 -259
  104. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
  105. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
  106. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
  107. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
@@ -17,25 +17,35 @@ import socket
  from abc import ABC, abstractmethod
  from collections import defaultdict
  from collections.abc import Iterator
- from contextlib import contextmanager
- from datetime import datetime, timedelta
+ from contextlib import contextmanager, nullcontext
+ from datetime import datetime, timedelta, timezone
  from typing import Any, Literal, Optional, Union, cast

  import pandas as pd

  import mlrun
  import mlrun.common.constants as mlrun_constants
+ import mlrun.common.helpers
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
+ import mlrun.common.types
  import mlrun.datastore.datastore_profile as ds_profile
  import mlrun.errors
  import mlrun.model_monitoring.api as mm_api
  import mlrun.model_monitoring.applications.context as mm_context
  import mlrun.model_monitoring.applications.results as mm_results
+ import mlrun.model_monitoring.db._schedules as mm_schedules
  import mlrun.model_monitoring.helpers as mm_helpers
+ import mlrun.utils
  from mlrun.serving.utils import MonitoringApplicationToDict
  from mlrun.utils import logger


+ class ExistingDataHandling(mlrun.common.types.StrEnum):
+     fail_on_overlap = "fail_on_overlap"
+     skip_overlap = "skip_overlap"
+     delete_all = "delete_all"
+
+
  def _serialize_context_and_result(
      *,
      context: mm_context.MonitoringApplicationContext,
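
The new `ExistingDataHandling` options are string-valued enum members, so callers can pass either the member or its string value. A minimal standalone sketch of the same pattern, using the standard library's `enum` rather than `mlrun.common.types.StrEnum` (which serves the same purpose):

```python
from enum import Enum


class ExistingDataHandling(str, Enum):
    # Same members as in the diff above; the (str, Enum) base makes each member
    # compare equal to its string value.
    fail_on_overlap = "fail_on_overlap"
    skip_overlap = "skip_overlap"
    delete_all = "delete_all"


assert ExistingDataHandling.skip_overlap == "skip_overlap"
assert ExistingDataHandling("delete_all") is ExistingDataHandling.delete_all
```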
@@ -183,16 +193,47 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          cls,
          *,
          write_output: bool,
+         application_name: str,
+         artifact_path: str,
          stream_profile: Optional[ds_profile.DatastoreProfile],
          project: "mlrun.MlrunProject",
-     ) -> Iterator[dict[str, list[tuple]]]:
-         endpoints_output: dict[str, list[tuple]] = defaultdict(list)
+     ) -> Iterator[
+         tuple[
+             dict[str, list[tuple]],
+             Optional[mm_schedules.ModelMonitoringSchedulesFileApplication],
+         ]
+     ]:
+         endpoints_output: dict[
+             str,
+             list[
+                 tuple[
+                     mm_context.MonitoringApplicationContext,
+                     Union[
+                         mm_results.ModelMonitoringApplicationResult,
+                         mm_results.ModelMonitoringApplicationMetric,
+                         list[
+                             Union[
+                                 mm_results.ModelMonitoringApplicationResult,
+                                 mm_results.ModelMonitoringApplicationMetric,
+                                 mm_results._ModelMonitoringApplicationStats,
+                             ]
+                         ],
+                     ],
+                 ]
+             ],
+         ] = defaultdict(list)
+         application_schedules = nullcontext()
          if write_output:
              cls._check_writer_is_up(project)
+             application_schedules = (
+                 mm_schedules.ModelMonitoringSchedulesFileApplication(
+                     artifact_path, application=application_name
+                 )
+             )
          try:
-             yield endpoints_output
+             yield endpoints_output, application_schedules.__enter__()
          finally:
-             if write_output:
+             if write_output and any(endpoints_output.values()):
                  logger.debug(
                      "Pushing model monitoring application job data to the writer stream",
                      passed_stream_profile=str(stream_profile),
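
The `application_schedules = nullcontext()` default above lets the same enter/exit flow run whether or not a schedules file is actually opened. A standalone sketch of that idea, with illustrative names (not mlrun's API):

```python
from collections import defaultdict
from contextlib import contextmanager, nullcontext


@contextmanager
def schedules_file(path: str):
    # Stand-in for a schedules file object: "load" on enter, "save" on exit.
    state = {"path": path, "last_analyzed": {}}
    yield state
    print(f"saving schedules to {state['path']}")


def run(write_output: bool):
    # nullcontext() yields None, so one code path covers the no-write case too.
    cm = schedules_file("schedules.json") if write_output else nullcontext()
    with cm as schedules:
        if schedules is not None:
            schedules["last_analyzed"]["endpoint-1"] = "2024-01-01T00:00:00+00:00"


run(write_output=True)   # prints the "saving" line
run(write_output=False)  # no side effects
```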
@@ -206,11 +247,21 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                      profile=stream_profile,
                  )
                  for endpoint_id, outputs in endpoints_output.items():
+                     writer_events = []
+                     for ctx, res in outputs:
+                         if isinstance(res, list):
+                             writer_events.extend(
+                                 _serialize_context_and_result(
+                                     context=ctx, result=sub_res
+                                 )
+                                 for sub_res in res
+                             )
+                         else:
+                             writer_events.append(
+                                 _serialize_context_and_result(context=ctx, result=res)
+                             )
                      writer_stream.push(
-                         [
-                             _serialize_context_and_result(context=ctx, result=res)
-                             for ctx, res in outputs
-                         ],
+                         writer_events,
                          partition_key=endpoint_id,
                      )
                  logger.debug(
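
The added loop flattens each endpoint's outputs into one writer event per result, whether `do_tracking` returned a single result or a list of results. A self-contained sketch of that flattening (`serialize` is a stand-in for `_serialize_context_and_result`):

```python
from typing import Any


def serialize(context: str, result: Any) -> dict:
    # Stand-in serializer: one event per (context, result) pair
    return {"context": context, "result": result}


def to_writer_events(outputs: list[tuple[str, Any]]) -> list[dict]:
    writer_events: list[dict] = []
    for ctx, res in outputs:
        if isinstance(res, list):
            writer_events.extend(serialize(ctx, sub_res) for sub_res in res)
        else:
            writer_events.append(serialize(ctx, res))
    return writer_events


# A single result and a list of two results produce three events in total:
assert len(to_writer_events([("c1", "r1"), ("c2", ["r2", "r3"])])) == 3
```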
@@ -218,6 +269,20 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                      endpoints_output=endpoints_output,
                  )

+             logger.debug(
+                 "Saving the application schedules",
+                 application_name=application_name,
+             )
+             application_schedules.__exit__(None, None, None)
+
+     @classmethod
+     def _get_application_name(cls, context: "mlrun.MLClientCtx") -> str:
+         """Get the application name from the context via the function URI"""
+         _, application_name, _, _ = mlrun.common.helpers.parse_versioned_object_uri(
+             context.to_dict().get("spec", {}).get("function", "")
+         )
+         return application_name
+
      def _handler(
          self,
          context: "mlrun.MLClientCtx",
@@ -230,6 +295,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          end: Optional[str] = None,
          base_period: Optional[int] = None,
          write_output: bool = False,
+         existing_data_handling: ExistingDataHandling = ExistingDataHandling.fail_on_overlap,
          stream_profile: Optional[ds_profile.DatastoreProfile] = None,
      ):
          """
@@ -250,6 +316,8 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                  "working with endpoints, without any custom data-frame input"
              )

+         application_name = self._get_application_name(context)
+
          feature_stats = (
              mm_api.get_sample_set_statistics(reference_data)
              if reference_data is not None
@@ -257,24 +325,18 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          )

          with self._push_to_writer(
-             write_output=write_output, stream_profile=stream_profile, project=project
-         ) as endpoints_output:
+             write_output=write_output,
+             stream_profile=stream_profile,
+             application_name=application_name,
+             artifact_path=context.artifact_path,
+             project=project,
+         ) as (endpoints_output, application_schedules):

-             def call_do_tracking(event: Optional[dict] = None):
+             def call_do_tracking(
+                 monitoring_context: mm_context.MonitoringApplicationContext,
+             ):
                  nonlocal endpoints_output

-                 if event is None:
-                     event = {}
-                 monitoring_context = (
-                     mm_context.MonitoringApplicationContext._from_ml_ctx(
-                         event=event,
-                         application_name=self.__class__.__name__,
-                         context=context,
-                         project=project,
-                         sample_df=sample_data,
-                         feature_stats=feature_stats,
-                     )
-                 )
                  result = self.do_tracking(monitoring_context)
                  endpoints_output[monitoring_context.endpoint_id].append(
                      (monitoring_context, result)
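
The `with self._push_to_writer(...) as (endpoints_output, application_schedules):` change relies on the context manager yielding a tuple that is unpacked directly in the `with` statement. A minimal illustrative sketch of that pattern (names here are not mlrun's):

```python
from collections import defaultdict
from contextlib import contextmanager


@contextmanager
def push_to_writer(write_output: bool):
    endpoints_output: dict[str, list[tuple]] = defaultdict(list)
    schedules = {} if write_output else None
    try:
        # Yield a tuple so the caller can unpack both objects in one `with`
        yield endpoints_output, schedules
    finally:
        if write_output and any(endpoints_output.values()):
            print(f"pushing {sum(len(v) for v in endpoints_output.values())} events")


with push_to_writer(write_output=True) as (endpoints_output, schedules):
    endpoints_output["endpoint-1"].append(("ctx", "result"))
```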
@@ -282,119 +344,383 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                  return result

              if endpoints is not None:
-                 resolved_endpoints = self._handle_endpoints_type_evaluate(
+                 resolved_endpoints = self._normalize_and_validate_endpoints(
                      project=project, endpoints=endpoints
                  )
-                 for window_start, window_end in self._window_generator(
-                     start, end, base_period
+                 if (
+                     write_output
+                     and existing_data_handling == ExistingDataHandling.delete_all
                  ):
-                     for endpoint_name, endpoint_id in resolved_endpoints:
-                         result = call_do_tracking(
-                             event={
-                                 mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
-                                 mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
-                                 mm_constants.ApplicationEvent.START_INFER_TIME: window_start,
-                                 mm_constants.ApplicationEvent.END_INFER_TIME: window_end,
-                             }
-                         )
+                     endpoint_ids = [
+                         endpoint_id for _, endpoint_id in resolved_endpoints
+                     ]
+                     context.logger.info(
+                         "Deleting all the application data before running the application",
+                         application_name=application_name,
+                         endpoint_ids=endpoint_ids,
+                     )
+                     self._delete_application_data(
+                         project_name=project.name,
+                         application_name=application_name,
+                         endpoint_ids=endpoint_ids,
+                         application_schedules=application_schedules,
+                     )
+                 for endpoint_name, endpoint_id in resolved_endpoints:
+                     for monitoring_ctx in self._window_generator(
+                         start=start,
+                         end=end,
+                         base_period=base_period,
+                         application_schedules=application_schedules,
+                         endpoint_id=endpoint_id,
+                         endpoint_name=endpoint_name,
+                         application_name=application_name,
+                         existing_data_handling=existing_data_handling,
+                         sample_data=sample_data,
+                         context=context,
+                         project=project,
+                     ):
+                         result = call_do_tracking(monitoring_ctx)
                          result_key = (
-                             f"{endpoint_name}-{endpoint_id}_{window_start.isoformat()}_{window_end.isoformat()}"
-                             if window_start and window_end
+                             f"{endpoint_name}-{endpoint_id}_{monitoring_ctx.start_infer_time.isoformat()}_{monitoring_ctx.end_infer_time.isoformat()}"
+                             if monitoring_ctx.start_infer_time
+                             and monitoring_ctx.end_infer_time
                              else f"{endpoint_name}-{endpoint_id}"
                          )

                          context.log_result(
                              result_key, self._flatten_data_result(result)
                          )
+                 # Check if no result was produced for any endpoint (e.g., due to no data in all windows)
+                 if not any(endpoints_output.values()):
+                     context.logger.warning(
+                         "No data was found for any of the specified endpoints. "
+                         "No results were produced",
+                         application_name=application_name,
+                         endpoints=endpoints,
+                         start=start,
+                         end=end,
+                     )
              else:
-                 return self._flatten_data_result(call_do_tracking())
+                 result = call_do_tracking(
+                     mm_context.MonitoringApplicationContext._from_ml_ctx(
+                         context=context,
+                         project=project,
+                         application_name=application_name,
+                         event={},
+                         sample_df=sample_data,
+                         feature_stats=feature_stats,
+                     )
+                 )
+                 return self._flatten_data_result(result)

      @staticmethod
-     def _handle_endpoints_type_evaluate(
+     def _check_endpoints_first_request(
+         endpoints: list[mlrun.common.schemas.ModelEndpoint],
+     ) -> None:
+         """Make sure that all the endpoints have had at least one request"""
+         endpoints_no_requests = [
+             (endpoint.metadata.name, endpoint.metadata.uid)
+             for endpoint in endpoints
+             if not endpoint.status.first_request
+         ]
+         if endpoints_no_requests:
+             raise mlrun.errors.MLRunValueError(
+                 "The following model endpoints have not had any requests yet and "
+                 "have no data, cannot run the model monitoring application on them: "
+                 f"{endpoints_no_requests}"
+             )
+
+     @classmethod
+     def _normalize_and_validate_endpoints(
+         cls,
          project: "mlrun.MlrunProject",
          endpoints: Union[
              list[tuple[str, str]], list[list[str]], list[str], Literal["all"]
          ],
-     ) -> Union[list[tuple[str, str]], list[list[str]]]:
-         if not endpoints:
-             raise mlrun.errors.MLRunValueError(
-                 "The endpoints list cannot be empty. If you want to run on all the endpoints, "
-                 'use `endpoints="all"`.'
-             )
-
-         if isinstance(endpoints, list) and isinstance(endpoints[0], (tuple, list)):
-             return endpoints
-
-         if not (isinstance(endpoints, list) and isinstance(endpoints[0], str)):
-             if isinstance(endpoints, str):
-                 if endpoints != "all":
-                     raise mlrun.errors.MLRunValueError(
-                         'A string input for `endpoints` can only be "all" for all the model endpoints in '
-                         "the project. If you want to select a single model endpoint with the given name, "
-                         f'use a list: `endpoints=["{endpoints}"]`.'
+     ) -> list[tuple[str, str]]:
+         if isinstance(endpoints, list):
+             if all(
+                 isinstance(endpoint, (tuple, list)) and len(endpoint) == 2
+                 for endpoint in endpoints
+             ):
+                 # A list of [(name, uid), ...] / [[name, uid], ...] tuples/lists
+                 endpoint_uids_to_names = {
+                     endpoint[1]: endpoint[0] for endpoint in endpoints
+                 }
+                 endpoints_list = project.list_model_endpoints(
+                     uids=list(endpoint_uids_to_names.keys()), latest_only=True
+                 ).endpoints
+
+                 # Check for missing endpoint uids or name/uid mismatches
+                 for endpoint in endpoints_list:
+                     if (
+                         endpoint_uids_to_names[cast(str, endpoint.metadata.uid)]
+                         != endpoint.metadata.name
+                     ):
+                         raise mlrun.errors.MLRunNotFoundError(
+                             "Could not find model endpoint with name "
+                             f"'{endpoint_uids_to_names[cast(str, endpoint.metadata.uid)]}' "
+                             f"and uid '{endpoint.metadata.uid}'"
+                         )
+                 missing = set(endpoint_uids_to_names.keys()) - {
+                     cast(str, endpoint.metadata.uid) for endpoint in endpoints_list
+                 }
+                 if missing:
+                     raise mlrun.errors.MLRunNotFoundError(
+                         "Could not find model endpoints with the following uids: "
+                         f"{missing}"
                      )
-             else:
-                 raise mlrun.errors.MLRunValueError(
-                     f"Could not resolve endpoints as list of [(name, uid)], {endpoints=}"
-                 )

-         if endpoints == "all":
-             endpoint_names = None
-         else:
-             endpoint_names = endpoints
-
-         endpoints_list = project.list_model_endpoints(
-             names=endpoint_names, latest_only=True
-         ).endpoints
-         if endpoints_list:
-             list_endpoints_result = [
-                 (endpoint.metadata.name, endpoint.metadata.uid)
-                 for endpoint in endpoints_list
-             ]
-             if endpoints != "all":
+             elif all(isinstance(endpoint, str) for endpoint in endpoints):
+                 # A list of [name, ...] strings
+                 endpoint_names = cast(list[str], endpoints)
+                 endpoints_list = project.list_model_endpoints(
+                     names=endpoint_names, latest_only=True
+                 ).endpoints
+
+                 # Check for missing endpoint names
                  missing = set(endpoints) - {
-                     endpoint[0] for endpoint in list_endpoints_result
+                     endpoint.metadata.name for endpoint in endpoints_list
                  }
                  if missing:
                      logger.warning(
                          "Could not list all the required endpoints",
-                         missing_endpoint=missing,
-                         endpoints=list_endpoints_result,
+                         missing_endpoints=missing,
+                         endpoints_list=endpoints_list,
                      )
-             return list_endpoints_result
+             else:
+                 raise mlrun.errors.MLRunValueError(
+                     "Could not resolve the following list as a list of endpoints:\n"
+                     f"{endpoints}\n"
+                     "The list must be either a list of (name, uid) tuples/lists or a list of names."
+                 )
+         elif endpoints == "all":
+             endpoints_list = project.list_model_endpoints(latest_only=True).endpoints
+         elif isinstance(endpoints, str):
+             raise mlrun.errors.MLRunValueError(
+                 'A string input for `endpoints` can only be "all" for all the model endpoints in '
+                 "the project. If you want to select a single model endpoint with the given name, "
+                 f'use a list: `endpoints=["{endpoints}"]`.'
+             )
          else:
-             if endpoints != "all":
-                 err_msg_suffix = f" named '{endpoints}'"
+             raise mlrun.errors.MLRunValueError(
+                 "Could not resolve the `endpoints` parameter. The parameter must be either:\n"
+                 "- a list of (name, uid) tuples/lists\n"
+                 "- a list of names\n"
+                 '- the string "all" for all the model endpoints in the project.'
+             )
+
+         if not endpoints_list:
              raise mlrun.errors.MLRunNotFoundError(
-                 f"Did not find any model endpoints {err_msg_suffix}"
+                 f"Did not find any model endpoints {endpoints=}"
              )

+         cls._check_endpoints_first_request(endpoints_list)
+
+         return [
+             (endpoint.metadata.name, cast(str, endpoint.metadata.uid))
+             for endpoint in endpoints_list
+         ]
+
      @staticmethod
+     def _validate_and_get_window_length(
+         *, base_period: int, start_dt: datetime, end_dt: datetime
+     ) -> timedelta:
+         if not isinstance(base_period, int) or base_period <= 0:
+             raise mlrun.errors.MLRunValueError(
+                 "`base_period` must be a nonnegative integer - the number of minutes in a monitoring window"
+             )
+
+         window_length = timedelta(minutes=base_period)
+
+         full_interval_length = end_dt - start_dt
+         remainder = full_interval_length % window_length
+         if remainder:
+             if full_interval_length < window_length:
+                 extra_msg = (
+                     "The `base_period` is longer than the difference between `end` and `start`: "
+                     f"{full_interval_length}. Consider not specifying `base_period`."
+                 )
+             else:
+                 extra_msg = (
+                     f"Consider changing the `end` time to `end`={end_dt - remainder}"
+                 )
+             raise mlrun.errors.MLRunValueError(
+                 "The difference between `end` and `start` must be a multiple of `base_period`: "
+                 f"`base_period`={window_length}, `start`={start_dt}, `end`={end_dt}. "
+                 f"{extra_msg}"
+             )
+         return window_length
+
+     @staticmethod
+     def _validate_monotonically_increasing_data(
+         *,
+         application_schedules: Optional[
+             mm_schedules.ModelMonitoringSchedulesFileApplication
+         ],
+         endpoint_id: str,
+         start_dt: datetime,
+         end_dt: datetime,
+         base_period: Optional[int],
+         application_name: str,
+         existing_data_handling: ExistingDataHandling,
+     ) -> datetime:
+         """Make sure that the (app, endpoint) pair doesn't write output before the last analyzed window"""
+         if application_schedules:
+             last_analyzed = application_schedules.get_endpoint_last_analyzed(
+                 endpoint_id
+             )
+             if last_analyzed:
+                 if start_dt < last_analyzed:
+                     if existing_data_handling == ExistingDataHandling.skip_overlap:
+                         if last_analyzed < end_dt and base_period is None:
+                             logger.warn(
+                                 "Setting the start time to last_analyzed since the original start time precedes "
+                                 "last_analyzed",
+                                 original_start=start_dt,
+                                 new_start=last_analyzed,
+                                 application_name=application_name,
+                                 endpoint_id=endpoint_id,
+                             )
+                             start_dt = last_analyzed
+                         else:
+                             raise mlrun.errors.MLRunValueError(
+                                 "The start time for the application and endpoint precedes the last analyzed time: "
+                                 f"start_dt='{start_dt}', last_analyzed='{last_analyzed}', {application_name=}, "
+                                 f"{endpoint_id=}. "
+                                 "Writing data out of order is not supported, and the start time could not be "
+                                 "dynamically reset, as last_analyzed is later than the given end time or that "
+                                 f"base_period was specified (end_dt='{end_dt}', {base_period=})."
+                             )
+                     else:
+                         raise mlrun.errors.MLRunValueError(
+                             "The start time for the application and endpoint precedes the last analyzed time: "
+                             f"start_dt='{start_dt}', last_analyzed='{last_analyzed}', {application_name=}, "
+                             f"{endpoint_id=}. "
+                             "Writing data out of order is not supported. You should change the start time to "
+                             f"'{last_analyzed}' or later."
+                         )
+             else:
+                 logger.debug(
+                     "The application is running on the endpoint for the first time",
+                     endpoint_id=endpoint_id,
+                     start_dt=start_dt,
+                     application_name=application_name,
+                 )
+         return start_dt
+
+     @staticmethod
+     def _delete_application_data(
+         project_name: str,
+         application_name: str,
+         endpoint_ids: list[str],
+         application_schedules: Optional[
+             mm_schedules.ModelMonitoringSchedulesFileApplication
+         ],
+     ) -> None:
+         mlrun.get_run_db().delete_model_monitoring_metrics(
+             project=project_name,
+             application_name=application_name,
+             endpoint_ids=endpoint_ids,
+         )
+         if application_schedules:
+             application_schedules.delete_endpoints_last_analyzed(
+                 endpoint_uids=endpoint_ids
+             )
+
+     @classmethod
      def _window_generator(
-         start: Optional[str], end: Optional[str], base_period: Optional[int]
-     ) -> Iterator[tuple[Optional[datetime], Optional[datetime]]]:
+         cls,
+         *,
+         start: Optional[str],
+         end: Optional[str],
+         base_period: Optional[int],
+         application_schedules: Optional[
+             mm_schedules.ModelMonitoringSchedulesFileApplication
+         ],
+         endpoint_name: str,
+         endpoint_id: str,
+         application_name: str,
+         existing_data_handling: ExistingDataHandling,
+         context: "mlrun.MLClientCtx",
+         project: "mlrun.MlrunProject",
+         sample_data: Optional[pd.DataFrame],
+     ) -> Iterator[mm_context.MonitoringApplicationContext]:
+         def yield_monitoring_ctx(
+             window_start: Optional[datetime], window_end: Optional[datetime]
+         ) -> Iterator[mm_context.MonitoringApplicationContext]:
+             ctx = mm_context.MonitoringApplicationContext._from_ml_ctx(
+                 event={
+                     mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
+                     mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
+                     mm_constants.ApplicationEvent.START_INFER_TIME: window_start,
+                     mm_constants.ApplicationEvent.END_INFER_TIME: window_end,
+                 },
+                 application_name=application_name,
+                 context=context,
+                 project=project,
+                 sample_df=sample_data,
+             )
+
+             if ctx.sample_df.empty:
+                 # The current sample is empty
+                 context.logger.debug(
+                     "No sample data available for tracking",
+                     application_name=application_name,
+                     endpoint_id=ctx.endpoint_id,
+                     start_time=ctx.start_infer_time,
+                     end_time=ctx.end_infer_time,
+                 )
+                 return
+
+             yield ctx
+
+             if application_schedules and window_end:
+                 application_schedules.update_endpoint_last_analyzed(
+                     endpoint_uid=endpoint_id, last_analyzed=window_end
+                 )
+
          if start is None or end is None:
              # A single window based on the `sample_data` input - see `_handler`.
-             yield None, None
+             yield from yield_monitoring_ctx(None, None)
              return

          start_dt = datetime.fromisoformat(start)
          end_dt = datetime.fromisoformat(end)

+         # If `start_dt` and `end_dt` do not include time zone information - change them to UTC
+         if (start_dt.tzinfo is None) and (end_dt.tzinfo is None):
+             start_dt = start_dt.replace(tzinfo=timezone.utc)
+             end_dt = end_dt.replace(tzinfo=timezone.utc)
+         elif (start_dt.tzinfo is None) or (end_dt.tzinfo is None):
+             raise mlrun.errors.MLRunValueError(
+                 "The start and end times must either both include time zone information or both be naive (no time "
+                 f"zone). Asserting the above failed, aborting the evaluate request: start={start}, end={end}."
+             )
+
+         if existing_data_handling != ExistingDataHandling.delete_all:
+             start_dt = cls._validate_monotonically_increasing_data(
+                 application_schedules=application_schedules,
+                 endpoint_id=endpoint_id,
+                 start_dt=start_dt,
+                 end_dt=end_dt,
+                 base_period=base_period,
+                 application_name=application_name,
+                 existing_data_handling=existing_data_handling,
+             )
+
          if base_period is None:
-             yield start_dt, end_dt
+             yield from yield_monitoring_ctx(start_dt, end_dt)
              return

-         if not isinstance(base_period, int) or base_period <= 0:
-             raise mlrun.errors.MLRunValueError(
-                 "`base_period` must be a nonnegative integer - the number of minutes in a monitoring window"
-             )
+         window_length = cls._validate_and_get_window_length(
+             base_period=base_period, start_dt=start_dt, end_dt=end_dt
+         )

-         window_length = timedelta(minutes=base_period)
          current_start_time = start_dt
          while current_start_time < end_dt:
              current_end_time = min(current_start_time + window_length, end_dt)
-             yield current_start_time, current_end_time
+             yield from yield_monitoring_ctx(current_start_time, current_end_time)
              current_start_time = current_end_time

      @classmethod
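
Taken together, the new `_window_generator` normalizes naive `start`/`end` values to UTC, requires `end - start` to be a multiple of `base_period`, and yields equal-length left-open windows. A standalone sketch of those rules (this mirrors the logic above; it is not the mlrun implementation):

```python
from collections.abc import Iterator
from datetime import datetime, timedelta, timezone


def windows(start: str, end: str, base_period: int) -> Iterator[tuple[datetime, datetime]]:
    start_dt, end_dt = datetime.fromisoformat(start), datetime.fromisoformat(end)
    if start_dt.tzinfo is None and end_dt.tzinfo is None:
        # Naive timestamps are interpreted as UTC
        start_dt = start_dt.replace(tzinfo=timezone.utc)
        end_dt = end_dt.replace(tzinfo=timezone.utc)
    window = timedelta(minutes=base_period)
    if (end_dt - start_dt) % window:
        raise ValueError("`end - start` must be a multiple of `base_period`")
    current = start_dt
    while current < end_dt:
        yield current, current + window
        current += window


# Four 6-hour windows covering a full day:
print(list(windows("2024-01-01T00:00:00", "2024-01-02T00:00:00", base_period=360)))
```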
@@ -445,6 +771,45 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          """
          return f"{handler_to_class}::{cls._handler.__name__}"

+     @classmethod
+     def _determine_job_name(
+         cls,
+         *,
+         func_name: Optional[str],
+         class_handler: Optional[str],
+         handler_to_class: str,
+     ) -> str:
+         """
+         Determine the batch app's job name. This name is used also as the application name,
+         which is retrieved in `_get_application_name`.
+         """
+         if func_name:
+             job_name = func_name
+         else:
+             if not class_handler:
+                 class_name = cls.__name__
+             else:
+                 class_name = handler_to_class.split(".")[-1].split("::")[0]
+
+             job_name = mlrun.utils.normalize_name(class_name)
+
+         if not mm_constants.APP_NAME_REGEX.fullmatch(job_name):
+             raise mlrun.errors.MLRunValueError(
+                 "The function name does not comply with the required pattern "
+                 f"`{mm_constants.APP_NAME_REGEX.pattern}`. "
+                 "Please choose another `func_name`."
+             )
+         job_name, was_renamed, suffix = mlrun.utils.helpers.ensure_batch_job_suffix(
+             job_name
+         )
+         if was_renamed:
+             mlrun.utils.logger.info(
+                 f'Changing function name - adding `"{suffix}"` suffix',
+                 func_name=job_name,
+             )
+
+         return job_name
+
      @classmethod
      def to_job(
          cls,
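
`_determine_job_name` normalizes the class name and guarantees a `-batch` suffix before using it as both the job name and the application name. An illustrative sketch of that naming rule (the real code delegates to `mlrun.utils.normalize_name` and the new `mlrun.utils.helpers.ensure_batch_job_suffix`; the helpers below are simplified stand-ins):

```python
import re


def normalize_name(name: str) -> str:
    # CamelCase -> dash-separated lower case (simplified)
    return re.sub(r"(?<!^)(?=[A-Z])", "-", name).lower()


def ensure_batch_suffix(job_name: str) -> tuple[str, bool]:
    # Append "-batch" only when it is not already present
    if job_name.endswith("-batch"):
        return job_name, False
    return f"{job_name}-batch", True


name, renamed = ensure_batch_suffix(normalize_name("DriftDetectionApp"))
assert (name, renamed) == ("drift-detection-app-batch", True)
```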
@@ -484,6 +849,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          * ``end``, ``datetime``
          * ``base_period``, ``int``
          * ``write_output``, ``bool``
+         * ``existing_data_handling``, ``str``

          For Git sources, add the source archive to the returned job and change the handler:

@@ -502,7 +868,10 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
              :py:class:`~mlrun.model_monitoring.applications.ModelMonitoringApplicationBase`,
              is used.
          :param func_path: The path to the function. If ``None``, the current notebook is used.
-         :param func_name: The name of the function. If not ``None``, the class name is used.
+         :param func_name: The name of the function. If ``None``, the normalized class name is used
+             (:py:meth:`mlrun.utils.helpers.normalize_name`).
+             A ``"-batch"`` suffix is guaranteed to be added if not already there.
+             The function name is also used as the application name to use for the results.
          :param tag: Tag for the function.
          :param image: Docker image to run the job on (when running remotely).
          :param with_repo: Whether to clone the current repo to the build source.
@@ -523,12 +892,11 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          handler_to_class = class_handler or cls.__name__
          handler = cls.get_job_handler(handler_to_class)

-         if not class_handler:
-             class_name = cls.__name__
-         else:
-             class_name = handler_to_class.split(".")[-1].split("::")[-1]
-
-         job_name = func_name if func_name else class_name
+         job_name = cls._determine_job_name(
+             func_name=func_name,
+             class_handler=class_handler,
+             handler_to_class=handler_to_class,
+         )

          job = cast(
              mlrun.runtimes.KubejobRuntime,
@@ -567,6 +935,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          end: Optional[datetime] = None,
          base_period: Optional[int] = None,
          write_output: bool = False,
+         existing_data_handling: ExistingDataHandling = ExistingDataHandling.fail_on_overlap,
          stream_profile: Optional[ds_profile.DatastoreProfile] = None,
      ) -> "mlrun.RunObject":
          """
@@ -574,11 +943,14 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          :py:meth:`~mlrun.model_monitoring.applications.ModelMonitoringApplicationBase.do_tracking`
          model monitoring logic as a :py:class:`~mlrun.runtimes.KubejobRuntime`, which is an MLRun function.

-         This function has default values for all of its arguments. You should be change them when you want to pass
+         This function has default values for all of its arguments. You should change them when you want to pass
          data to the application.

          :param func_path: The path to the function. If ``None``, the current notebook is used.
-         :param func_name: The name of the function. If not ``None``, the class name is used.
+         :param func_name: The name of the function. If ``None``, the normalized class name is used
+             (:py:meth:`mlrun.utils.helpers.normalize_name`).
+             A ``"-batch"`` suffix is guaranteed to be added if not already there.
+             The function name is also used as the application name to use for the results.
          :param tag: Tag for the function.
          :param run_local: Whether to run the function locally or remotely.
          :param auto_build: Whether to auto build the function.
@@ -588,6 +960,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          :param reference_data: Pandas data-frame or :py:class:`~mlrun.artifacts.dataset.DatasetArtifact` URI as
              the reference dataset.
              When set, its statistics override the model endpoint's feature statistics.
+             You do not need to have a model endpoint to use this option.
          :param image: Docker image to run the job on (when running remotely).
          :param with_repo: Whether to clone the current repo to the build source.
          :param class_handler: The relative path to the class, useful when using Git sources or code from images.
@@ -608,6 +981,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
          :param start: The start time of the endpoint's data, not included.
              If you want the model endpoint's data at ``start`` included, you need to subtract a
              small ``datetime.timedelta`` from it.
+             Make sure to include the time zone when constructing ``datetime.datetime`` objects
+             manually. When both ``start`` and ``end`` times do not include a time zone, they will
+             be treated as UTC.
          :param end: The end time of the endpoint's data, included.
              Please note: when ``start`` and ``end`` are set, they create a left-open time interval
              ("window") :math:`(\\operatorname{start}, \\operatorname{end}]` that excludes the
@@ -616,17 +992,31 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
              taken in the window's data.
          :param base_period: The window length in minutes. If ``None``, the whole window from ``start`` to ``end``
              is taken. If an integer is specified, the application is run from ``start`` to ``end``
-             in ``base_period`` length windows, except for the last window that ends at ``end`` and
-             therefore may be shorter:
+             in ``base_period`` length windows:
              :math:`(\\operatorname{start}, \\operatorname{start} + \\operatorname{base\\_period}],
              (\\operatorname{start} + \\operatorname{base\\_period},
              \\operatorname{start} + 2\\cdot\\operatorname{base\\_period}],
              ..., (\\operatorname{start} +
-             m\\cdot\\operatorname{base\\_period}, \\operatorname{end}]`,
-             where :math:`m` is some positive integer.
+             (m - 1)\\cdot\\operatorname{base\\_period}, \\operatorname{end}]`,
+             where :math:`m` is a positive integer and :math:`\\operatorname{end} =
+             \\operatorname{start} + m\\cdot\\operatorname{base\\_period}`.
+             Please note that the difference between ``end`` and ``start`` must be a multiple of
+             ``base_period``.
          :param write_output: Whether to write the results and metrics to the time-series DB. Can be ``True`` only
              if ``endpoints`` are passed.
              Note: the model monitoring infrastructure must be up for the writing to work.
+         :param existing_data_handling:
+             How to handle the existing application data for the model endpoints when writing
+             new data whose requested ``start`` time precedes the ``end`` time of a previous run
+             that also wrote to the database. Relevant only when ``write_output=True``.
+             The options are:
+
+             - ``"fail_on_overlap"``: Default. An error is raised.
+             - ``"skip_overlap"``: the overlapping data is ignored and the
+               time window is cut so that it starts at the earliest possible time after ``start``.
+             - ``"delete_all"``: delete all the data that was written by the application to the
+               model endpoints, regardless of the time window, and write the new data.
+
          :param stream_profile: The stream datastore profile. It should be provided only when running locally and
              writing the outputs to the database (i.e., when both ``run_local`` and
              ``write_output`` are set to ``True``).
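
Putting the documented parameters together, a hedged usage sketch of `evaluate` with timezone-aware boundaries, a `base_period` that divides the interval evenly, and an explicit overlap policy. The application class, endpoint name, and project name below are placeholders, not part of the package:

```python
from datetime import datetime, timezone

import mlrun
from mlrun.model_monitoring.applications import ModelMonitoringApplicationBase


class MyDriftApp(ModelMonitoringApplicationBase):
    def do_tracking(self, monitoring_context):
        ...  # placeholder; a real application returns monitoring results/metrics


# Assumes an active MLRun project context
mlrun.get_or_create_project("my-project")

# 24 hours split into four 6-hour windows; naive datetimes would be treated as UTC
run = MyDriftApp.evaluate(
    func_name="my-drift-app",            # a "-batch" suffix is appended automatically
    endpoints=["my-endpoint"],           # or [(name, uid), ...] or "all"
    start=datetime(2024, 5, 1, tzinfo=timezone.utc),
    end=datetime(2024, 5, 2, tzinfo=timezone.utc),
    base_period=360,
    write_output=True,
    existing_data_handling="skip_overlap",  # string value of ExistingDataHandling.skip_overlap
)
```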
@@ -665,17 +1055,6 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
              )
              params["end"] = end.isoformat() if isinstance(end, datetime) else end
              params["base_period"] = base_period
-             params["write_output"] = write_output
-             if stream_profile:
-                 if not run_local:
-                     raise mlrun.errors.MLRunValueError(
-                         "Passing a `stream_profile` is relevant only when running locally"
-                     )
-                 if not write_output:
-                     raise mlrun.errors.MLRunValueError(
-                         "Passing a `stream_profile` is relevant only when writing the outputs"
-                     )
-                 params["stream_profile"] = stream_profile
          elif start or end or base_period:
              raise mlrun.errors.MLRunValueError(
                  "Custom `start` and `end` times or base_period are supported only with endpoints data"
@@ -685,6 +1064,19 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                  "Writing the application output or passing `stream_profile` are supported only with endpoints data"
              )

+         params["write_output"] = write_output
+         params["existing_data_handling"] = existing_data_handling
+         if stream_profile:
+             if not run_local:
+                 raise mlrun.errors.MLRunValueError(
+                     "Passing a `stream_profile` is relevant only when running locally"
+                 )
+             if not write_output:
+                 raise mlrun.errors.MLRunValueError(
+                     "Passing a `stream_profile` is relevant only when writing the outputs"
+                 )
+             params["stream_profile"] = stream_profile
+
          inputs: dict[str, str] = {}
          for data, identifier in [
              (sample_data, "sample_data"),