mlrun 1.10.0rc13__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (107)
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/base.py +0 -31
  3. mlrun/artifacts/document.py +6 -1
  4. mlrun/artifacts/llm_prompt.py +123 -25
  5. mlrun/artifacts/manager.py +0 -5
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/common/constants.py +10 -1
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/common/model_monitoring/helpers.py +86 -0
  10. mlrun/common/schemas/__init__.py +3 -0
  11. mlrun/common/schemas/auth.py +2 -0
  12. mlrun/common/schemas/function.py +10 -0
  13. mlrun/common/schemas/hub.py +30 -18
  14. mlrun/common/schemas/model_monitoring/__init__.py +3 -0
  15. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  16. mlrun/common/schemas/model_monitoring/functions.py +14 -5
  17. mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -0
  18. mlrun/common/schemas/pipeline.py +1 -1
  19. mlrun/common/schemas/serving.py +3 -0
  20. mlrun/common/schemas/workflow.py +3 -1
  21. mlrun/common/secrets.py +22 -1
  22. mlrun/config.py +33 -11
  23. mlrun/datastore/__init__.py +11 -3
  24. mlrun/datastore/azure_blob.py +162 -47
  25. mlrun/datastore/datastore.py +9 -4
  26. mlrun/datastore/datastore_profile.py +61 -5
  27. mlrun/datastore/model_provider/huggingface_provider.py +363 -0
  28. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  29. mlrun/datastore/model_provider/model_provider.py +230 -65
  30. mlrun/datastore/model_provider/openai_provider.py +295 -42
  31. mlrun/datastore/s3.py +24 -2
  32. mlrun/datastore/storeytargets.py +2 -3
  33. mlrun/datastore/utils.py +15 -3
  34. mlrun/db/base.py +47 -19
  35. mlrun/db/httpdb.py +120 -56
  36. mlrun/db/nopdb.py +38 -10
  37. mlrun/execution.py +70 -19
  38. mlrun/hub/__init__.py +15 -0
  39. mlrun/hub/module.py +181 -0
  40. mlrun/k8s_utils.py +105 -16
  41. mlrun/launcher/base.py +13 -6
  42. mlrun/launcher/local.py +15 -0
  43. mlrun/model.py +24 -3
  44. mlrun/model_monitoring/__init__.py +1 -0
  45. mlrun/model_monitoring/api.py +66 -27
  46. mlrun/model_monitoring/applications/__init__.py +1 -1
  47. mlrun/model_monitoring/applications/base.py +509 -117
  48. mlrun/model_monitoring/applications/context.py +2 -4
  49. mlrun/model_monitoring/applications/results.py +4 -7
  50. mlrun/model_monitoring/controller.py +239 -101
  51. mlrun/model_monitoring/db/_schedules.py +116 -33
  52. mlrun/model_monitoring/db/_stats.py +4 -3
  53. mlrun/model_monitoring/db/tsdb/base.py +100 -9
  54. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +11 -6
  55. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +191 -50
  56. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  57. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  58. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +259 -40
  59. mlrun/model_monitoring/helpers.py +54 -9
  60. mlrun/model_monitoring/stream_processing.py +45 -14
  61. mlrun/model_monitoring/writer.py +220 -1
  62. mlrun/platforms/__init__.py +3 -2
  63. mlrun/platforms/iguazio.py +7 -3
  64. mlrun/projects/operations.py +6 -1
  65. mlrun/projects/pipelines.py +46 -26
  66. mlrun/projects/project.py +166 -58
  67. mlrun/run.py +94 -17
  68. mlrun/runtimes/__init__.py +18 -0
  69. mlrun/runtimes/base.py +14 -6
  70. mlrun/runtimes/daskjob.py +7 -0
  71. mlrun/runtimes/local.py +5 -2
  72. mlrun/runtimes/mounts.py +20 -2
  73. mlrun/runtimes/mpijob/abstract.py +6 -0
  74. mlrun/runtimes/mpijob/v1.py +6 -0
  75. mlrun/runtimes/nuclio/__init__.py +1 -0
  76. mlrun/runtimes/nuclio/application/application.py +149 -17
  77. mlrun/runtimes/nuclio/function.py +76 -27
  78. mlrun/runtimes/nuclio/serving.py +97 -15
  79. mlrun/runtimes/pod.py +234 -21
  80. mlrun/runtimes/remotesparkjob.py +6 -0
  81. mlrun/runtimes/sparkjob/spark3job.py +6 -0
  82. mlrun/runtimes/utils.py +49 -11
  83. mlrun/secrets.py +54 -13
  84. mlrun/serving/__init__.py +2 -0
  85. mlrun/serving/remote.py +79 -6
  86. mlrun/serving/routers.py +23 -41
  87. mlrun/serving/server.py +320 -80
  88. mlrun/serving/states.py +725 -157
  89. mlrun/serving/steps.py +62 -0
  90. mlrun/serving/system_steps.py +200 -119
  91. mlrun/serving/v2_serving.py +9 -10
  92. mlrun/utils/helpers.py +288 -88
  93. mlrun/utils/logger.py +3 -1
  94. mlrun/utils/notifications/notification/base.py +18 -0
  95. mlrun/utils/notifications/notification/git.py +2 -4
  96. mlrun/utils/notifications/notification/slack.py +2 -4
  97. mlrun/utils/notifications/notification/webhook.py +2 -5
  98. mlrun/utils/notifications/notification_pusher.py +1 -1
  99. mlrun/utils/retryer.py +15 -2
  100. mlrun/utils/version/version.json +2 -2
  101. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +45 -51
  102. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +106 -101
  103. mlrun/api/schemas/__init__.py +0 -259
  104. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
  105. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
  106. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
  107. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py

@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from io import StringIO
-from typing import Callable, Literal, Optional, Union
+from typing import Literal, Optional, Union
 
 import pandas as pd
 import v3io_frames
@@ -25,6 +25,7 @@ import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.feature_store.steps
 import mlrun.utils.v3io_clients
 from mlrun.common.schemas import EventFieldType
+from mlrun.config import config
 from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.model_monitoring.helpers import get_invocations_fqn, get_start_end
 from mlrun.utils import logger
@@ -369,6 +370,49 @@ class V3IOTSDBConnector(TSDBConnector):
         apply_storey_filter()
         apply_tsdb_target(name="tsdb3", after="FilterNotNone")
 
+    def apply_writer_steps(self, graph, after, **kwargs) -> None:
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_metrics",
+            after=after,
+            path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.METRICS]}",
+            time_col=mm_schemas.WriterEvent.END_INFER_TIME,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            infer_columns_from_data=True,
+            graph_shape="cylinder",
+            index_cols=[
+                mm_schemas.WriterEvent.APPLICATION_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_ID,
+                mm_schemas.MetricData.METRIC_NAME,
+            ],
+            max_events=config.model_endpoint_monitoring.writer_graph.max_events,
+            flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_app_results",
+            after=after,
+            path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]}",
+            time_col=mm_schemas.WriterEvent.END_INFER_TIME,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            infer_columns_from_data=True,
+            graph_shape="cylinder",
+            index_cols=[
+                mm_schemas.WriterEvent.APPLICATION_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_ID,
+                mm_schemas.ResultData.RESULT_NAME,
+            ],
+            max_events=config.model_endpoint_monitoring.writer_graph.max_events,
+            flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
     def handle_model_error(
         self,
         graph,
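
Note: both new TSDB targets read their batching thresholds from the shared writer-graph configuration. A minimal sketch of inspecting them, assuming a default-initialized config (`mlrun.mlconf` is the same object as `mlrun.config.config`); the printed values depend on the deployment:

    import mlrun

    # Batching knobs for the monitoring writer graph: a buffered batch
    # is flushed to the TSDB when either threshold is crossed.
    writer_cfg = mlrun.mlconf.model_endpoint_monitoring.writer_graph
    print(writer_cfg.max_events)           # flush after this many events
    print(writer_cfg.flush_after_seconds)  # ...or after this many seconds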
@@ -492,7 +536,8 @@ class V3IOTSDBConnector(TSDBConnector):
         # Split the endpoint ids into chunks to avoid exceeding the v3io-engine filter-expression limit
         for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
             endpoint_id_chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
-            filter_query = f"endpoint_id IN({str(endpoint_id_chunk)[1:-1]}) "
+            endpoints_list = "', '".join(endpoint_id_chunk)
+            filter_query = f"endpoint_id IN('{endpoints_list}')"
             for table in tables:
                 try:
                     self.frames_client.delete(
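
Note: the rewritten query builder quotes each endpoint id explicitly instead of leaning on the list's repr. A standalone sketch of the strings both versions produce (the ids are made up):

    endpoint_id_chunk = ["ep-1", "ep-2"]

    # old: slice the brackets off the list's repr (quoting is implicit)
    old = f"endpoint_id IN({str(endpoint_id_chunk)[1:-1]}) "
    # -> "endpoint_id IN('ep-1', 'ep-2') "

    # new: join and quote the ids explicitly
    endpoints_list = "', '".join(endpoint_id_chunk)
    new = f"endpoint_id IN('{endpoints_list}')"
    # -> "endpoint_id IN('ep-1', 'ep-2')"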
@@ -532,6 +577,43 @@ class V3IOTSDBConnector(TSDBConnector):
                 project=self.project,
             )
 
+    def delete_application_records(
+        self, application_name: str, endpoint_ids: Optional[list[str]] = None
+    ) -> None:
+        """
+        Delete application records from the TSDB for the given model endpoints or all if ``endpoint_ids`` is ``None``.
+        """
+        base_filter_query = f"application_name=='{application_name}'"
+
+        filter_queries: list[str] = []
+        if endpoint_ids:
+            for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
+                endpoint_id_chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
+                endpoints_list = "', '".join(endpoint_id_chunk)
+                filter_queries.append(
+                    f"{base_filter_query} AND endpoint_id IN ('{endpoints_list}')"
+                )
+        else:
+            filter_queries = [base_filter_query]
+
+        for table in [
+            self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS],
+            self.tables[mm_schemas.V3IOTSDBTables.METRICS],
+        ]:
+            logger.debug(
+                "Deleting application records from TSDB",
+                table=table,
+                filter_queries=filter_queries,
+                project=self.project,
+            )
+            for filter_query in filter_queries:
+                self.frames_client.delete(
+                    backend=_TSDB_BE,
+                    table=table,
+                    filter=filter_query,
+                    start="0",
+                )
+
     def get_model_endpoint_real_time_metrics(
         self, endpoint_id: str, metrics: list[str], start: str, end: str
     ) -> dict[str, list[tuple[str, float]]]:
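
Note: a hypothetical call to the new method, assuming an already-constructed V3IOTSDBConnector instance named `connector` (names are illustrative):

    # Remove one application's results and metrics for two endpoints;
    # omitting endpoint_ids (None) targets all endpoints instead.
    connector.delete_application_records(
        application_name="my-drift-app",
        endpoint_ids=["ep-1", "ep-2"],
    )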
@@ -935,6 +1017,9 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> dict[str, float]:
+        if not endpoint_ids:
+            return {}
+
         # Get the last request timestamp for each endpoint from the KV table.
         # The result of the query is a list of dictionaries,
         # each dictionary contains the endpoint id and the last request timestamp.
@@ -1145,11 +1230,9 @@ class V3IOTSDBConnector(TSDBConnector):
         )
         return df.reset_index(drop=True)
 
-    async def add_basic_metrics(
+    def add_basic_metrics(
         self,
         model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
-        project: str,
-        run_in_threadpool: Callable,
         metric_list: Optional[list[str]] = None,
     ) -> list[mlrun.common.schemas.ModelEndpoint]:
         """
@@ -1157,8 +1240,6 @@ class V3IOTSDBConnector(TSDBConnector):
 
         :param model_endpoint_objects: A list of `ModelEndpoint` objects that will
                                        be filled with the relevant basic metrics.
-        :param project: The name of the project.
-        :param run_in_threadpool: A function that runs another function in a thread pool.
         :param metric_list: List of metrics to include from the time series DB. Defaults to all metrics.
 
         :return: A list of `ModelEndpointMonitoringMetric` objects.
@@ -1187,8 +1268,7 @@ class V3IOTSDBConnector(TSDBConnector):
             function,
             _,
         ) in metric_name_to_function_and_column_name.items():
-            metric_name_to_result[metric_name] = await run_in_threadpool(
-                function,
+            metric_name_to_result[metric_name] = function(
                 endpoint_ids=uids,
                 get_raw=True,
             )
@@ -1259,7 +1339,7 @@ class V3IOTSDBConnector(TSDBConnector):
         else:
             filter_query = app_filter_query
 
-        df = self._get_records(
+        raw_frames: list[v3io_frames.client.RawFrame] = self._get_records(
             table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
             start=start,
             end=end,
@@ -1268,39 +1348,33 @@ class V3IOTSDBConnector(TSDBConnector):
                 mm_schemas.ResultData.RESULT_STATUS,
             ],
             filter_query=filter_query,
+            get_raw=True,
         )
 
-        # filter result status
-        if result_status_list and not df.empty:
-            df = df[df[mm_schemas.ResultData.RESULT_STATUS].isin(result_status_list)]
-
-        if df.empty:
+        if not raw_frames:
             return {}
-        else:
-            # convert application name to lower case
-            df[mm_schemas.ApplicationEvent.APPLICATION_NAME] = df[
-                mm_schemas.ApplicationEvent.APPLICATION_NAME
-            ].str.lower()
-
-            df = (
-                df[
-                    [
-                        mm_schemas.ApplicationEvent.APPLICATION_NAME,
-                        mm_schemas.ResultData.RESULT_STATUS,
-                        mm_schemas.ResultData.RESULT_VALUE,
-                    ]
-                ]
-                .groupby(
-                    [
-                        mm_schemas.ApplicationEvent.APPLICATION_NAME,
-                        mm_schemas.ResultData.RESULT_STATUS,
-                    ],
-                    observed=True,
-                )
-                .count()
-            )
 
-        return df[mm_schemas.ResultData.RESULT_VALUE].to_dict()
+        # Count occurrences by (application_name, result_status) from RawFrame objects
+        count_dict = {}
+
+        for frame in raw_frames:
+            # Extract column data from each RawFrame
+            app_name = frame.column_data(mm_schemas.ApplicationEvent.APPLICATION_NAME)[
+                0
+            ]
+            statuses = frame.column_data(mm_schemas.ResultData.RESULT_STATUS)
+
+            for status in statuses:
+                # Filter by result status if specified
+                if result_status_list and status not in result_status_list:
+                    continue
+
+                # Convert application name to lower case
+                key = (app_name.lower(), status)
+
+                # Update the count in the dictionary
+                count_dict[key] = count_dict.get(key, 0) + 1
+        return count_dict
 
     def count_processed_model_endpoints(
         self,
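
Note: the pandas groupby/count was replaced with a plain dictionary tally over the raw frames. The same pattern in isolation, as a self-contained sketch with dummy (app_name, status) rows:

    rows = [("MyApp", 0), ("myapp", 2), ("MyApp", 2)]

    count_dict = {}
    for app_name, status in rows:
        key = (app_name.lower(), status)  # application names are lower-cased
        count_dict[key] = count_dict.get(key, 0) + 1

    print(count_dict)  # {('myapp', 0): 1, ('myapp', 2): 2}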
@@ -1450,3 +1524,148 @@ class V3IOTSDBConnector(TSDBConnector):
             return metric_objects
 
         return build_metric_objects()
+
+    def get_drift_data(
+        self,
+        start: datetime,
+        end: datetime,
+    ) -> mm_schemas.ModelEndpointDriftValues:
+        table = mm_schemas.V3IOTSDBTables.APP_RESULTS
+        start, end, interval = self._prepare_aligned_start_end(start, end)
+        raw_frames: list[v3io_frames.client.RawFrame] = self._get_records(
+            table=table,
+            start=start,
+            end=end,
+            columns=[mm_schemas.ResultData.RESULT_STATUS],
+            get_raw=True,
+        )
+
+        if not raw_frames:
+            return mm_schemas.ModelEndpointDriftValues(values=[])
+
+        aggregated_data = self._aggregate_raw_drift_data(
+            raw_frames=raw_frames, start=start, end=end, interval=interval
+        )
+        if not aggregated_data:
+            return mm_schemas.ModelEndpointDriftValues(values=[])
+
+        # Filter to only include entries with max result_status >= 1
+        filtered_data = [
+            (endpoint_id, timestamp, max_status)
+            for endpoint_id, timestamp, max_status in aggregated_data
+            if max_status >= 1
+        ]
+
+        if not filtered_data:
+            return mm_schemas.ModelEndpointDriftValues(values=[])
+
+        return self._convert_drift_data_to_values(aggregated_data=filtered_data)
+
+    @staticmethod
+    def _aggregate_raw_drift_data(
+        raw_frames: list[v3io_frames.client.RawFrame],
+        start: datetime,
+        end: datetime,
+        interval: str,
+    ) -> list[tuple[str, datetime, float]]:
+        """
+        Aggregate raw drift data from RawFrame objects.
+
+        :param raw_frames: List of RawFrame objects containing drift data.
+        :param start:      Start datetime for filtering data.
+        :param end:        End datetime for filtering data.
+        :param interval:   Time interval string (e.g., '5min') for aggregation.
+
+        :returns: List of tuples: (endpoint_id, timestamp, max_result_status).
+        """
+        if not raw_frames:
+            return []
+
+        # Parse interval to get timedelta
+        interval_td = pd.Timedelta(interval)
+
+        # Collect all data points from RawFrame objects
+        data_points = []
+        for frame in raw_frames:
+            endpoint_id = frame.column_data(EventFieldType.ENDPOINT_ID)[0]
+            result_statuses = frame.column_data(mm_schemas.ResultData.RESULT_STATUS)
+            timestamps = frame.indices()[0].times
+
+            # Combine data from this frame
+            for i, (status, timestamp) in enumerate(zip(result_statuses, timestamps)):
+                # V3IO TSDB returns timestamps in nanoseconds
+                timestamp_dt = pd.Timestamp(
+                    timestamp, unit="ns", tzinfo=timezone.utc
+                ).to_pydatetime()
+
+                # Filter by time window
+                if start <= timestamp_dt < end:
+                    data_points.append((endpoint_id, timestamp_dt, status))
+
+        if not data_points:
+            return []
+
+        # Group by endpoint_id and time intervals, then find max status
+        # Create time buckets aligned to start
+        grouped_data = {}
+        for endpoint_id, timestamp, status in data_points:
+            # Calculate which interval bucket this timestamp falls into
+            time_diff = timestamp - start
+            bucket_index = int(time_diff / interval_td)
+            bucket_start = start + (bucket_index * interval_td)
+
+            key = (endpoint_id, bucket_start)
+            if key not in grouped_data:
+                grouped_data[key] = status
+            else:
+                # Keep the maximum status value
+                grouped_data[key] = max(grouped_data[key], status)
+
+        # Convert to list of tuples
+        result = [
+            (endpoint_id, timestamp, max_status)
+            for (endpoint_id, timestamp), max_status in grouped_data.items()
+        ]
+
+        return result
+
+    @staticmethod
+    def _convert_drift_data_to_values(
+        aggregated_data: list[tuple[str, datetime, float]],
+    ) -> mm_schemas.ModelEndpointDriftValues:
+        """
+        Convert aggregated drift data to ModelEndpointDriftValues format.
+
+        :param aggregated_data: List of tuples (endpoint_id, timestamp, max_result_status).
+        :return: ModelEndpointDriftValues with counts of suspected and detected per timestamp.
+        """
+        suspected_val = mm_schemas.constants.ResultStatusApp.potential_detection.value
+        detected_val = mm_schemas.constants.ResultStatusApp.detected.value
+
+        # Group by timestamp and result status, then count occurrences
+        timestamp_status_counts = {}
+        for _, timestamp, max_status in aggregated_data:
+            key = (timestamp, max_status)
+            timestamp_status_counts[key] = timestamp_status_counts.get(key, 0) + 1
+
+        # Organize by timestamp with counts for suspected and detected
+        timestamp_counts = {}
+        for (timestamp, status), count in timestamp_status_counts.items():
+            if timestamp not in timestamp_counts:
+                timestamp_counts[timestamp] = {
+                    "count_suspected": 0,
+                    "count_detected": 0,
+                }
+
+            if status == suspected_val:
+                timestamp_counts[timestamp]["count_suspected"] = count
+            elif status == detected_val:
+                timestamp_counts[timestamp]["count_detected"] = count
+
+        # Convert to the expected format: list of (timestamp, count_suspected, count_detected)
+        values = [
+            (timestamp, counts["count_suspected"], counts["count_detected"])
+            for timestamp, counts in sorted(timestamp_counts.items())
+        ]
+
+        return mm_schemas.ModelEndpointDriftValues(values=values)
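
Note: `_aggregate_raw_drift_data` aligns every sample to a fixed grid of `interval`-sized buckets anchored at `start`. A worked example of the index arithmetic (standalone sketch, with an assumed 5-minute interval):

    from datetime import datetime, timedelta, timezone

    import pandas as pd

    start = datetime(2025, 1, 1, tzinfo=timezone.utc)
    interval_td = pd.Timedelta("5min")

    ts = start + timedelta(minutes=12)  # 12 minutes past the grid origin
    bucket_index = int((ts - start) / interval_td)     # -> 2
    bucket_start = start + bucket_index * interval_td  # -> the 00:10 bucket

    print(bucket_index, bucket_start)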
mlrun/model_monitoring/helpers.py

@@ -143,7 +143,7 @@ def get_stream_path(
         return stream_uri.replace("v3io://", f"ds://{profile.name}")
 
     elif isinstance(
-        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
     ):
         topic = mlrun.common.model_monitoring.helpers.get_kafka_topic(
             project=project, function_name=function_name
@@ -152,7 +152,7 @@ def get_stream_path(
     else:
         raise mlrun.errors.MLRunValueError(
             f"Received an unexpected stream profile type: {type(profile)}\n"
-            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
         )
 
 
@@ -300,7 +300,7 @@ def _get_v3io_output_stream(
 
 def _get_kafka_output_stream(
     *,
-    kafka_profile: mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource,
+    kafka_profile: mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream,
     project: str,
     function_name: str,
     mock: bool = False,
@@ -356,7 +356,7 @@ def get_output_stream(
         )
 
     elif isinstance(
-        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
    ):
         return _get_kafka_output_stream(
             kafka_profile=profile,
@@ -368,7 +368,7 @@ def get_output_stream(
     else:
         raise mlrun.errors.MLRunValueError(
             f"Received an unexpected stream profile type: {type(profile)}\n"
-            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
         )
 
 
@@ -549,6 +549,10 @@ def _get_monitoring_schedules_folder_path(project: str) -> str:
     )
 
 
+def _get_monitoring_schedules_user_folder_path(out_path: str) -> str:
+    return os.path.join(out_path, mm_constants.FileTargetKind.MONITORING_SCHEDULES)
+
+
 def _get_monitoring_schedules_file_endpoint_path(
     *, project: str, endpoint_id: str
 ) -> str:
@@ -570,10 +574,7 @@ def get_monitoring_schedules_endpoint_data(
     )
 
 
-def get_monitoring_schedules_chief_data(
-    *,
-    project: str,
-) -> "DataItem":
+def get_monitoring_schedules_chief_data(*, project: str) -> "DataItem":
     """
     Get the model monitoring schedules' data item of the project's model endpoint.
     """
@@ -582,6 +583,19 @@ def get_monitoring_schedules_chief_data(
     )
 
 
+def get_monitoring_schedules_user_application_data(
+    *, out_path: str, application: str
+) -> "DataItem":
+    """
+    Get the model monitoring schedules' data item of user application runs.
+    """
+    return mlrun.datastore.store_manager.object(
+        _get_monitoring_schedules_file_user_application_path(
+            out_path=out_path, application=application
+        )
+    )
+
+
 def _get_monitoring_schedules_file_chief_path(
     *,
     project: str,
@@ -591,6 +605,14 @@ def _get_monitoring_schedules_file_chief_path(
     )
 
 
+def _get_monitoring_schedules_file_user_application_path(
+    *, out_path: str, application: str
+) -> str:
+    return os.path.join(
+        _get_monitoring_schedules_user_folder_path(out_path), f"{application}.json"
+    )
+
+
 def get_start_end(
     start: Union[datetime.datetime, None],
     end: Union[datetime.datetime, None],
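
Note: chaining the two new helpers, the per-application schedules file lands under the user's out path. A sketch of the resulting path, assuming `mm_constants.FileTargetKind.MONITORING_SCHEDULES` resolves to the string "monitoring-schedules" (an illustrative value, not verified here):

    import os

    out_path = "v3io:///projects/my-proj/artifacts"
    application = "my-app"

    # mirrors _get_monitoring_schedules_file_user_application_path
    path = os.path.join(out_path, "monitoring-schedules", f"{application}.json")
    print(path)  # v3io:///projects/my-proj/artifacts/monitoring-schedules/my-app.json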
@@ -637,3 +659,26 @@ def get_start_end(
         )
 
     return start, end
+
+
+def validate_time_range(
+    start: Optional[datetime.datetime] = None, end: Optional[datetime.datetime] = None
+) -> tuple[datetime.datetime, datetime.datetime]:
+    """
+    Validate the start and end parameters and set default values if needed.
+
+    :param start: Either None or a datetime; None is handled as datetime.now(tz=timezone.utc) - timedelta(days=1).
+    :param end:   Either None or a datetime; None is handled as datetime.now(tz=timezone.utc).
+    :return: Start datetime, end datetime.
+    """
+    end = end or mlrun.utils.helpers.datetime_now()
+    start = start or (end - datetime.timedelta(days=1))
+    if start.tzinfo is None or end.tzinfo is None:
+        raise mlrun.errors.MLRunInvalidArgumentTypeError(
+            "Custom start and end times must contain the timezone."
+        )
+    if start > end:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            "The start time must be before the end time. Note that if end time is not provided, "
+            "the current time is used by default."
+        )
+    return start, end
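
Note: usage follows the contract above; missing bounds default to the last 24 hours and naive datetimes are rejected. A minimal sketch, assuming `validate_time_range` is in scope and that `mlrun.utils.helpers.datetime_now` returns a timezone-aware UTC timestamp:

    from datetime import datetime, timedelta, timezone

    end = datetime.now(tz=timezone.utc)
    start = end - timedelta(hours=6)

    validate_time_range(start, end)  # ok: both bounds are timezone-aware
    validate_time_range()            # ok: defaults to (now - 1 day, now)
    validate_time_range(start=datetime(2025, 1, 1))  # raises: naive datetime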
mlrun/model_monitoring/stream_processing.py

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import asyncio
 import datetime
 import typing
 
@@ -134,6 +134,9 @@ class EventStreamProcessor:
         the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
         using CE, the parquet target path is based on the defined MLRun artifact path.
 
+        In a separate branch, "batch complete" events are forwarded to the controller stream with an intentional delay,
+        to allow for data to first be written to parquet.
+
         :param fn: A serving function.
         :param tsdb_connector: Time series database connector.
         :param controller_stream_uri: The controller stream URI. Runs on server api pod so needed to be provided as
@@ -145,6 +148,20 @@ class EventStreamProcessor:
             fn.set_topology(mlrun.serving.states.StepKinds.flow, engine="async"),
         )
 
+        # forward batch-complete events back to the controller
+        graph.add_step(
+            "storey.Filter",
+            "FilterBatchComplete",
+            _fn="(event.get('kind') == 'batch_complete')",
+        )
+
+        graph.add_step(
+            "Delay",
+            name="BatchDelay",
+            after="FilterBatchComplete",
+            delay=self.parquet_batching_timeout_secs + 5,  # add margin
+        )
+
         # split the graph between event with error vs valid event
         graph.add_step(
             "storey.Filter",
@@ -261,7 +278,7 @@ class EventStreamProcessor:
             "controller_stream",
             path=stream_uri,
             sharding_func=ControllerEvent.ENDPOINT_ID,
-            after="ForwardNOP",
+            after=["ForwardNOP", "BatchDelay"],
             # Force using the pipeline key instead of the one in the profile in case of v3io profile.
             # In case of Kafka, this parameter will be ignored.
             alternative_v3io_access_key="V3IO_ACCESS_KEY",
@@ -309,6 +326,16 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
         return event
 
 
+class Delay(mlrun.feature_store.steps.MapClass):
+    def __init__(self, delay: int, **kwargs):
+        super().__init__(**kwargs)
+        self._delay = delay
+
+    async def do(self, event):
+        await asyncio.sleep(self._delay)
+        return event
+
+
 class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
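
Note: the `Delay` step is an async pass-through that sleeps before emitting. Its core behavior, isolated from the storey/graph machinery (a self-contained sketch, not the step as wired into the flow):

    import asyncio


    class SleepThenEmit:
        """Minimal stand-in for Delay: hold an event, then pass it on."""

        def __init__(self, delay: int):
            self._delay = delay

        async def do(self, event):
            await asyncio.sleep(self._delay)
            return event


    async def main():
        step = SleepThenEmit(delay=1)
        print(await step.do({"kind": "batch_complete"}))  # emitted after ~1s


    asyncio.run(main())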
@@ -369,6 +396,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         request_id = event.get("request", {}).get("id") or event.get("resp", {}).get(
             "id"
         )
+        feature_names = event.get("request", {}).get("input_schema")
+        labels_names = event.get("resp", {}).get("output_schema")
         latency = event.get("microsec")
         features = event.get("request", {}).get("inputs")
         predictions = event.get("resp", {}).get("outputs")
@@ -469,6 +498,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                 ),
                 EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
                 EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
+                EventFieldType.FEATURE_NAMES: feature_names,
+                EventFieldType.LABEL_NAMES: labels_names,
             }
         )
 
@@ -575,19 +606,19 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         self.endpoint_type = {}
 
     def _infer_feature_names_from_data(self, event):
-        for endpoint_id in self.feature_names:
-            if len(self.feature_names[endpoint_id]) >= len(
-                event[EventFieldType.FEATURES]
-            ):
-                return self.feature_names[endpoint_id]
+        endpoint_id = event[EventFieldType.ENDPOINT_ID]
+        if endpoint_id in self.feature_names and len(
+            self.feature_names[endpoint_id]
+        ) >= len(event[EventFieldType.FEATURES]):
+            return self.feature_names[endpoint_id]
         return None
 
     def _infer_label_columns_from_data(self, event):
-        for endpoint_id in self.label_columns:
-            if len(self.label_columns[endpoint_id]) >= len(
-                event[EventFieldType.PREDICTION]
-            ):
-                return self.label_columns[endpoint_id]
+        endpoint_id = event[EventFieldType.ENDPOINT_ID]
+        if endpoint_id in self.label_columns and len(
+            self.label_columns[endpoint_id]
+        ) >= len(event[EventFieldType.PREDICTION]):
+            return self.label_columns[endpoint_id]
         return None
 
     def do(self, event: dict):
@@ -632,7 +663,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 "Feature names are not initialized, they will be automatically generated",
                 endpoint_id=endpoint_id,
             )
-            feature_names = [
+            feature_names = event.get(EventFieldType.FEATURE_NAMES) or [
                 f"f{i}" for i, _ in enumerate(event[EventFieldType.FEATURES])
             ]
 
@@ -655,7 +686,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 "label column names are not initialized, they will be automatically generated",
                 endpoint_id=endpoint_id,
             )
-            label_columns = [
+            label_columns = event.get(EventFieldType.LABEL_NAMES) or [
                 f"p{i}" for i, _ in enumerate(event[EventFieldType.PREDICTION])
             ]
             attributes_to_update[EventFieldType.LABEL_NAMES] = label_columns
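
Note: both name lookups use the same `or` fallback: prefer the schema names carried on the event, else generate positional ones. The pattern in isolation (a sketch; the "label_names" and "prediction" keys stand in for the EventFieldType constants):

    def label_columns_for(event: dict) -> list[str]:
        # prefer names from the event's output schema, if present
        return event.get("label_names") or [
            f"p{i}" for i, _ in enumerate(event["prediction"])
        ]

    print(label_columns_for({"prediction": [0.2, 0.8]}))
    # -> ['p0', 'p1']
    print(label_columns_for({"prediction": [0.2, 0.8], "label_names": ["no", "yes"]}))
    # -> ['no', 'yes']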