mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (135)
  1. mlrun/__main__.py +4 -2
  2. mlrun/alerts/alert.py +75 -8
  3. mlrun/artifacts/base.py +1 -0
  4. mlrun/artifacts/manager.py +9 -2
  5. mlrun/common/constants.py +4 -1
  6. mlrun/common/db/sql_session.py +3 -2
  7. mlrun/common/formatters/__init__.py +1 -0
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
  10. mlrun/common/formatters/run.py +3 -0
  11. mlrun/common/helpers.py +0 -1
  12. mlrun/common/schemas/__init__.py +3 -1
  13. mlrun/common/schemas/alert.py +15 -12
  14. mlrun/common/schemas/api_gateway.py +6 -6
  15. mlrun/common/schemas/auth.py +5 -0
  16. mlrun/common/schemas/client_spec.py +0 -1
  17. mlrun/common/schemas/common.py +7 -4
  18. mlrun/common/schemas/frontend_spec.py +7 -0
  19. mlrun/common/schemas/function.py +7 -0
  20. mlrun/common/schemas/model_monitoring/__init__.py +4 -3
  21. mlrun/common/schemas/model_monitoring/constants.py +41 -26
  22. mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
  23. mlrun/common/schemas/notification.py +69 -12
  24. mlrun/common/schemas/project.py +45 -12
  25. mlrun/common/schemas/workflow.py +10 -2
  26. mlrun/common/types.py +1 -0
  27. mlrun/config.py +91 -35
  28. mlrun/data_types/data_types.py +6 -1
  29. mlrun/data_types/spark.py +2 -2
  30. mlrun/data_types/to_pandas.py +57 -25
  31. mlrun/datastore/__init__.py +1 -0
  32. mlrun/datastore/alibaba_oss.py +3 -2
  33. mlrun/datastore/azure_blob.py +125 -37
  34. mlrun/datastore/base.py +42 -21
  35. mlrun/datastore/datastore.py +4 -2
  36. mlrun/datastore/datastore_profile.py +1 -1
  37. mlrun/datastore/dbfs_store.py +3 -7
  38. mlrun/datastore/filestore.py +1 -3
  39. mlrun/datastore/google_cloud_storage.py +85 -29
  40. mlrun/datastore/inmem.py +4 -1
  41. mlrun/datastore/redis.py +1 -0
  42. mlrun/datastore/s3.py +25 -12
  43. mlrun/datastore/sources.py +76 -4
  44. mlrun/datastore/spark_utils.py +30 -0
  45. mlrun/datastore/storeytargets.py +151 -0
  46. mlrun/datastore/targets.py +102 -131
  47. mlrun/datastore/v3io.py +1 -0
  48. mlrun/db/base.py +15 -6
  49. mlrun/db/httpdb.py +57 -28
  50. mlrun/db/nopdb.py +29 -5
  51. mlrun/errors.py +20 -3
  52. mlrun/execution.py +46 -5
  53. mlrun/feature_store/api.py +25 -1
  54. mlrun/feature_store/common.py +6 -11
  55. mlrun/feature_store/feature_vector.py +3 -1
  56. mlrun/feature_store/retrieval/job.py +4 -1
  57. mlrun/feature_store/retrieval/spark_merger.py +10 -39
  58. mlrun/feature_store/steps.py +8 -0
  59. mlrun/frameworks/_common/plan.py +3 -3
  60. mlrun/frameworks/_ml_common/plan.py +1 -1
  61. mlrun/frameworks/parallel_coordinates.py +2 -3
  62. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  63. mlrun/k8s_utils.py +48 -2
  64. mlrun/launcher/client.py +6 -6
  65. mlrun/launcher/local.py +2 -2
  66. mlrun/model.py +215 -34
  67. mlrun/model_monitoring/api.py +38 -24
  68. mlrun/model_monitoring/applications/__init__.py +1 -2
  69. mlrun/model_monitoring/applications/_application_steps.py +60 -29
  70. mlrun/model_monitoring/applications/base.py +2 -174
  71. mlrun/model_monitoring/applications/context.py +197 -70
  72. mlrun/model_monitoring/applications/evidently_base.py +11 -85
  73. mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
  74. mlrun/model_monitoring/applications/results.py +4 -4
  75. mlrun/model_monitoring/controller.py +110 -282
  76. mlrun/model_monitoring/db/stores/__init__.py +8 -3
  77. mlrun/model_monitoring/db/stores/base/store.py +3 -0
  78. mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
  79. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
  80. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
  81. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
  82. mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
  83. mlrun/model_monitoring/db/tsdb/base.py +147 -15
  84. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
  85. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
  86. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
  87. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
  88. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
  89. mlrun/model_monitoring/helpers.py +70 -50
  90. mlrun/model_monitoring/stream_processing.py +96 -195
  91. mlrun/model_monitoring/writer.py +13 -5
  92. mlrun/package/packagers/default_packager.py +2 -2
  93. mlrun/projects/operations.py +16 -8
  94. mlrun/projects/pipelines.py +126 -115
  95. mlrun/projects/project.py +286 -129
  96. mlrun/render.py +3 -3
  97. mlrun/run.py +38 -19
  98. mlrun/runtimes/__init__.py +19 -8
  99. mlrun/runtimes/base.py +4 -1
  100. mlrun/runtimes/daskjob.py +1 -1
  101. mlrun/runtimes/funcdoc.py +1 -1
  102. mlrun/runtimes/kubejob.py +6 -6
  103. mlrun/runtimes/local.py +12 -5
  104. mlrun/runtimes/nuclio/api_gateway.py +68 -8
  105. mlrun/runtimes/nuclio/application/application.py +307 -70
  106. mlrun/runtimes/nuclio/function.py +63 -14
  107. mlrun/runtimes/nuclio/serving.py +10 -10
  108. mlrun/runtimes/pod.py +25 -19
  109. mlrun/runtimes/remotesparkjob.py +2 -5
  110. mlrun/runtimes/sparkjob/spark3job.py +16 -17
  111. mlrun/runtimes/utils.py +34 -0
  112. mlrun/serving/routers.py +2 -5
  113. mlrun/serving/server.py +37 -19
  114. mlrun/serving/states.py +30 -3
  115. mlrun/serving/v2_serving.py +44 -35
  116. mlrun/track/trackers/mlflow_tracker.py +5 -0
  117. mlrun/utils/async_http.py +1 -1
  118. mlrun/utils/db.py +18 -0
  119. mlrun/utils/helpers.py +150 -36
  120. mlrun/utils/http.py +1 -1
  121. mlrun/utils/notifications/notification/__init__.py +0 -1
  122. mlrun/utils/notifications/notification/webhook.py +8 -1
  123. mlrun/utils/notifications/notification_pusher.py +1 -1
  124. mlrun/utils/v3io_clients.py +2 -2
  125. mlrun/utils/version/version.json +2 -2
  126. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
  127. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
  128. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
  129. mlrun/feature_store/retrieval/conversion.py +0 -271
  130. mlrun/model_monitoring/controller_handler.py +0 -37
  131. mlrun/model_monitoring/evidently_application.py +0 -20
  132. mlrun/model_monitoring/prometheus.py +0 -216
  133. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
  134. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
  135. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from datetime import datetime
+from datetime import datetime, timezone
 from io import StringIO
 from typing import Literal, Optional, Union

@@ -24,6 +24,7 @@ import mlrun.common.model_monitoring
 import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.feature_store.steps
 import mlrun.utils.v3io_clients
+from mlrun.common.schemas import EventFieldType
 from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.model_monitoring.helpers import get_invocations_fqn
 from mlrun.utils import logger
@@ -33,7 +34,7 @@ _TSDB_RATE = "1/s"
 _CONTAINER = "users"


-def _is_no_schema_error(exc: v3io_frames.ReadError) -> bool:
+def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
     """
     In case of a nonexistent TSDB table - a `v3io_frames.ReadError` error is raised.
     Check if the error message contains the relevant string to verify the cause.
@@ -64,14 +65,17 @@ class V3IOTSDBConnector(TSDBConnector):
         self.container = container

         self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
-        self._frames_client: v3io_frames.client.ClientBase = (
-            self._get_v3io_frames_client(self.container)
-        )
-
+        self._frames_client: Optional[v3io_frames.client.ClientBase] = None
         self._init_tables_path()
+        self._create_table = create_table

-        if create_table:
-            self.create_tables()
+    @property
+    def frames_client(self) -> v3io_frames.client.ClientBase:
+        if not self._frames_client:
+            self._frames_client = self._get_v3io_frames_client(self.container)
+            if self._create_table:
+                self.create_tables()
+        return self._frames_client

     def _init_tables_path(self):
         self.tables = {}
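
Note: the hunk above swaps the eagerly constructed v3io-frames client for a lazily initialized `frames_client` property, so the connector can be created without opening a connection and `create_tables()` runs only on first client access. A minimal sketch of the same lazy-client pattern, with generic placeholder names rather than the actual mlrun classes:

    from typing import Optional


    class LazyClientHolder:
        """Illustration of the lazy-client pattern used in the hunk above."""

        def __init__(self, create_table: bool = False) -> None:
            self._client: Optional[object] = None  # no connection is opened here
            self._create_table = create_table

        def _connect(self) -> object:
            # Stand-in for the expensive client construction (e.g. a frames client).
            return object()

        def _create_tables(self) -> None:
            print("one-time setup, runs only on first client access")

        @property
        def client(self) -> object:
            if self._client is None:
                self._client = self._connect()
                if self._create_table:
                    self._create_tables()
            return self._client


    holder = LazyClientHolder(create_table=True)
    _ = holder.client  # first access connects and creates tables
    _ = holder.client  # subsequent accesses reuse the cached client
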
@@ -89,6 +93,19 @@ class V3IOTSDBConnector(TSDBConnector):
         )
         self.tables[mm_schemas.V3IOTSDBTables.EVENTS] = events_path

+        errors_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
+            project=self.project,
+            kind=mm_schemas.FileTargetKind.ERRORS,
+        )
+        (
+            _,
+            _,
+            errors_path,
+        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+            errors_table_full_path
+        )
+        self.tables[mm_schemas.V3IOTSDBTables.ERRORS] = errors_path
+
         monitoring_application_full_path = (
             mlrun.mlconf.get_model_monitoring_file_target_path(
                 project=self.project,
@@ -138,7 +155,7 @@ class V3IOTSDBConnector(TSDBConnector):
         for table_name in application_tables:
             logger.info("Creating table in V3IO TSDB", table_name=table_name)
             table = self.tables[table_name]
-            self._frames_client.create(
+            self.frames_client.create(
                 backend=_TSDB_BE,
                 table=table,
                 if_exists=v3io_frames.IGNORE,
@@ -148,8 +165,9 @@
     def apply_monitoring_stream_steps(
         self,
         graph,
-        tsdb_batching_max_events: int = 10,
-        tsdb_batching_timeout_secs: int = 300,
+        tsdb_batching_max_events: int = 1000,
+        tsdb_batching_timeout_secs: int = 30,
+        sample_window: int = 10,
     ):
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
@@ -171,7 +189,10 @@
             time_col=mm_schemas.EventFieldType.TIMESTAMP,
             container=self.container,
             v3io_frames=self.v3io_framesd,
-            columns=[mm_schemas.EventFieldType.LATENCY],
+            columns=[
+                mm_schemas.EventFieldType.LATENCY,
+                mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+            ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
@@ -182,17 +203,23 @@
             key=mm_schemas.EventFieldType.ENDPOINT_ID,
         )

+        # Emits the event in window size of events based on sample_window size (10 by default)
+        graph.add_step(
+            "storey.steps.SampleWindow",
+            name="sample",
+            after="Rename",
+            window_size=sample_window,
+            key=EventFieldType.ENDPOINT_ID,
+        )
+
         # Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
         # stats and details about the events

-        def apply_process_before_tsdb():
-            graph.add_step(
-                "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ProcessBeforeTSDB",
-                name="ProcessBeforeTSDB",
-                after="sample",
-            )
-
-        apply_process_before_tsdb()
+        graph.add_step(
+            "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ProcessBeforeTSDB",
+            name="ProcessBeforeTSDB",
+            after="sample",
+        )

         # Unpacked keys from each dictionary and write to TSDB target
         def apply_filter_and_unpacked_keys(name, keys):
@@ -255,6 +282,40 @@
         apply_storey_filter()
         apply_tsdb_target(name="tsdb3", after="FilterNotNone")

+    def handle_model_error(
+        self,
+        graph,
+        tsdb_batching_max_events: int = 1000,
+        tsdb_batching_timeout_secs: int = 30,
+        **kwargs,
+    ) -> None:
+        graph.add_step(
+            "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ErrorExtractor",
+            name="error_extractor",
+            after="ForwardError",
+        )
+
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_error",
+            after="error_extractor",
+            path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.ERRORS]}",
+            rate="1/s",
+            time_col=mm_schemas.EventFieldType.TIMESTAMP,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            columns=[
+                mm_schemas.EventFieldType.MODEL_ERROR,
+                mm_schemas.EventFieldType.ERROR_COUNT,
+            ],
+            index_cols=[
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            max_events=tsdb_batching_max_events,
+            flush_after_seconds=tsdb_batching_timeout_secs,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
     def write_application_event(
         self,
         event: dict,
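
Note: the new `handle_model_error` method routes extracted model errors into the new errors TSDB table through a `storey.TSDBTarget`, batched by `max_events` / `flush_after_seconds` (now 1000 events / 30 seconds). A toy sketch of what those two batching knobs mean, purely conceptual and not storey's implementation:

    import time


    class BatchFlusher:
        """Toy model of max_events / flush_after_seconds batching semantics."""

        def __init__(self, max_events: int = 1000, flush_after_seconds: int = 30) -> None:
            self.max_events = max_events
            self.flush_after_seconds = flush_after_seconds
            self._buffer: list[dict] = []
            self._last_flush = time.monotonic()

        def add(self, event: dict) -> None:
            self._buffer.append(event)
            # Real targets also flush on a background timer; here the timeout is only
            # checked when a new event arrives.
            if (
                len(self._buffer) >= self.max_events
                or time.monotonic() - self._last_flush >= self.flush_after_seconds
            ):
                self.flush()

        def flush(self) -> None:
            if self._buffer:
                print(f"writing {len(self._buffer)} buffered events to the TSDB table")
                self._buffer.clear()
            self._last_flush = time.monotonic()
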
@@ -277,12 +338,14 @@
         elif kind == mm_schemas.WriterEventKind.RESULT:
             table = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
             index_cols = index_cols_base + [mm_schemas.ResultData.RESULT_NAME]
-            del event[mm_schemas.ResultData.RESULT_EXTRA_DATA]
+            event.pop(mm_schemas.ResultData.CURRENT_STATS, None)
+            # TODO: remove this when extra data is supported (ML-7460)
+            event.pop(mm_schemas.ResultData.RESULT_EXTRA_DATA, None)
         else:
             raise ValueError(f"Invalid {kind = }")

         try:
-            self._frames_client.write(
+            self.frames_client.write(
                 backend=_TSDB_BE,
                 table=table,
                 dfs=pd.DataFrame.from_records([event]),
@@ -309,7 +372,7 @@
         tables = mm_schemas.V3IOTSDBTables.list()
         for table_to_delete in tables:
             try:
-                self._frames_client.delete(backend=_TSDB_BE, table=table_to_delete)
+                self.frames_client.delete(backend=_TSDB_BE, table=table_to_delete)
             except v3io_frames.DeleteError as e:
                 logger.warning(
                     f"Failed to delete TSDB table '{table}'",
@@ -425,7 +488,7 @@
         aggregators = ",".join(agg_funcs) if agg_funcs else None
         table_path = self.tables[table]
         try:
-            df = self._frames_client.read(
+            df = self.frames_client.read(
                 backend=_TSDB_BE,
                 table=table_path,
                 start=start,
@@ -437,7 +500,7 @@
                 step=sliding_window_step,
                 **kwargs,
             )
-        except v3io_frames.ReadError as err:
+        except v3io_frames.Error as err:
             if _is_no_schema_error(err):
                 return pd.DataFrame()
             else:
@@ -504,10 +567,16 @@
         if type == "metrics":
             table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
             name = mm_schemas.MetricData.METRIC_NAME
+            columns = [mm_schemas.MetricData.METRIC_VALUE]
             df_handler = self.df_to_metrics_values
         elif type == "results":
             table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
             name = mm_schemas.ResultData.RESULT_NAME
+            columns = [
+                mm_schemas.ResultData.RESULT_VALUE,
+                mm_schemas.ResultData.RESULT_STATUS,
+                mm_schemas.ResultData.RESULT_KIND,
+            ]
             df_handler = self.df_to_results_values
         else:
             raise ValueError(f"Invalid {type = }")
@@ -517,11 +586,12 @@
             metric_and_app_names=[(metric.app, metric.name) for metric in metrics],
             table_path=table_path,
             name=name,
+            columns=columns,
         )

         logger.debug("Querying V3IO TSDB", query=query)

-        df: pd.DataFrame = self._frames_client.read(
+        df: pd.DataFrame = self.frames_client.read(
             backend=_TSDB_BE,
             start=start,
             end=end,
@@ -599,7 +669,6 @@
             end=end,
             columns=[mm_schemas.EventFieldType.LATENCY],
             filter_query=f"endpoint_id=='{endpoint_id}'",
-            interval=aggregation_window,
             agg_funcs=agg_funcs,
             sliding_window_step=aggregation_window,
         )
@@ -628,33 +697,153 @@
             ),  # pyright: ignore[reportArgumentType]
         )

-    # Note: this function serves as a reference for checking the TSDB for the existence of a metric.
-    #
-    # def read_prediction_metric_for_endpoint_if_exists(
-    #     self, endpoint_id: str
-    # ) -> Optional[mm_schemas.ModelEndpointMonitoringMetric]:
-    #     """
-    #     Read the count of the latency column in the predictions table for the given endpoint_id.
-    #     We just want to check if there is any data for this endpoint_id.
-    #     """
-    #     query = self._get_sql_query(
-    #         endpoint_id=endpoint_id,
-    #         table_path=self.tables[mm_schemas.FileTargetKind.PREDICTIONS],
-    #         columns=[f"count({mm_schemas.EventFieldType.LATENCY})"],
-    #     )
-    #     try:
-    #         logger.debug("Checking TSDB", project=self.project, query=query)
-    #         df: pd.DataFrame = self._frames_client.read(
-    #             backend=_TSDB_BE, query=query, start="0", end="now"
-    #         )
-    #     except v3io_frames.ReadError as err:
-    #         if _is_no_schema_error(err):
-    #             logger.debug(
-    #                 "No predictions yet", project=self.project, endpoint_id=endpoint_id
-    #             )
-    #             return
-    #         else:
-    #             raise
-    #
-    #     if not df.empty:
-    #         return get_invocations_metric(self.project)
+    def get_last_request(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            start=start,
+            end=end,
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"last({mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP})": mm_schemas.EventFieldType.LAST_REQUEST,
+                    f"last({mm_schemas.EventFieldType.LATENCY})": f"last_{mm_schemas.EventFieldType.LATENCY}",
+                },
+                inplace=True,
+            )
+            df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
+                mm_schemas.EventFieldType.LAST_REQUEST
+            ].map(
+                lambda last_request: datetime.fromtimestamp(
+                    last_request, tz=timezone.utc
+                )
+            )

+        return df.reset_index(drop=True)
+
+    def get_drift_status(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "now-24h",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.ResultData.RESULT_STATUS],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["max"],
+            group_by="endpoint_id",
+        )
+        if not df.empty:
+            df.columns = [
+                col[len("max(") : -1] if "max(" in col else col for col in df.columns
+            ]
+        return df.reset_index(drop=True)
+
+    def get_metrics_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.METRICS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.MetricData.METRIC_VALUE],
+            filter_query=f"endpoint_id=='{endpoint_id}'",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.drop(
+                columns=[f"last({mm_schemas.MetricData.METRIC_VALUE})"], inplace=True
+            )
+        return df.reset_index(drop=True)
+
+    def get_results_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.ResultData.RESULT_KIND,
+            ],
+            filter_query=f"endpoint_id=='{endpoint_id}'",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND
+                },
+                inplace=True,
+            )
+        return df.reset_index(drop=True)
+
+    def get_error_count(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.ERRORS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.EventFieldType.ERROR_COUNT],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["count"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"count({mm_schemas.EventFieldType.ERROR_COUNT})": mm_schemas.EventFieldType.ERROR_COUNT
+                },
+                inplace=True,
+            )
+            df.dropna(inplace=True)
+        return df.reset_index(drop=True)
+
+    def get_avg_latency(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.EventFieldType.LATENCY],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["avg"],
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+        return df.reset_index(drop=True)
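
Note: the added methods above (`get_last_request`, `get_drift_status`, `get_metrics_metadata`, `get_results_metadata`, `get_error_count`, `get_avg_latency`) expose per-endpoint monitoring queries that each return a pandas DataFrame. A hypothetical usage sketch; the constructor arguments, project name, and endpoint IDs are assumptions, not taken from this diff:

    from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import V3IOTSDBConnector

    # Constructor arguments here are assumed - check the class signature before use.
    connector = V3IOTSDBConnector(project="my-project", create_table=False)

    endpoint_ids = ["endpoint-a", "endpoint-b"]  # hypothetical endpoint UIDs
    last_requests = connector.get_last_request(endpoint_ids)
    drift_status = connector.get_drift_status(endpoint_ids, start="now-24h", end="now")
    avg_latency = connector.get_avg_latency(endpoint_ids, start="now-7d", end="now")

    # Each helper returns a (possibly empty) DataFrame keyed by endpoint_id.
    print(last_requests, drift_status, avg_latency, sep="\n")
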
mlrun/model_monitoring/helpers.py

@@ -18,25 +18,23 @@ import typing
 import numpy as np
 import pandas as pd

+if typing.TYPE_CHECKING:
+    from mlrun.db.base import RunDBInterface
+    from mlrun.projects import MlrunProject
+
 import mlrun
+import mlrun.artifacts
 import mlrun.common.model_monitoring.helpers
-import mlrun.common.schemas
-from mlrun.common.schemas.model_monitoring import (
-    EventFieldType,
-)
+import mlrun.common.schemas.model_monitoring.constants as mm_constants
+import mlrun.data_types.infer
+import mlrun.model_monitoring
 from mlrun.common.schemas.model_monitoring.model_endpoints import (
     ModelEndpointMonitoringMetric,
-    ModelEndpointMonitoringMetricType,
     _compose_full_name,
 )
 from mlrun.model_monitoring.model_endpoint import ModelEndpoint
 from mlrun.utils import logger

-if typing.TYPE_CHECKING:
-    from mlrun.db.base import RunDBInterface
-    from mlrun.projects import MlrunProject
-import mlrun.common.schemas.model_monitoring.constants as mm_constants
-


 class _BatchDict(typing.TypedDict):
@@ -45,33 +43,32 @@


 def get_stream_path(
-    project: str = None,
+    project: str,
     function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
+    stream_uri: typing.Optional[str] = None,
 ) -> str:
     """
     Get stream path from the project secret. If wasn't set, take it from the system configurations

     :param project: Project name.
-    :param function_name: Application name. Default is model_monitoring_stream.
+    :param function_name: Application name. Default is model_monitoring_stream.
+    :param stream_uri: Stream URI. If provided, it will be used instead of the one from the project secret.

     :return: Monitoring stream path to the relevant application.
     """

-    stream_uri = mlrun.get_secret_or_env(
-        mlrun.common.schemas.model_monitoring.ProjectSecretKeys.STREAM_PATH
+    stream_uri = stream_uri or mlrun.get_secret_or_env(
+        mm_constants.ProjectSecretKeys.STREAM_PATH
     )

     if not stream_uri or stream_uri == "v3io":
-        # TODO : remove the first part of this condition in 1.9.0
         stream_uri = mlrun.mlconf.get_model_monitoring_file_target_path(
             project=project,
-            kind=mlrun.common.schemas.model_monitoring.FileTargetKind.STREAM,
+            kind=mm_constants.FileTargetKind.STREAM,
             target="online",
             function_name=function_name,
         )

-    if isinstance(stream_uri, list):  # ML-6043 - user side gets only the new stream uri
-        stream_uri = stream_uri[1]  # get new stream path, under projects
     return mlrun.common.model_monitoring.helpers.parse_monitoring_stream_path(
         stream_uri=stream_uri, project=project, function_name=function_name
     )
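
Note: `get_stream_path` now requires an explicit `project` argument and accepts an optional `stream_uri` that bypasses the project-secret lookup. A hedged usage sketch (the URI below is a placeholder value, not a format defined by this diff):

    from mlrun.model_monitoring.helpers import get_stream_path

    # Resolve the stream path from the project secret / system configuration:
    path = get_stream_path(project="my-project")

    # Or pass an explicit URI and skip the secret lookup (placeholder value):
    path = get_stream_path(
        project="my-project",
        stream_uri="v3io:///projects/my-project/model-endpoints/stream-v1",
    )
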
@@ -79,7 +76,7 @@ def get_stream_path(

 def get_monitoring_parquet_path(
     project: "MlrunProject",
-    kind: str = mlrun.common.schemas.model_monitoring.FileTargetKind.PARQUET,
+    kind: str = mm_constants.FileTargetKind.PARQUET,
 ) -> str:
     """Get model monitoring parquet target for the current project and kind. The parquet target path is based on the
     project artifact path. If project artifact path is not defined, the parquet target path will be based on MLRun
@@ -111,12 +108,9 @@ def get_connection_string(secret_provider: typing.Callable[[str], str] = None) -

     """

-    return (
-        mlrun.get_secret_or_env(
-            key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ENDPOINT_STORE_CONNECTION,
-            secret_provider=secret_provider,
-        )
-        or mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection
+    return mlrun.get_secret_or_env(
+        key=mm_constants.ProjectSecretKeys.ENDPOINT_STORE_CONNECTION,
+        secret_provider=secret_provider,
     )


@@ -129,12 +123,9 @@ def get_tsdb_connection_string(
     :return: Valid TSDB connection string.
     """

-    return (
-        mlrun.get_secret_or_env(
-            key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.TSDB_CONNECTION,
-            secret_provider=secret_provider,
-        )
-        or mlrun.mlconf.model_endpoint_monitoring.tsdb_connection
+    return mlrun.get_secret_or_env(
+        key=mm_constants.ProjectSecretKeys.TSDB_CONNECTION,
+        secret_provider=secret_provider,
     )


@@ -184,7 +175,7 @@ def _get_monitoring_time_window_from_controller_run(
 def update_model_endpoint_last_request(
     project: str,
     model_endpoint: ModelEndpoint,
-    current_request: datetime,
+    current_request: datetime.datetime,
     db: "RunDBInterface",
 ) -> None:
     """
@@ -195,7 +186,8 @@
     :param current_request: current request time
     :param db: DB interface.
     """
-    if model_endpoint.spec.stream_path != "":
+    is_model_server_endpoint = model_endpoint.spec.stream_path != ""
+    if is_model_server_endpoint:
         current_request = current_request.isoformat()
         logger.info(
             "Update model endpoint last request time (EP with serving)",
@@ -207,14 +199,15 @@
         db.patch_model_endpoint(
             project=project,
             endpoint_id=model_endpoint.metadata.uid,
-            attributes={EventFieldType.LAST_REQUEST: current_request},
+            attributes={mm_constants.EventFieldType.LAST_REQUEST: current_request},
         )
-    else:
+    else:  # model endpoint without any serving function - close the window "manually"
         try:
             time_window = _get_monitoring_time_window_from_controller_run(project, db)
         except mlrun.errors.MLRunNotFoundError:
-            logger.debug(
-                "Not bumping model endpoint last request time - the monitoring controller isn't deployed yet"
+            logger.warn(
+                "Not bumping model endpoint last request time - the monitoring controller isn't deployed yet.\n"
+                "Call `project.enable_model_monitoring()` first."
             )
             return

@@ -236,7 +229,7 @@
         db.patch_model_endpoint(
             project=project,
             endpoint_id=model_endpoint.metadata.uid,
-            attributes={EventFieldType.LAST_REQUEST: bumped_last_request},
+            attributes={mm_constants.EventFieldType.LAST_REQUEST: bumped_last_request},
         )


@@ -256,12 +249,11 @@ def calculate_inputs_statistics(

     # Use `DFDataInfer` to calculate the statistics over the inputs:
     inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
-        df=inputs,
-        options=mlrun.data_types.infer.InferOptions.Histogram,
+        df=inputs, options=mlrun.data_types.infer.InferOptions.Histogram
     )

     # Recalculate the histograms over the bins that are set in the sample-set of the end point:
-    for feature in inputs_statistics.keys():
+    for feature in list(inputs_statistics):
         if feature in sample_set_statistics:
             counts, bins = np.histogram(
                 inputs[feature].to_numpy(),
@@ -271,13 +263,9 @@
                 counts.tolist(),
                 bins.tolist(),
             ]
-        elif "hist" in inputs_statistics[feature]:
-            # Comply with the other common features' histogram length
-            mlrun.common.model_monitoring.helpers.pad_hist(
-                mlrun.common.model_monitoring.helpers.Histogram(
-                    inputs_statistics[feature]["hist"]
-                )
-            )
+        else:
+            # If the feature is not in the sample set and doesn't have a histogram, remove it from the statistics:
+            inputs_statistics.pop(feature)

     return inputs_statistics

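Note: `calculate_inputs_statistics` now recomputes each shared feature's histogram against the reference bins taken from the sample-set statistics, and drops features that the sample set does not contain. The re-binning step in isolation, with made-up column and bin values:

    import numpy as np
    import pandas as pd

    # Reference bin edges from the sample-set statistics (made-up values):
    reference_bins = [0.0, 0.5, 1.0, 1.5, 2.0]

    # Live inputs for the same feature:
    inputs = pd.DataFrame({"feature_a": [0.1, 0.4, 0.9, 1.2, 1.9]})

    # Recompute the live histogram over the reference bins so the two are comparable:
    counts, bins = np.histogram(inputs["feature_a"].to_numpy(), bins=reference_bins)
    hist = [counts.tolist(), bins.tolist()]
    print(hist)  # [[2, 1, 1, 1], [0.0, 0.5, 1.0, 1.5, 2.0]]
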
@@ -312,7 +300,7 @@ def get_invocations_fqn(project: str) -> str:
         project=project,
         app=mm_constants.SpecialApps.MLRUN_INFRA,
         name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
-        type=ModelEndpointMonitoringMetricType.METRIC,
+        type=mm_constants.ModelEndpointMonitoringMetricType.METRIC,
     )


@@ -326,7 +314,39 @@ def get_invocations_metric(project: str) -> ModelEndpointMonitoringMetric:
     return ModelEndpointMonitoringMetric(
         project=project,
         app=mm_constants.SpecialApps.MLRUN_INFRA,
-        type=ModelEndpointMonitoringMetricType.METRIC,
+        type=mm_constants.ModelEndpointMonitoringMetricType.METRIC,
         name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
         full_name=get_invocations_fqn(project),
     )
+
+
+def enrich_model_endpoint_with_model_uri(
+    model_endpoint: ModelEndpoint,
+    model_obj: mlrun.artifacts.ModelArtifact,
+):
+    """
+    Enrich the model endpoint object with the model uri from the model object. We will use a unique reference
+    to the model object that includes the project, db_key, iter, and tree.
+    In addition, we verify that the model object is of type `ModelArtifact`.
+
+    :param model_endpoint: An object representing the model endpoint that will be enriched with the model uri.
+    :param model_obj: An object representing the model artifact.
+
+    :raise: `MLRunInvalidArgumentError` if the model object is not of type `ModelArtifact`.
+    """
+    mlrun.utils.helpers.verify_field_of_type(
+        field_name="model_endpoint.spec.model_uri",
+        field_value=model_obj,
+        expected_type=mlrun.artifacts.ModelArtifact,
+    )
+
+    # Update model_uri with a unique reference to handle future changes
+    model_artifact_uri = mlrun.utils.helpers.generate_artifact_uri(
+        project=model_endpoint.metadata.project,
+        key=model_obj.db_key,
+        iter=model_obj.iter,
+        tree=model_obj.tree,
+    )
+    model_endpoint.spec.model_uri = mlrun.datastore.get_store_uri(
+        kind=mlrun.utils.helpers.StorePrefix.Model, uri=model_artifact_uri
+    )