mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (167)
  1. mlrun/__init__.py +24 -3
  2. mlrun/__main__.py +0 -4
  3. mlrun/artifacts/dataset.py +2 -2
  4. mlrun/artifacts/document.py +6 -1
  5. mlrun/artifacts/llm_prompt.py +21 -15
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/artifacts/plots.py +1 -1
  8. mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
  9. mlrun/auth/nuclio.py +89 -0
  10. mlrun/auth/providers.py +429 -0
  11. mlrun/auth/utils.py +415 -0
  12. mlrun/common/constants.py +14 -0
  13. mlrun/common/model_monitoring/helpers.py +123 -0
  14. mlrun/common/runtimes/constants.py +28 -0
  15. mlrun/common/schemas/__init__.py +14 -3
  16. mlrun/common/schemas/alert.py +2 -2
  17. mlrun/common/schemas/api_gateway.py +3 -0
  18. mlrun/common/schemas/auth.py +12 -10
  19. mlrun/common/schemas/client_spec.py +4 -0
  20. mlrun/common/schemas/constants.py +25 -0
  21. mlrun/common/schemas/frontend_spec.py +1 -8
  22. mlrun/common/schemas/function.py +34 -0
  23. mlrun/common/schemas/hub.py +33 -20
  24. mlrun/common/schemas/model_monitoring/__init__.py +2 -1
  25. mlrun/common/schemas/model_monitoring/constants.py +12 -15
  26. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  27. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  28. mlrun/common/schemas/pipeline.py +1 -1
  29. mlrun/common/schemas/secret.py +17 -2
  30. mlrun/common/secrets.py +95 -1
  31. mlrun/common/types.py +10 -10
  32. mlrun/config.py +69 -19
  33. mlrun/data_types/infer.py +2 -2
  34. mlrun/datastore/__init__.py +12 -5
  35. mlrun/datastore/azure_blob.py +162 -47
  36. mlrun/datastore/base.py +274 -10
  37. mlrun/datastore/datastore.py +7 -2
  38. mlrun/datastore/datastore_profile.py +84 -22
  39. mlrun/datastore/model_provider/huggingface_provider.py +225 -41
  40. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  41. mlrun/datastore/model_provider/model_provider.py +206 -74
  42. mlrun/datastore/model_provider/openai_provider.py +226 -66
  43. mlrun/datastore/s3.py +39 -18
  44. mlrun/datastore/sources.py +1 -1
  45. mlrun/datastore/store_resources.py +4 -4
  46. mlrun/datastore/storeytargets.py +17 -12
  47. mlrun/datastore/targets.py +1 -1
  48. mlrun/datastore/utils.py +25 -6
  49. mlrun/datastore/v3io.py +1 -1
  50. mlrun/db/base.py +63 -32
  51. mlrun/db/httpdb.py +373 -153
  52. mlrun/db/nopdb.py +54 -21
  53. mlrun/errors.py +4 -2
  54. mlrun/execution.py +66 -25
  55. mlrun/feature_store/api.py +1 -1
  56. mlrun/feature_store/common.py +1 -1
  57. mlrun/feature_store/feature_vector_utils.py +1 -1
  58. mlrun/feature_store/steps.py +8 -6
  59. mlrun/frameworks/_common/utils.py +3 -3
  60. mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
  61. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
  62. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
  63. mlrun/frameworks/_ml_common/utils.py +2 -1
  64. mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
  65. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
  66. mlrun/frameworks/onnx/dataset.py +2 -1
  67. mlrun/frameworks/onnx/mlrun_interface.py +2 -1
  68. mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
  69. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
  70. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
  71. mlrun/frameworks/pytorch/utils.py +2 -1
  72. mlrun/frameworks/sklearn/metric.py +2 -1
  73. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
  74. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
  75. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
  76. mlrun/hub/__init__.py +52 -0
  77. mlrun/hub/base.py +142 -0
  78. mlrun/hub/module.py +172 -0
  79. mlrun/hub/step.py +113 -0
  80. mlrun/k8s_utils.py +105 -16
  81. mlrun/launcher/base.py +15 -7
  82. mlrun/launcher/local.py +4 -1
  83. mlrun/model.py +14 -4
  84. mlrun/model_monitoring/__init__.py +0 -1
  85. mlrun/model_monitoring/api.py +65 -28
  86. mlrun/model_monitoring/applications/__init__.py +1 -1
  87. mlrun/model_monitoring/applications/base.py +299 -128
  88. mlrun/model_monitoring/applications/context.py +2 -4
  89. mlrun/model_monitoring/controller.py +132 -58
  90. mlrun/model_monitoring/db/_schedules.py +38 -29
  91. mlrun/model_monitoring/db/_stats.py +6 -16
  92. mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
  93. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  94. mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
  95. mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
  96. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
  97. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
  98. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
  99. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
  100. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
  101. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
  102. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
  103. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
  104. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
  105. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
  106. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
  107. mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
  108. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
  109. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
  110. mlrun/model_monitoring/features_drift_table.py +2 -1
  111. mlrun/model_monitoring/helpers.py +30 -6
  112. mlrun/model_monitoring/stream_processing.py +34 -28
  113. mlrun/model_monitoring/writer.py +224 -4
  114. mlrun/package/__init__.py +2 -1
  115. mlrun/platforms/__init__.py +0 -43
  116. mlrun/platforms/iguazio.py +8 -4
  117. mlrun/projects/operations.py +17 -11
  118. mlrun/projects/pipelines.py +2 -2
  119. mlrun/projects/project.py +187 -123
  120. mlrun/run.py +95 -21
  121. mlrun/runtimes/__init__.py +2 -186
  122. mlrun/runtimes/base.py +103 -25
  123. mlrun/runtimes/constants.py +225 -0
  124. mlrun/runtimes/daskjob.py +5 -2
  125. mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
  126. mlrun/runtimes/local.py +5 -2
  127. mlrun/runtimes/mounts.py +20 -2
  128. mlrun/runtimes/nuclio/__init__.py +12 -7
  129. mlrun/runtimes/nuclio/api_gateway.py +36 -6
  130. mlrun/runtimes/nuclio/application/application.py +339 -40
  131. mlrun/runtimes/nuclio/function.py +222 -72
  132. mlrun/runtimes/nuclio/serving.py +132 -42
  133. mlrun/runtimes/pod.py +213 -21
  134. mlrun/runtimes/utils.py +49 -9
  135. mlrun/secrets.py +99 -14
  136. mlrun/serving/__init__.py +2 -0
  137. mlrun/serving/remote.py +84 -11
  138. mlrun/serving/routers.py +26 -44
  139. mlrun/serving/server.py +138 -51
  140. mlrun/serving/serving_wrapper.py +6 -2
  141. mlrun/serving/states.py +997 -283
  142. mlrun/serving/steps.py +62 -0
  143. mlrun/serving/system_steps.py +149 -95
  144. mlrun/serving/v2_serving.py +9 -10
  145. mlrun/track/trackers/mlflow_tracker.py +29 -31
  146. mlrun/utils/helpers.py +292 -94
  147. mlrun/utils/http.py +9 -2
  148. mlrun/utils/notifications/notification/base.py +18 -0
  149. mlrun/utils/notifications/notification/git.py +3 -5
  150. mlrun/utils/notifications/notification/mail.py +39 -16
  151. mlrun/utils/notifications/notification/slack.py +2 -4
  152. mlrun/utils/notifications/notification/webhook.py +2 -5
  153. mlrun/utils/notifications/notification_pusher.py +3 -3
  154. mlrun/utils/version/version.json +2 -2
  155. mlrun/utils/version/version.py +3 -4
  156. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
  157. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
  158. mlrun/api/schemas/__init__.py +0 -259
  159. mlrun/db/auth_utils.py +0 -152
  160. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
  161. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
  162. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
  163. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
  164. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
  165. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
  166. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
  167. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/serving/steps.py ADDED
@@ -0,0 +1,62 @@
1
+ # Copyright 2025 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Union
16
+
17
+ import storey
18
+
19
+ import mlrun.errors
20
+
21
+
22
class ChoiceByField(storey.Choice):
    """
    Route each event to the downstream outlet (or outlets) named by a
    predetermined event field.

    :param field_name: event field name that contains the step name or names of the desired outlet or outlets
    """

    # NOTE(review): the Union[str, list[str]] annotation on field_name looks
    # aspirational — a list-valued field_name would fail the membership test in
    # select_outlets (unhashable key); confirm the intended usage.
    def __init__(self, field_name: Union[str, list[str]], **kwargs):
        self.field_name = field_name
        super().__init__(**kwargs)

    def select_outlets(self, event):
        # Guard 1: the routing field must be present on the event.
        if self.field_name not in event:
            raise mlrun.errors.MLRunRuntimeError(
                f"Field '{self.field_name}' is not contained in the event keys {list(event.keys())}."
            )

        value = event[self.field_name]

        # Guard 2: the field is present but holds None.
        if value is None:
            raise mlrun.errors.MLRunInvalidArgumentError(
                f"Field '{self.field_name}' exists but its value is None."
            )

        # Guard 3: only a string or a list/tuple of strings is routable.
        if not isinstance(value, (str, list, tuple)):
            raise mlrun.errors.MLRunInvalidArgumentTypeError(
                f"Field '{self.field_name}' must be a string or list of strings "
                f"but is instead of type '{type(value).__name__}'."
            )

        # Normalize a single outlet name to a one-element list.
        if isinstance(value, str):
            selected = [value]
        else:
            selected = value

        # Guard 4: an empty list/tuple selects nothing, which is an error.
        if not selected:
            raise mlrun.errors.MLRunRuntimeError(
                f"The value of the key '{self.field_name}' cannot be an empty {type(selected).__name__}."
            )

        return selected
@@ -11,8 +11,9 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
15
14
  import random
15
+ import typing
16
+ from copy import copy
16
17
  from datetime import timedelta
17
18
  from typing import Any, Optional, Union
18
19
 
@@ -22,11 +23,29 @@ import storey
22
23
  import mlrun
23
24
  import mlrun.artifacts
24
25
  import mlrun.common.schemas.model_monitoring as mm_schemas
26
+ import mlrun.feature_store
25
27
  import mlrun.serving
28
+ from mlrun.common.model_monitoring.helpers import (
29
+ get_model_endpoints_creation_task_status,
30
+ )
26
31
  from mlrun.common.schemas import MonitoringData
27
32
  from mlrun.utils import get_data_from_path, logger
28
33
 
29
34
 
35
+ class MatchingEndpointsState(mlrun.common.types.StrEnum):
36
+ all_matched = "all_matched"
37
+ not_all_matched = "not_all_matched"
38
+ no_check_needed = "no_check_needed"
39
+ not_yet_checked = "not_yet_matched"
40
+
41
+ @staticmethod
42
+ def success_states() -> list[str]:
43
+ return [
44
+ MatchingEndpointsState.all_matched,
45
+ MatchingEndpointsState.no_check_needed,
46
+ ]
47
+
48
+
30
49
  class MonitoringPreProcessor(storey.MapClass):
31
50
  """preprocess step, reconstructs the serving output event body to StreamProcessingEvent schema"""
32
51
 
@@ -45,33 +64,24 @@ class MonitoringPreProcessor(storey.MapClass):
45
64
  result_path = model_monitoring_data.get(MonitoringData.RESULT_PATH)
46
65
  input_path = model_monitoring_data.get(MonitoringData.INPUT_PATH)
47
66
 
48
- result = get_data_from_path(result_path, event.body.get(model, event.body))
49
67
  output_schema = model_monitoring_data.get(MonitoringData.OUTPUTS)
50
68
  input_schema = model_monitoring_data.get(MonitoringData.INPUTS)
51
- logger.debug("output schema retrieved", output_schema=output_schema)
52
- if isinstance(result, dict):
53
- # transpose by key the outputs:
54
- outputs = self.transpose_by_key(result, output_schema)
55
- if not output_schema:
56
- logger.warn(
57
- "Output schema was not provided using Project:log_model or by ModelRunnerStep:add_model order "
58
- "may not preserved"
59
- )
60
- else:
61
- outputs = result
62
-
63
- event_inputs = event._metadata.get("inputs", {})
64
- event_inputs = get_data_from_path(input_path, event_inputs)
65
- if isinstance(event_inputs, dict):
66
- # transpose by key the inputs:
67
- inputs = self.transpose_by_key(event_inputs, input_schema)
68
- if not input_schema:
69
- logger.warn(
70
- "Input schema was not provided using by ModelRunnerStep:add_model, order "
71
- "may not preserved"
72
- )
69
+ logger.debug(
70
+ "output and input schema retrieved",
71
+ output_schema=output_schema,
72
+ input_schema=input_schema,
73
+ )
74
+ if event.body and isinstance(event.body, list):
75
+ outputs, new_output_schema = self.get_listed_data(
76
+ event.body, result_path, output_schema
77
+ )
73
78
  else:
74
- inputs = event_inputs
79
+ outputs, new_output_schema = self.get_listed_data(
80
+ event.body.get(model, event.body), result_path, output_schema
81
+ )
82
+ inputs, new_input_schema = self.get_listed_data(
83
+ event._metadata.get("inputs", {}), input_path, input_schema
84
+ )
75
85
 
76
86
  if outputs and isinstance(outputs[0], list):
77
87
  if output_schema and len(output_schema) != len(outputs[0]):
@@ -96,15 +106,43 @@ class MonitoringPreProcessor(storey.MapClass):
96
106
  "outputs and inputs are not in the same length check 'input_path' and "
97
107
  "'output_path' was specified if needed"
98
108
  )
99
- request = {"inputs": inputs, "id": getattr(event, "id", None)}
100
- resp = {"outputs": outputs}
109
+ request = {
110
+ "inputs": inputs,
111
+ "id": getattr(event, "id", None),
112
+ "input_schema": new_input_schema,
113
+ }
114
+ resp = {"outputs": outputs, "output_schema": new_output_schema}
101
115
 
102
116
  return request, resp
103
117
 
118
+ def get_listed_data(
119
+ self,
120
+ raw_data: typing.Union[dict, list],
121
+ data_path: Optional[Union[list[str], str]] = None,
122
+ schema: Optional[list[str]] = None,
123
+ ):
124
+ """Get data from a path and transpose it by keys if dict is provided."""
125
+ new_schema = None
126
+ data_from_path = get_data_from_path(data_path, raw_data)
127
+ if isinstance(data_from_path, dict):
128
+ # transpose by key the inputs:
129
+ listed_data, new_schema = self.transpose_by_key(data_from_path, schema)
130
+ new_schema = new_schema or schema
131
+ if not schema:
132
+ logger.warn(
133
+ f"No schema provided through add_model(); the order of {data_from_path} "
134
+ "may not be preserved."
135
+ )
136
+ elif not isinstance(data_from_path, list):
137
+ listed_data = [data_from_path]
138
+ else:
139
+ listed_data = data_from_path
140
+ return listed_data, new_schema
141
+
104
142
  @staticmethod
105
143
  def transpose_by_key(
106
144
  data: dict, schema: Optional[Union[str, list[str]]] = None
107
- ) -> Union[list[Any], list[list[Any]]]:
145
+ ) -> tuple[Union[list[Any], list[list[Any]]], list[str]]:
108
146
  """
109
147
  Transpose values from a dictionary by keys.
110
148
 
@@ -136,25 +174,32 @@ class MonitoringPreProcessor(storey.MapClass):
136
174
  * If result is a matrix, returns a list of lists.
137
175
 
138
176
  :raises ValueError: If the values include a mix of scalars and lists, or if the list lengths do not match.
177
+ mlrun.MLRunInvalidArgumentError if the schema keys are not contained in the data keys.
139
178
  """
140
-
179
+ new_schema = None
180
+ # Normalize keys in data:
181
+ normalize_data = {
182
+ mlrun.feature_store.api.norm_column_name(k): copy(v)
183
+ for k, v in data.items()
184
+ }
141
185
  # Normalize schema to list
142
186
  if not schema:
143
- keys = list(data.keys())
187
+ keys = list(normalize_data.keys())
188
+ new_schema = keys
144
189
  elif isinstance(schema, str):
145
- keys = [schema]
190
+ keys = [mlrun.feature_store.api.norm_column_name(schema)]
146
191
  else:
147
- keys = schema
192
+ keys = [mlrun.feature_store.api.norm_column_name(key) for key in schema]
148
193
 
149
- values = [data[key] for key in keys if key in data]
194
+ values = [normalize_data[key] for key in keys if key in normalize_data]
150
195
  if len(values) != len(keys):
151
196
  raise mlrun.MLRunInvalidArgumentError(
152
- f"Schema keys {keys} do not match the data keys {list(data.keys())}."
197
+ f"Schema keys {keys} are not contained in the data keys {list(data.keys())}."
153
198
  )
154
199
 
155
200
  # Detect if all are scalars ie: int,float,str
156
- all_scalars = all(not isinstance(v, (list, tuple, np.ndarray)) for v in values)
157
- all_lists = all(isinstance(v, (list, tuple, np.ndarray)) for v in values)
201
+ all_scalars = all(not isinstance(v, list | tuple | np.ndarray) for v in values)
202
+ all_lists = all(isinstance(v, list | tuple | np.ndarray) for v in values)
158
203
 
159
204
  if not (all_scalars or all_lists):
160
205
  raise ValueError(
@@ -168,12 +213,12 @@ class MonitoringPreProcessor(storey.MapClass):
168
213
  mat = np.stack(arrays, axis=0)
169
214
  transposed = mat.T
170
215
  else:
171
- return values[0]
216
+ return values[0], new_schema
172
217
 
173
218
  if transposed.shape[1] == 1 and transposed.shape[0] == 1:
174
219
  # Transform [[0]] -> [0]:
175
- return transposed[:, 0].tolist()
176
- return transposed.tolist()
220
+ return transposed[:, 0].tolist(), new_schema
221
+ return transposed.tolist(), new_schema
177
222
 
178
223
  def do(self, event):
179
224
  monitoring_event_list = []
@@ -202,6 +247,21 @@ class MonitoringPreProcessor(storey.MapClass):
202
247
  when = event._metadata.get(model, {}).get(
203
248
  mm_schemas.StreamProcessingEvent.WHEN
204
249
  )
250
+ # if the body is not a dict, use empty labels, error and metrics
251
+ if isinstance(event.body[model], dict):
252
+ body_by_model = event.body[model]
253
+ labels = body_by_model.get("labels") or {}
254
+ error = body_by_model.get(
255
+ mm_schemas.StreamProcessingEvent.ERROR
256
+ )
257
+ metrics = body_by_model.get(
258
+ mm_schemas.StreamProcessingEvent.METRICS
259
+ )
260
+ else:
261
+ labels = {}
262
+ error = None
263
+ metrics = None
264
+
205
265
  monitoring_event_list.append(
206
266
  {
207
267
  mm_schemas.StreamProcessingEvent.MODEL: model,
@@ -217,26 +277,14 @@ class MonitoringPreProcessor(storey.MapClass):
217
277
  ].get(
218
278
  mlrun.common.schemas.MonitoringData.MODEL_ENDPOINT_UID
219
279
  ),
220
- mm_schemas.StreamProcessingEvent.LABELS: monitoring_data[
221
- model
222
- ].get(mlrun.common.schemas.MonitoringData.OUTPUTS),
280
+ mm_schemas.StreamProcessingEvent.LABELS: labels,
223
281
  mm_schemas.StreamProcessingEvent.FUNCTION_URI: self.server.function_uri
224
282
  if self.server
225
283
  else None,
226
284
  mm_schemas.StreamProcessingEvent.REQUEST: request,
227
285
  mm_schemas.StreamProcessingEvent.RESPONSE: resp,
228
- mm_schemas.StreamProcessingEvent.ERROR: event.body[model][
229
- mm_schemas.StreamProcessingEvent.ERROR
230
- ]
231
- if mm_schemas.StreamProcessingEvent.ERROR
232
- in event.body[model]
233
- else None,
234
- mm_schemas.StreamProcessingEvent.METRICS: event.body[model][
235
- mm_schemas.StreamProcessingEvent.METRICS
236
- ]
237
- if mm_schemas.StreamProcessingEvent.METRICS
238
- in event.body[model]
239
- else None,
286
+ mm_schemas.StreamProcessingEvent.ERROR: error,
287
+ mm_schemas.StreamProcessingEvent.METRICS: metrics,
240
288
  }
241
289
  )
242
290
  elif monitoring_data:
@@ -248,6 +296,15 @@ class MonitoringPreProcessor(storey.MapClass):
248
296
  when = event._original_timestamp
249
297
  else:
250
298
  when = event._metadata.get(mm_schemas.StreamProcessingEvent.WHEN)
299
+ # if the body is not a dict, use empty labels, error and metrics
300
+ if isinstance(event.body, dict):
301
+ labels = event.body.get("labels") or {}
302
+ error = event.body.get(mm_schemas.StreamProcessingEvent.ERROR)
303
+ metrics = event.body.get(mm_schemas.StreamProcessingEvent.METRICS)
304
+ else:
305
+ labels = {}
306
+ error = None
307
+ metrics = None
251
308
  monitoring_event_list.append(
252
309
  {
253
310
  mm_schemas.StreamProcessingEvent.MODEL: model,
@@ -261,24 +318,14 @@ class MonitoringPreProcessor(storey.MapClass):
261
318
  mm_schemas.StreamProcessingEvent.ENDPOINT_ID: monitoring_data[
262
319
  model
263
320
  ].get(mlrun.common.schemas.MonitoringData.MODEL_ENDPOINT_UID),
264
- mm_schemas.StreamProcessingEvent.LABELS: monitoring_data[model].get(
265
- mlrun.common.schemas.MonitoringData.OUTPUTS
266
- ),
321
+ mm_schemas.StreamProcessingEvent.LABELS: labels,
267
322
  mm_schemas.StreamProcessingEvent.FUNCTION_URI: self.server.function_uri
268
323
  if self.server
269
324
  else None,
270
325
  mm_schemas.StreamProcessingEvent.REQUEST: request,
271
326
  mm_schemas.StreamProcessingEvent.RESPONSE: resp,
272
- mm_schemas.StreamProcessingEvent.ERROR: event.body[
273
- mm_schemas.StreamProcessingEvent.ERROR
274
- ]
275
- if mm_schemas.StreamProcessingEvent.ERROR in event.body
276
- else None,
277
- mm_schemas.StreamProcessingEvent.METRICS: event.body[
278
- mm_schemas.StreamProcessingEvent.METRICS
279
- ]
280
- if mm_schemas.StreamProcessingEvent.METRICS in event.body
281
- else None,
327
+ mm_schemas.StreamProcessingEvent.ERROR: error,
328
+ mm_schemas.StreamProcessingEvent.METRICS: metrics,
282
329
  }
283
330
  )
284
331
  event.body = monitoring_event_list
@@ -293,6 +340,9 @@ class BackgroundTaskStatus(storey.MapClass):
293
340
 
294
341
  def __init__(self, **kwargs):
295
342
  super().__init__(**kwargs)
343
+ self.matching_endpoints = MatchingEndpointsState.not_yet_checked
344
+ self.graph_model_endpoint_uids: set = set()
345
+ self.listed_model_endpoint_uids: set = set()
296
346
  self.server: mlrun.serving.GraphServer = (
297
347
  getattr(self.context, "server", None) if self.context else None
298
348
  )
@@ -313,43 +363,47 @@ class BackgroundTaskStatus(storey.MapClass):
313
363
  )
314
364
  )
315
365
  ):
316
- background_task = mlrun.get_run_db().get_project_background_task(
317
- self.server.project, self.server.model_endpoint_creation_task_name
318
- )
319
- self._background_task_check_timestamp = mlrun.utils.now_date()
320
- self._log_background_task_state(background_task.status.state)
321
- self._background_task_state = background_task.status.state
366
+ (
367
+ self._background_task_state,
368
+ self._background_task_check_timestamp,
369
+ self.listed_model_endpoint_uids,
370
+ ) = get_model_endpoints_creation_task_status(self.server)
371
+ if (
372
+ self.listed_model_endpoint_uids
373
+ and self.matching_endpoints == MatchingEndpointsState.not_yet_checked
374
+ ):
375
+ if not self.graph_model_endpoint_uids:
376
+ self.graph_model_endpoint_uids = collect_model_endpoint_uids(
377
+ self.server
378
+ )
379
+
380
+ if self.graph_model_endpoint_uids.issubset(self.listed_model_endpoint_uids):
381
+ self.matching_endpoints = MatchingEndpointsState.all_matched
382
+ elif self.listed_model_endpoint_uids is None:
383
+ self.matching_endpoints = MatchingEndpointsState.no_check_needed
322
384
 
323
385
  if (
324
386
  self._background_task_state
325
387
  == mlrun.common.schemas.BackgroundTaskState.succeeded
388
+ and self.matching_endpoints in MatchingEndpointsState.success_states()
326
389
  ):
327
390
  return event
328
391
  else:
329
392
  return None
330
393
 
331
- def _log_background_task_state(
332
- self, background_task_state: mlrun.common.schemas.BackgroundTaskState
333
- ):
334
- logger.info(
335
- "Checking model endpoint creation task status",
336
- task_name=self.server.model_endpoint_creation_task_name,
337
- )
338
- if (
339
- background_task_state
340
- in mlrun.common.schemas.BackgroundTaskState.terminal_states()
341
- ):
342
- logger.info(
343
- f"Model endpoint creation task completed with state {background_task_state}"
344
- )
345
- else: # in progress
346
- logger.info(
347
- f"Model endpoint creation task is still in progress with the current state: "
348
- f"{background_task_state}. Events will not be monitored for the next "
349
- f"{mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period} seconds",
350
- name=self.name,
351
- background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
352
- )
394
+
395
+ def collect_model_endpoint_uids(server: mlrun.serving.GraphServer) -> set[str]:
396
+ """Collects all model endpoint UIDs from the server's graph steps."""
397
+ model_endpoint_uids = set()
398
+ for step in server.graph.steps.values():
399
+ if hasattr(step, "monitoring_data"):
400
+ for model in step.monitoring_data.keys():
401
+ uid = step.monitoring_data[model].get(
402
+ mlrun.common.schemas.MonitoringData.MODEL_ENDPOINT_UID
403
+ )
404
+ if uid:
405
+ model_endpoint_uids.add(uid)
406
+ return model_endpoint_uids
353
407
 
354
408
 
355
409
  class SamplingStep(storey.MapClass):
@@ -24,6 +24,9 @@ import mlrun.common.schemas.model_monitoring
24
24
  import mlrun.model_monitoring
25
25
  from mlrun.utils import logger, now_date
26
26
 
27
+ from ..common.model_monitoring.helpers import (
28
+ get_model_endpoints_creation_task_status,
29
+ )
27
30
  from .utils import StepToDict, _extract_input_data, _update_result_body
28
31
 
29
32
 
@@ -474,22 +477,18 @@ class V2ModelServer(StepToDict):
474
477
  ) or getattr(self.context, "server", None)
475
478
  if not self.context.is_mock or self.context.monitoring_mock:
476
479
  if server.model_endpoint_creation_task_name:
477
- background_task = mlrun.get_run_db().get_project_background_task(
478
- server.project, server.model_endpoint_creation_task_name
479
- )
480
- logger.debug(
481
- "Checking model endpoint creation task status",
482
- task_name=server.model_endpoint_creation_task_name,
480
+ background_task_state, _, _ = get_model_endpoints_creation_task_status(
481
+ server
483
482
  )
484
483
  if (
485
- background_task.status.state
484
+ background_task_state
486
485
  in mlrun.common.schemas.BackgroundTaskState.terminal_states()
487
486
  ):
488
487
  logger.debug(
489
- f"Model endpoint creation task completed with state {background_task.status.state}"
488
+ f"Model endpoint creation task completed with state {background_task_state}"
490
489
  )
491
490
  if (
492
- background_task.status.state
491
+ background_task_state
493
492
  == mlrun.common.schemas.BackgroundTaskState.succeeded
494
493
  ):
495
494
  self._model_logger = (
@@ -504,7 +503,7 @@ class V2ModelServer(StepToDict):
504
503
  else: # in progress
505
504
  logger.debug(
506
505
  f"Model endpoint creation task is still in progress with the current state: "
507
- f"{background_task.status.state}.",
506
+ f"{background_task_state}.",
508
507
  name=self.name,
509
508
  )
510
509
  else:
@@ -217,7 +217,7 @@ class MLFlowTracker(Tracker):
217
217
  handler=handler,
218
218
  run_name=run.info.run_name,
219
219
  project_name=project.name,
220
- uid=run.info.run_uuid,
220
+ uid=run.info.run_id,
221
221
  )
222
222
 
223
223
  # Create a context from the run object:
@@ -373,7 +373,7 @@ class MLFlowTracker(Tracker):
373
373
  # Import the MLFlow run's artifacts to MLRun (model are logged after the rest of artifacts
374
374
  # so the artifacts can be registered as extra data in the models):
375
375
  artifacts = {}
376
- model_paths = []
376
+ model_uris = []
377
377
  for artifact in client.list_artifacts(run_id=run.info.run_id):
378
378
  # Get the artifact's local path (MLFlow suggests that if the artifact is already in the local filesystem
379
379
  # its local path will be returned:
@@ -381,29 +381,29 @@ class MLFlowTracker(Tracker):
381
381
  run_id=run.info.run_id,
382
382
  artifact_path=artifact.path,
383
383
  )
384
- # Check if the artifact is a model (will be logged after the artifacts):
385
- if artifact.is_dir and os.path.exists(
386
- os.path.join(
387
- artifact_local_path, "MLmodel"
388
- ) # Add tag to show model dir
389
- ):
390
- model_paths.append(artifact_local_path)
391
- else:
392
- # Log the artifact:
393
- artifact = MLFlowTracker._log_artifact(
394
- context=context,
395
- key=pathlib.Path(artifact.path).name.replace(".", "_"),
396
- # Mlflow has the same name for files but with different extensions, so we add extension to name
397
- local_path=artifact_local_path,
398
- tmp_path=tmp_dir,
399
- )
400
- artifacts[artifact.key] = artifact
384
+ # Log the artifact:
385
+ artifact = MLFlowTracker._log_artifact(
386
+ context=context,
387
+ key=pathlib.Path(artifact.path).name.replace(".", "_"),
388
+ # Mlflow has the same name for files but with different extensions, so we add extension to name
389
+ local_path=artifact_local_path,
390
+ tmp_path=tmp_dir,
391
+ )
392
+ artifacts[artifact.key] = artifact
393
+
394
+ # get all run model's uri's (artifact_location in mlflow 3.0.0).
395
+ logged_models = mlflow.search_logged_models(
396
+ filter_string=f"source_run_id = '{run.info.run_id}'",
397
+ output_format="list",
398
+ )
399
+ for logged_model in logged_models:
400
+ model_uris.append(logged_model.artifact_location)
401
401
 
402
- for model_path in model_paths:
402
+ for model_uri in model_uris:
403
403
  MLFlowTracker._log_model(
404
404
  context=context,
405
- model_uri=model_path,
406
- key=pathlib.Path(model_path).stem,
405
+ model_uri=model_uri,
406
+ key=pathlib.Path(model_uri).stem,
407
407
  metrics=results,
408
408
  extra_data=artifacts,
409
409
  tmp_path=tmp_dir,
@@ -439,20 +439,18 @@ class MLFlowTracker(Tracker):
439
439
 
440
440
  # Get the model info from MLFlow:
441
441
  model_info = mlflow.models.get_model_info(model_uri=model_uri)
442
+ # Download the model and set the path to local path:
443
+ local_model_path = mlflow.artifacts.download_artifacts(
444
+ artifact_uri=str(model_uri)
445
+ )
446
+ model_path = pathlib.Path(local_model_path)
442
447
 
443
448
  # Prepare the archive path:
444
- model_uri = pathlib.Path(model_uri)
445
- archive_path = pathlib.Path(tmp_path) / f"{model_uri.stem}.zip"
446
- if not os.path.exists(model_uri):
447
- local_path = mlflow.artifacts.download_artifacts(
448
- artifact_uri=str(model_uri)
449
- )
450
- model_uri = pathlib.Path(local_path)
451
-
449
+ archive_path = pathlib.Path(tmp_path) / f"{model_path.name}.zip"
452
450
  # TODO add progress bar for the case of large files
453
451
  # Zip the artifact:
454
452
  with zipfile.ZipFile(archive_path, "w") as zip_file:
455
- for path in model_uri.rglob("*"):
453
+ for path in model_path.rglob("*"):
456
454
  zip_file.write(filename=path, arcname=path.relative_to(model_uri))
457
455
 
458
456
  # Get inputs and outputs info: