mlrun 1.10.0rc13__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (107) hide show
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/base.py +0 -31
  3. mlrun/artifacts/document.py +6 -1
  4. mlrun/artifacts/llm_prompt.py +123 -25
  5. mlrun/artifacts/manager.py +0 -5
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/common/constants.py +10 -1
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/common/model_monitoring/helpers.py +86 -0
  10. mlrun/common/schemas/__init__.py +3 -0
  11. mlrun/common/schemas/auth.py +2 -0
  12. mlrun/common/schemas/function.py +10 -0
  13. mlrun/common/schemas/hub.py +30 -18
  14. mlrun/common/schemas/model_monitoring/__init__.py +3 -0
  15. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  16. mlrun/common/schemas/model_monitoring/functions.py +14 -5
  17. mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -0
  18. mlrun/common/schemas/pipeline.py +1 -1
  19. mlrun/common/schemas/serving.py +3 -0
  20. mlrun/common/schemas/workflow.py +3 -1
  21. mlrun/common/secrets.py +22 -1
  22. mlrun/config.py +33 -11
  23. mlrun/datastore/__init__.py +11 -3
  24. mlrun/datastore/azure_blob.py +162 -47
  25. mlrun/datastore/datastore.py +9 -4
  26. mlrun/datastore/datastore_profile.py +61 -5
  27. mlrun/datastore/model_provider/huggingface_provider.py +363 -0
  28. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  29. mlrun/datastore/model_provider/model_provider.py +230 -65
  30. mlrun/datastore/model_provider/openai_provider.py +295 -42
  31. mlrun/datastore/s3.py +24 -2
  32. mlrun/datastore/storeytargets.py +2 -3
  33. mlrun/datastore/utils.py +15 -3
  34. mlrun/db/base.py +47 -19
  35. mlrun/db/httpdb.py +120 -56
  36. mlrun/db/nopdb.py +38 -10
  37. mlrun/execution.py +70 -19
  38. mlrun/hub/__init__.py +15 -0
  39. mlrun/hub/module.py +181 -0
  40. mlrun/k8s_utils.py +105 -16
  41. mlrun/launcher/base.py +13 -6
  42. mlrun/launcher/local.py +15 -0
  43. mlrun/model.py +24 -3
  44. mlrun/model_monitoring/__init__.py +1 -0
  45. mlrun/model_monitoring/api.py +66 -27
  46. mlrun/model_monitoring/applications/__init__.py +1 -1
  47. mlrun/model_monitoring/applications/base.py +509 -117
  48. mlrun/model_monitoring/applications/context.py +2 -4
  49. mlrun/model_monitoring/applications/results.py +4 -7
  50. mlrun/model_monitoring/controller.py +239 -101
  51. mlrun/model_monitoring/db/_schedules.py +116 -33
  52. mlrun/model_monitoring/db/_stats.py +4 -3
  53. mlrun/model_monitoring/db/tsdb/base.py +100 -9
  54. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +11 -6
  55. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +191 -50
  56. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  57. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  58. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +259 -40
  59. mlrun/model_monitoring/helpers.py +54 -9
  60. mlrun/model_monitoring/stream_processing.py +45 -14
  61. mlrun/model_monitoring/writer.py +220 -1
  62. mlrun/platforms/__init__.py +3 -2
  63. mlrun/platforms/iguazio.py +7 -3
  64. mlrun/projects/operations.py +6 -1
  65. mlrun/projects/pipelines.py +46 -26
  66. mlrun/projects/project.py +166 -58
  67. mlrun/run.py +94 -17
  68. mlrun/runtimes/__init__.py +18 -0
  69. mlrun/runtimes/base.py +14 -6
  70. mlrun/runtimes/daskjob.py +7 -0
  71. mlrun/runtimes/local.py +5 -2
  72. mlrun/runtimes/mounts.py +20 -2
  73. mlrun/runtimes/mpijob/abstract.py +6 -0
  74. mlrun/runtimes/mpijob/v1.py +6 -0
  75. mlrun/runtimes/nuclio/__init__.py +1 -0
  76. mlrun/runtimes/nuclio/application/application.py +149 -17
  77. mlrun/runtimes/nuclio/function.py +76 -27
  78. mlrun/runtimes/nuclio/serving.py +97 -15
  79. mlrun/runtimes/pod.py +234 -21
  80. mlrun/runtimes/remotesparkjob.py +6 -0
  81. mlrun/runtimes/sparkjob/spark3job.py +6 -0
  82. mlrun/runtimes/utils.py +49 -11
  83. mlrun/secrets.py +54 -13
  84. mlrun/serving/__init__.py +2 -0
  85. mlrun/serving/remote.py +79 -6
  86. mlrun/serving/routers.py +23 -41
  87. mlrun/serving/server.py +320 -80
  88. mlrun/serving/states.py +725 -157
  89. mlrun/serving/steps.py +62 -0
  90. mlrun/serving/system_steps.py +200 -119
  91. mlrun/serving/v2_serving.py +9 -10
  92. mlrun/utils/helpers.py +288 -88
  93. mlrun/utils/logger.py +3 -1
  94. mlrun/utils/notifications/notification/base.py +18 -0
  95. mlrun/utils/notifications/notification/git.py +2 -4
  96. mlrun/utils/notifications/notification/slack.py +2 -4
  97. mlrun/utils/notifications/notification/webhook.py +2 -5
  98. mlrun/utils/notifications/notification_pusher.py +1 -1
  99. mlrun/utils/retryer.py +15 -2
  100. mlrun/utils/version/version.json +2 -2
  101. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +45 -51
  102. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +106 -101
  103. mlrun/api/schemas/__init__.py +0 -259
  104. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
  105. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
  106. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
  107. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
mlrun/serving/steps.py ADDED
@@ -0,0 +1,62 @@
1
+ # Copyright 2025 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Union
16
+
17
+ import storey
18
+
19
+ import mlrun.errors
20
+
21
+
22
class ChoiceByField(storey.Choice):
    """
    Selects downstream outlets to route each event based on a predetermined field.

    The value stored in the event under ``field_name`` must be either a single step name (``str``) or a
    non-empty ``list``/``tuple`` of step names.

    :param field_name: event field name that contains the step name or names of the desired outlet or outlets
    """

    def __init__(self, field_name: Union[str, list[str]], **kwargs):
        # NOTE(review): the annotation admits a list, but select_outlets() uses this value as a single
        # lookup key (``in event`` / ``event[...]``) - confirm whether ``str`` is the intended type.
        self.field_name = field_name
        super().__init__(**kwargs)

    def select_outlets(self, event):
        """
        Return the outlet name(s) found in the event's ``field_name`` entry.

        :param event: mapping-like event content; it must support ``in``, ``keys()`` and item access.

        :return: a one-element list when the field holds a string, otherwise the list/tuple stored in the field.

        :raises mlrun.errors.MLRunRuntimeError:             if the field is missing or holds an empty list/tuple.
        :raises mlrun.errors.MLRunInvalidArgumentError:     if the field value is None.
        :raises mlrun.errors.MLRunInvalidArgumentTypeError: if the field value is not a string, list or tuple,
                                                            or if a list/tuple item is not a string.
        """
        # Case 1: Missing field
        if self.field_name not in event:
            raise mlrun.errors.MLRunRuntimeError(
                f"Field '{self.field_name}' is not contained in the event keys {list(event.keys())}."
            )

        outlet = event[self.field_name]

        # Case 2: Field exists but is None
        if outlet is None:
            raise mlrun.errors.MLRunInvalidArgumentError(
                f"Field '{self.field_name}' exists but its value is None."
            )

        # Case 3: Invalid type
        if not isinstance(outlet, (str, list, tuple)):
            raise mlrun.errors.MLRunInvalidArgumentTypeError(
                f"Field '{self.field_name}' must be a string or list of strings "
                f"but is instead of type '{type(outlet).__name__}'."
            )

        outlets = [outlet] if isinstance(outlet, str) else outlet

        # Case 4: Empty list or tuple
        if not outlets:
            raise mlrun.errors.MLRunRuntimeError(
                f"The value of the key '{self.field_name}' cannot be an empty {type(outlets).__name__}."
            )

        # Case 5: A list/tuple item is not a string - enforce the "list of strings" contract promised in
        # Case 3 here, rather than passing an invalid outlet name on to the caller.
        invalid_types = sorted(
            {type(item).__name__ for item in outlets if not isinstance(item, str)}
        )
        if invalid_types:
            raise mlrun.errors.MLRunInvalidArgumentTypeError(
                f"Field '{self.field_name}' must be a string or list of strings "
                f"but contains items of type {invalid_types}."
            )

        return outlets
@@ -11,20 +11,38 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
15
14
  import random
16
- from copy import deepcopy
15
+ from copy import copy
17
16
  from datetime import timedelta
18
17
  from typing import Any, Optional, Union
19
18
 
19
+ import numpy as np
20
20
  import storey
21
21
 
22
22
  import mlrun
23
23
  import mlrun.artifacts
24
24
  import mlrun.common.schemas.model_monitoring as mm_schemas
25
+ import mlrun.feature_store
25
26
  import mlrun.serving
27
+ from mlrun.common.model_monitoring.helpers import (
28
+ get_model_endpoints_creation_task_status,
29
+ )
26
30
  from mlrun.common.schemas import MonitoringData
27
- from mlrun.utils import logger
31
+ from mlrun.utils import get_data_from_path, logger
32
+
33
+
34
class MatchingEndpointsState(mlrun.common.types.StrEnum):
    """Outcome of matching the serving graph's model endpoint UIDs against the listed model endpoint UIDs."""

    all_matched = "all_matched"
    not_all_matched = "not_all_matched"
    no_check_needed = "no_check_needed"
    # NOTE(review): the value ("not_yet_matched") differs from the member name - confirm this is intended.
    not_yet_checked = "not_yet_matched"

    @staticmethod
    def success_states() -> list[str]:
        """Return the states under which the endpoint matching is considered successful."""
        succeeded = (
            MatchingEndpointsState.all_matched,
            MatchingEndpointsState.no_check_needed,
        )
        return list(succeeded)
28
46
 
29
47
 
30
48
  class MonitoringPreProcessor(storey.MapClass):
@@ -45,46 +63,20 @@ class MonitoringPreProcessor(storey.MapClass):
45
63
  result_path = model_monitoring_data.get(MonitoringData.RESULT_PATH)
46
64
  input_path = model_monitoring_data.get(MonitoringData.INPUT_PATH)
47
65
 
48
- result = self._get_data_from_path(
49
- result_path, event.body.get(model, event.body)
50
- )
51
66
  output_schema = model_monitoring_data.get(MonitoringData.OUTPUTS)
52
67
  input_schema = model_monitoring_data.get(MonitoringData.INPUTS)
53
- logger.debug("output schema retrieved", output_schema=output_schema)
54
- if isinstance(result, dict):
55
- if len(result) > 1:
56
- # transpose by key the outputs:
57
- outputs = self.transpose_by_key(result, output_schema)
58
- elif len(result) == 1:
59
- outputs = (
60
- result[output_schema[0]]
61
- if output_schema
62
- else list(result.values())[0]
63
- )
64
- else:
65
- outputs = []
66
- if not output_schema:
67
- logger.warn(
68
- "Output schema was not provided using Project:log_model or by ModelRunnerStep:add_model order "
69
- "may not preserved"
70
- )
71
- else:
72
- outputs = result
73
-
74
- event_inputs = event._metadata.get("inputs", {})
75
- event_inputs = self._get_data_from_path(input_path, event_inputs)
76
- if isinstance(event_inputs, dict):
77
- if len(event_inputs) > 1:
78
- # transpose by key the inputs:
79
- inputs = self.transpose_by_key(event_inputs, input_schema)
80
- else:
81
- inputs = (
82
- event_inputs[input_schema[0]]
83
- if input_schema
84
- else list(result.values())[0]
85
- )
86
- else:
87
- inputs = event_inputs
68
+ logger.debug(
69
+ "output and input schema retrieved",
70
+ output_schema=output_schema,
71
+ input_schema=input_schema,
72
+ )
73
+
74
+ outputs, new_output_schema = self.get_listed_data(
75
+ event.body.get(model, event.body), result_path, output_schema
76
+ )
77
+ inputs, new_input_schema = self.get_listed_data(
78
+ event._metadata.get("inputs", {}), input_path, input_schema
79
+ )
88
80
 
89
81
  if outputs and isinstance(outputs[0], list):
90
82
  if output_schema and len(output_schema) != len(outputs[0]):
@@ -104,48 +96,124 @@ class MonitoringPreProcessor(storey.MapClass):
104
96
  output_len=len(outputs),
105
97
  schema_len=len(output_schema),
106
98
  )
107
- request = {"inputs": inputs, "id": getattr(event, "id", None)}
108
- resp = {"outputs": outputs}
99
+ if len(inputs) != len(outputs):
100
+ logger.warn(
101
+ "outputs and inputs are not in the same length check 'input_path' and "
102
+ "'output_path' was specified if needed"
103
+ )
104
+ request = {
105
+ "inputs": inputs,
106
+ "id": getattr(event, "id", None),
107
+ "input_schema": new_input_schema,
108
+ }
109
+ resp = {"outputs": outputs, "output_schema": new_output_schema}
109
110
 
110
111
  return request, resp
111
112
 
113
def get_listed_data(
    self,
    raw_data: dict,
    data_path: Optional[Union[list[str], str]] = None,
    schema: Optional[list[str]] = None,
):
    """
    Extract the data found under ``data_path`` in ``raw_data`` and return it in list form.

    :param raw_data:  the data to extract from.
    :param data_path: key or list of keys passed to ``get_data_from_path`` to locate the data.
    :param schema:    optional list of keys used to select and order the values when the extracted data is a dict.

    :return: a ``(listed_data, new_schema)`` tuple. For a dict, ``listed_data`` is the result of
             ``transpose_by_key`` and ``new_schema`` is the schema it derived, falling back to ``schema``.
             For a list the data is returned as is, and any other value is wrapped in a one-element list;
             in both of these cases ``new_schema`` is None.
    """
    extracted = get_data_from_path(data_path, raw_data)

    # Lists are already in the expected form.
    if isinstance(extracted, list):
        return extracted, None

    # Any other non-dict value becomes a single-element list.
    if not isinstance(extracted, dict):
        return [extracted], None

    # Dict: transpose the values by key.
    rows, derived_schema = self.transpose_by_key(extracted, schema)
    if not schema:
        logger.warn(
            f"No schema provided through add_model(); the order of {extracted} "
            "may not be preserved."
        )
    return rows, derived_schema or schema
136
+
112
137
  @staticmethod
113
138
  def transpose_by_key(
114
- data_to_transpose, schema: Optional[list[str]] = None
115
- ) -> list[list[float]]:
116
- values = (
117
- list(data_to_transpose.values())
118
- if not schema
119
- else [data_to_transpose[key] for key in schema]
120
- )
121
- if values and not isinstance(values[0], list):
122
- values = [values]
123
- transposed = (
124
- list(map(list, zip(*values)))
125
- if all(isinstance(v, list) for v in values) and len(values) > 1
126
- else values
127
- )
128
- return transposed
139
+ data: dict, schema: Optional[Union[str, list[str]]] = None
140
+ ) -> tuple[Union[list[Any], list[list[Any]]], list[str]]:
141
+ """
142
+ Transpose values from a dictionary by keys.
129
143
 
130
- @staticmethod
131
- def _get_data_from_path(
132
- path: Union[str, list[str], None], data: dict
133
- ) -> dict[str, Any]:
134
- if isinstance(path, str):
135
- output_data = data.get(path)
136
- elif isinstance(path, list):
137
- output_data = deepcopy(data)
138
- for key in path:
139
- output_data = output_data.get(key, {})
140
- elif path is None:
141
- output_data = data
144
+ Given a dictionary and an optional schema (a key or list of keys), this function:
145
+ - Extracts the values for the specified keys (or all keys if no schema is provided).
146
+ - Ensures the data is represented as a list of rows, then transposes it (i.e., switches rows to columns).
147
+ - Handles edge cases:
148
+ * If a single scalar or single-element list is provided, returns a flat list.
149
+ * If a single key is provided (as a string or a list with one element), handles it properly.
150
+ * If only one row with len of one remains after transposition, unwraps it to avoid nested list-of-one.
151
+
152
+ Example::
153
+
154
+ transpose_by_key({"a": 1})
155
+ # returns: [1]
156
+
157
+ transpose_by_key({"a": [1, 2]})
158
+ # returns: [1 ,2]
159
+
160
+ transpose_by_key({"a": [1, 2], "b": [3, 4]})
161
+ # returns: [[1, 3], [2, 4]]
162
+
163
+ :param data: Dictionary with values that are either scalars or lists.
164
+ :param schema: Optional key or list of keys to extract. If not provided, all keys are used.
165
+ Can be a string (single key) or a list of strings.
166
+
167
+ :return: Transposed values:
168
+ * If result is a single column or row, returns a flat list.
169
+ * If result is a matrix, returns a list of lists.
170
+
171
+ :raises ValueError: If the values include a mix of scalars and lists, or if the list lengths do not match.
172
+ mlrun.MLRunInvalidArgumentError if the schema keys are not contained in the data keys.
173
+ """
174
+ new_schema = None
175
+ # Normalize keys in data:
176
+ normalize_data = {
177
+ mlrun.feature_store.api.norm_column_name(k): copy(v)
178
+ for k, v in data.items()
179
+ }
180
+ # Normalize schema to list
181
+ if not schema:
182
+ keys = list(normalize_data.keys())
183
+ new_schema = keys
184
+ elif isinstance(schema, str):
185
+ keys = [mlrun.feature_store.api.norm_column_name(schema)]
142
186
  else:
143
- raise mlrun.errors.MLRunInvalidArgumentError(
144
- "Expected path be of type str or list of str or None"
187
+ keys = [mlrun.feature_store.api.norm_column_name(key) for key in schema]
188
+
189
+ values = [normalize_data[key] for key in keys if key in normalize_data]
190
+ if len(values) != len(keys):
191
+ raise mlrun.MLRunInvalidArgumentError(
192
+ f"Schema keys {keys} are not contained in the data keys {list(data.keys())}."
145
193
  )
146
- if isinstance(output_data, (int, float)):
147
- output_data = [output_data]
148
- return output_data
194
+
195
+ # Detect if all are scalars ie: int,float,str
196
+ all_scalars = all(not isinstance(v, (list, tuple, np.ndarray)) for v in values)
197
+ all_lists = all(isinstance(v, (list, tuple, np.ndarray)) for v in values)
198
+
199
+ if not (all_scalars or all_lists):
200
+ raise ValueError(
201
+ "All values must be either scalars or lists of equal length."
202
+ )
203
+
204
+ if all_scalars:
205
+ transposed = np.array([values], dtype=object)
206
+ elif all_lists and len(keys) > 1:
207
+ arrays = [np.array(v, dtype=object) for v in values]
208
+ mat = np.stack(arrays, axis=0)
209
+ transposed = mat.T
210
+ else:
211
+ return values[0], new_schema
212
+
213
+ if transposed.shape[1] == 1 and transposed.shape[0] == 1:
214
+ # Transform [[0]] -> [0]:
215
+ return transposed[:, 0].tolist(), new_schema
216
+ return transposed.tolist(), new_schema
149
217
 
150
218
  def do(self, event):
151
219
  monitoring_event_list = []
@@ -168,6 +236,12 @@ class MonitoringPreProcessor(storey.MapClass):
168
236
  request, resp = self.reconstruct_request_resp_fields(
169
237
  event, model, monitoring_data[model]
170
238
  )
239
+ if hasattr(event, "_original_timestamp"):
240
+ when = event._original_timestamp
241
+ else:
242
+ when = event._metadata.get(model, {}).get(
243
+ mm_schemas.StreamProcessingEvent.WHEN
244
+ )
171
245
  monitoring_event_list.append(
172
246
  {
173
247
  mm_schemas.StreamProcessingEvent.MODEL: model,
@@ -177,17 +251,16 @@ class MonitoringPreProcessor(storey.MapClass):
177
251
  mm_schemas.StreamProcessingEvent.MICROSEC: event._metadata.get(
178
252
  model, {}
179
253
  ).get(mm_schemas.StreamProcessingEvent.MICROSEC),
180
- mm_schemas.StreamProcessingEvent.WHEN: event._metadata.get(
181
- model, {}
182
- ).get(mm_schemas.StreamProcessingEvent.WHEN),
254
+ mm_schemas.StreamProcessingEvent.WHEN: when,
183
255
  mm_schemas.StreamProcessingEvent.ENDPOINT_ID: monitoring_data[
184
256
  model
185
257
  ].get(
186
258
  mlrun.common.schemas.MonitoringData.MODEL_ENDPOINT_UID
187
259
  ),
188
- mm_schemas.StreamProcessingEvent.LABELS: monitoring_data[
260
+ mm_schemas.StreamProcessingEvent.LABELS: event.body[
189
261
  model
190
- ].get(mlrun.common.schemas.MonitoringData.OUTPUTS),
262
+ ].get("labels")
263
+ or {},
191
264
  mm_schemas.StreamProcessingEvent.FUNCTION_URI: self.server.function_uri
192
265
  if self.server
193
266
  else None,
@@ -212,6 +285,10 @@ class MonitoringPreProcessor(storey.MapClass):
212
285
  request, resp = self.reconstruct_request_resp_fields(
213
286
  event, model, monitoring_data[model]
214
287
  )
288
+ if hasattr(event, "_original_timestamp"):
289
+ when = event._original_timestamp
290
+ else:
291
+ when = event._metadata.get(mm_schemas.StreamProcessingEvent.WHEN)
215
292
  monitoring_event_list.append(
216
293
  {
217
294
  mm_schemas.StreamProcessingEvent.MODEL: model,
@@ -221,25 +298,20 @@ class MonitoringPreProcessor(storey.MapClass):
221
298
  mm_schemas.StreamProcessingEvent.MICROSEC: event._metadata.get(
222
299
  mm_schemas.StreamProcessingEvent.MICROSEC
223
300
  ),
224
- mm_schemas.StreamProcessingEvent.WHEN: event._metadata.get(
225
- mm_schemas.StreamProcessingEvent.WHEN
226
- ),
301
+ mm_schemas.StreamProcessingEvent.WHEN: when,
227
302
  mm_schemas.StreamProcessingEvent.ENDPOINT_ID: monitoring_data[
228
303
  model
229
304
  ].get(mlrun.common.schemas.MonitoringData.MODEL_ENDPOINT_UID),
230
- mm_schemas.StreamProcessingEvent.LABELS: monitoring_data[model].get(
231
- mlrun.common.schemas.MonitoringData.OUTPUTS
232
- ),
305
+ mm_schemas.StreamProcessingEvent.LABELS: event.body.get("labels")
306
+ or {},
233
307
  mm_schemas.StreamProcessingEvent.FUNCTION_URI: self.server.function_uri
234
308
  if self.server
235
309
  else None,
236
310
  mm_schemas.StreamProcessingEvent.REQUEST: request,
237
311
  mm_schemas.StreamProcessingEvent.RESPONSE: resp,
238
- mm_schemas.StreamProcessingEvent.ERROR: event.body[
312
+ mm_schemas.StreamProcessingEvent.ERROR: event.body.get(
239
313
  mm_schemas.StreamProcessingEvent.ERROR
240
- ]
241
- if mm_schemas.StreamProcessingEvent.ERROR in event.body
242
- else None,
314
+ ),
243
315
  mm_schemas.StreamProcessingEvent.METRICS: event.body[
244
316
  mm_schemas.StreamProcessingEvent.METRICS
245
317
  ]
@@ -259,6 +331,9 @@ class BackgroundTaskStatus(storey.MapClass):
259
331
 
260
332
  def __init__(self, **kwargs):
261
333
  super().__init__(**kwargs)
334
+ self.matching_endpoints = MatchingEndpointsState.not_yet_checked
335
+ self.graph_model_endpoint_uids: set = set()
336
+ self.listed_model_endpoint_uids: set = set()
262
337
  self.server: mlrun.serving.GraphServer = (
263
338
  getattr(self.context, "server", None) if self.context else None
264
339
  )
@@ -279,43 +354,47 @@ class BackgroundTaskStatus(storey.MapClass):
279
354
  )
280
355
  )
281
356
  ):
282
- background_task = mlrun.get_run_db().get_project_background_task(
283
- self.server.project, self.server.model_endpoint_creation_task_name
284
- )
285
- self._background_task_check_timestamp = mlrun.utils.now_date()
286
- self._log_background_task_state(background_task.status.state)
287
- self._background_task_state = background_task.status.state
357
+ (
358
+ self._background_task_state,
359
+ self._background_task_check_timestamp,
360
+ self.listed_model_endpoint_uids,
361
+ ) = get_model_endpoints_creation_task_status(self.server)
362
+ if (
363
+ self.listed_model_endpoint_uids
364
+ and self.matching_endpoints == MatchingEndpointsState.not_yet_checked
365
+ ):
366
+ if not self.graph_model_endpoint_uids:
367
+ self.graph_model_endpoint_uids = collect_model_endpoint_uids(
368
+ self.server
369
+ )
370
+
371
+ if self.graph_model_endpoint_uids.issubset(self.listed_model_endpoint_uids):
372
+ self.matching_endpoints = MatchingEndpointsState.all_matched
373
+ elif self.listed_model_endpoint_uids is None:
374
+ self.matching_endpoints = MatchingEndpointsState.no_check_needed
288
375
 
289
376
  if (
290
377
  self._background_task_state
291
378
  == mlrun.common.schemas.BackgroundTaskState.succeeded
379
+ and self.matching_endpoints in MatchingEndpointsState.success_states()
292
380
  ):
293
381
  return event
294
382
  else:
295
383
  return None
296
384
 
297
- def _log_background_task_state(
298
- self, background_task_state: mlrun.common.schemas.BackgroundTaskState
299
- ):
300
- logger.info(
301
- "Checking model endpoint creation task status",
302
- task_name=self.server.model_endpoint_creation_task_name,
303
- )
304
- if (
305
- background_task_state
306
- in mlrun.common.schemas.BackgroundTaskState.terminal_states()
307
- ):
308
- logger.info(
309
- f"Model endpoint creation task completed with state {background_task_state}"
310
- )
311
- else: # in progress
312
- logger.info(
313
- f"Model endpoint creation task is still in progress with the current state: "
314
- f"{background_task_state}. Events will not be monitored for the next "
315
- f"{mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period} seconds",
316
- name=self.name,
317
- background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
318
- )
385
+
386
def collect_model_endpoint_uids(server: mlrun.serving.GraphServer) -> set[str]:
    """
    Gather the model endpoint UIDs declared by the steps of the server's graph.

    Only steps that have a ``monitoring_data`` attribute are inspected, and empty or missing UIDs are skipped.

    :param server: the graph server whose ``graph.steps`` are scanned.

    :return: the set of model endpoint UIDs that were found.
    """
    uid_key = mlrun.common.schemas.MonitoringData.MODEL_ENDPOINT_UID
    return {
        uid
        for step in server.graph.steps.values()
        if hasattr(step, "monitoring_data")
        for model_data in step.monitoring_data.values()
        if (uid := model_data.get(uid_key))
    }
319
398
 
320
399
 
321
400
  class SamplingStep(storey.MapClass):
@@ -337,7 +416,9 @@ class SamplingStep(storey.MapClass):
337
416
  event=event,
338
417
  sampling_percentage=self.sampling_percentage,
339
418
  )
340
- if self.sampling_percentage != 100:
419
+ if self.sampling_percentage != 100 and not event.get(
420
+ mm_schemas.StreamProcessingEvent.ERROR
421
+ ):
341
422
  request = event[mm_schemas.StreamProcessingEvent.REQUEST]
342
423
  num_of_inputs = len(request["inputs"])
343
424
  sampled_requests_indices = self._pick_random_requests(
@@ -24,6 +24,9 @@ import mlrun.common.schemas.model_monitoring
24
24
  import mlrun.model_monitoring
25
25
  from mlrun.utils import logger, now_date
26
26
 
27
+ from ..common.model_monitoring.helpers import (
28
+ get_model_endpoints_creation_task_status,
29
+ )
27
30
  from .utils import StepToDict, _extract_input_data, _update_result_body
28
31
 
29
32
 
@@ -474,22 +477,18 @@ class V2ModelServer(StepToDict):
474
477
  ) or getattr(self.context, "server", None)
475
478
  if not self.context.is_mock or self.context.monitoring_mock:
476
479
  if server.model_endpoint_creation_task_name:
477
- background_task = mlrun.get_run_db().get_project_background_task(
478
- server.project, server.model_endpoint_creation_task_name
479
- )
480
- logger.debug(
481
- "Checking model endpoint creation task status",
482
- task_name=server.model_endpoint_creation_task_name,
480
+ background_task_state, _, _ = get_model_endpoints_creation_task_status(
481
+ server
483
482
  )
484
483
  if (
485
- background_task.status.state
484
+ background_task_state
486
485
  in mlrun.common.schemas.BackgroundTaskState.terminal_states()
487
486
  ):
488
487
  logger.debug(
489
- f"Model endpoint creation task completed with state {background_task.status.state}"
488
+ f"Model endpoint creation task completed with state {background_task_state}"
490
489
  )
491
490
  if (
492
- background_task.status.state
491
+ background_task_state
493
492
  == mlrun.common.schemas.BackgroundTaskState.succeeded
494
493
  ):
495
494
  self._model_logger = (
@@ -504,7 +503,7 @@ class V2ModelServer(StepToDict):
504
503
  else: # in progress
505
504
  logger.debug(
506
505
  f"Model endpoint creation task is still in progress with the current state: "
507
- f"{background_task.status.state}.",
506
+ f"{background_task_state}.",
508
507
  name=self.name,
509
508
  )
510
509
  else: