mlrun 1.8.0rc19__py3-none-any.whl → 1.8.0rc26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (52)
  1. mlrun/__init__.py +37 -3
  2. mlrun/__main__.py +5 -0
  3. mlrun/alerts/alert.py +1 -0
  4. mlrun/artifacts/document.py +78 -36
  5. mlrun/common/formatters/feature_set.py +1 -0
  6. mlrun/common/runtimes/constants.py +17 -0
  7. mlrun/common/schemas/alert.py +3 -0
  8. mlrun/common/schemas/client_spec.py +0 -1
  9. mlrun/common/schemas/model_monitoring/constants.py +32 -9
  10. mlrun/common/schemas/model_monitoring/model_endpoints.py +2 -0
  11. mlrun/common/schemas/workflow.py +1 -0
  12. mlrun/config.py +39 -6
  13. mlrun/datastore/datastore_profile.py +58 -16
  14. mlrun/datastore/sources.py +7 -1
  15. mlrun/datastore/vectorstore.py +20 -1
  16. mlrun/db/base.py +20 -0
  17. mlrun/db/httpdb.py +97 -10
  18. mlrun/db/nopdb.py +19 -0
  19. mlrun/errors.py +4 -0
  20. mlrun/execution.py +15 -6
  21. mlrun/frameworks/_common/model_handler.py +0 -2
  22. mlrun/launcher/client.py +2 -2
  23. mlrun/launcher/local.py +5 -1
  24. mlrun/model_monitoring/applications/_application_steps.py +3 -1
  25. mlrun/model_monitoring/controller.py +266 -103
  26. mlrun/model_monitoring/db/tsdb/__init__.py +11 -23
  27. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +2 -0
  28. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +20 -21
  29. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +35 -34
  30. mlrun/model_monitoring/helpers.py +16 -10
  31. mlrun/model_monitoring/stream_processing.py +106 -35
  32. mlrun/package/context_handler.py +1 -1
  33. mlrun/package/packagers_manager.py +4 -18
  34. mlrun/projects/pipelines.py +18 -5
  35. mlrun/projects/project.py +156 -39
  36. mlrun/runtimes/nuclio/serving.py +22 -13
  37. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  38. mlrun/secrets.py +1 -1
  39. mlrun/serving/server.py +11 -3
  40. mlrun/serving/states.py +65 -8
  41. mlrun/serving/v2_serving.py +67 -44
  42. mlrun/utils/helpers.py +111 -23
  43. mlrun/utils/notifications/notification/base.py +6 -1
  44. mlrun/utils/notifications/notification/slack.py +5 -1
  45. mlrun/utils/notifications/notification_pusher.py +67 -36
  46. mlrun/utils/version/version.json +2 -2
  47. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/METADATA +33 -16
  48. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/RECORD +52 -52
  49. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/WHEEL +1 -1
  50. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/LICENSE +0 -0
  51. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/entry_points.txt +0 -0
  52. {mlrun-1.8.0rc19.dist-info → mlrun-1.8.0rc26.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py

@@ -33,7 +33,7 @@ _TSDB_BE = "tsdb"
 _TSDB_RATE = "1/s"
 _CONTAINER = "users"
 
-V3IO_MEPS_LIMIT = 50  # TODO remove limitation after fixing ML-8886
+V3IO_MEPS_LIMIT = 200
 
 
 def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
@@ -135,7 +135,7 @@ class V3IOTSDBConnector(TSDBConnector):
         monitoring_predictions_full_path = (
             mlrun.mlconf.get_model_monitoring_file_target_path(
                 project=self.project,
-                kind=mm_schemas.FileTargetKind.PREDICTIONS,
+                kind=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             )
         )
         (
@@ -145,7 +145,7 @@ class V3IOTSDBConnector(TSDBConnector):
         ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
             monitoring_predictions_full_path
         )
-        self.tables[mm_schemas.FileTargetKind.PREDICTIONS] = monitoring_predictions_path
+        self.tables[mm_schemas.V3IOTSDBTables.PREDICTIONS] = monitoring_predictions_path
 
     def create_tables(self) -> None:
         """
@@ -204,7 +204,7 @@ class V3IOTSDBConnector(TSDBConnector):
                 }
             ],
             name=EventFieldType.LATENCY,
-            after="MapFeatureNames",
+            after="FilterNOP",
             step_name="Aggregates",
             table=".",
             key_field=EventFieldType.ENDPOINT_ID,
@@ -225,8 +225,8 @@ class V3IOTSDBConnector(TSDBConnector):
         graph.add_step(
             "storey.TSDBTarget",
             name="tsdb_predictions",
-            after="MapFeatureNames",
-            path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.PREDICTIONS]}",
+            after="FilterNOP",
+            path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.PREDICTIONS]}",
             rate="1/s",
             time_col=mm_schemas.EventFieldType.TIMESTAMP,
             container=self.container,
@@ -234,6 +234,8 @@ class V3IOTSDBConnector(TSDBConnector):
             columns=[
                 mm_schemas.EventFieldType.LATENCY,
                 mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+                mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
             ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -580,14 +582,18 @@ class V3IOTSDBConnector(TSDBConnector):
         )
 
     @staticmethod
-    def _get_endpoint_filter(endpoint_id: Union[str, list[str]]):
+    def _get_endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
         if isinstance(endpoint_id, str):
             return f"endpoint_id=='{endpoint_id}'"
         elif isinstance(endpoint_id, list):
             if len(endpoint_id) > V3IO_MEPS_LIMIT:
-                raise mlrun.errors.MLRunInvalidArgumentError(
-                    f"Filtering more than {V3IO_MEPS_LIMIT} model endpoints in the V3IO connector is not supported."
+                logger.info(
+                    "The number of endpoint ids exceeds the v3io-engine filter-expression limit, "
+                    "retrieving all the model endpoints from the db.",
+                    limit=V3IO_MEPS_LIMIT,
+                    amount=len(endpoint_id),
                 )
+                return None
             return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(
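To make the new fallback concrete, here is a hedged, standalone sketch of the filter expressions `_get_endpoint_filter` produces (illustrative only, not the connector's actual module):

    V3IO_MEPS_LIMIT = 200

    def endpoint_filter(endpoint_id):
        if isinstance(endpoint_id, str):
            return f"endpoint_id=='{endpoint_id}'"
        if len(endpoint_id) > V3IO_MEPS_LIMIT:
            # Past the v3io-engine filter-expression limit: no filter is built,
            # and all model endpoints are fetched from the db instead.
            return None
        return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "

    print(endpoint_filter("ep-1"))                           # endpoint_id=='ep-1'
    print(endpoint_filter(["ep-1", "ep-2"]))                 # endpoint_id IN('ep-1', 'ep-2')
    print(endpoint_filter([f"ep-{i}" for i in range(201)]))  # None

So instead of raising past the limit (the old behavior), the connector now degrades to an unfiltered query and logs the overflow.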
@@ -734,10 +740,10 @@ class V3IOTSDBConnector(TSDBConnector):
                 "both or neither of `aggregation_window` and `agg_funcs` must be provided"
             )
         df = self._get_records(
-            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
-            columns=[mm_schemas.EventFieldType.LATENCY],
+            columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
             filter_query=f"endpoint_id=='{endpoint_id}'",
             agg_funcs=agg_funcs,
             sliding_window_step=aggregation_window,
@@ -751,10 +757,10 @@ class V3IOTSDBConnector(TSDBConnector):
                 type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
             )
 
-        latency_column = (
-            f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
+        estimated_prediction_count = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
             if agg_funcs
-            else mm_schemas.EventFieldType.LATENCY
+            else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
         )
 
         return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -762,7 +768,7 @@ class V3IOTSDBConnector(TSDBConnector):
             values=list(
                 zip(
                     df.index,
-                    df[latency_column],
+                    df[estimated_prediction_count],
                 )
            ),  # pyright: ignore[reportArgumentType]
        )
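For clarity, the `values=list(zip(df.index, ...))` pairing above yields (timestamp, value) tuples; a toy pandas example (the aggregated column name is assumed for illustration):

    import pandas as pd

    df = pd.DataFrame(
        {"count(estimated_prediction_count)": [10.0, 25.0]},
        index=pd.to_datetime(["2025-01-01 00:00", "2025-01-01 00:10"]),
    )
    values = list(zip(df.index, df["count(estimated_prediction_count)"]))
    print(values[0])  # (Timestamp('2025-01-01 00:00:00'), 10.0)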
@@ -773,15 +779,13 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start, end = self._get_start_end(start, end)
         df = self._get_records(
-            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             agg_funcs=["last"],
         )
         if not df.empty:
@@ -808,9 +812,7 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
@@ -818,7 +820,7 @@ class V3IOTSDBConnector(TSDBConnector):
             start=start,
             end=end,
             columns=[mm_schemas.ResultData.RESULT_STATUS],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             agg_funcs=["max"],
             group_by="endpoint_id",
         )
@@ -883,17 +885,18 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        if filter_query:
+            filter_query += f" AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
+        else:
+            filter_query = f"{mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=mm_schemas.FileTargetKind.ERRORS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.ERROR_COUNT],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]}) "
-            f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'",
+            filter_query=filter_query,
             agg_funcs=["count"],
         )
         if not df.empty:
@@ -912,17 +915,15 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
-            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.LATENCY],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             agg_funcs=["avg"],
         )
         if not df.empty:
mlrun/model_monitoring/helpers.py

@@ -109,7 +109,7 @@ def filter_results_by_regex(
             result_name_filters=validated_filters,
         ):
             filtered_metrics_names.append(existing_result_name)
-    return filtered_metrics_names
+    return list(set(filtered_metrics_names))
 
 
 def get_stream_path(
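Note that the deduplication added here goes through `set`, which does not preserve insertion order; a quick plain-Python illustration:

    names = ["drift_result", "latency", "drift_result"]
    print(list(set(names)))            # duplicates removed, order arbitrary
    print(list(dict.fromkeys(names)))  # ['drift_result', 'latency'] -- an order-preserving alternative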
@@ -117,6 +117,7 @@ def get_stream_path(
     function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
     stream_uri: Optional[str] = None,
     secret_provider: Optional[Callable[[str], str]] = None,
+    profile: Optional[mlrun.datastore.datastore_profile.DatastoreProfile] = None,
 ) -> str:
     """
     Get stream path from the project secret. If wasn't set, take it from the system configurations
@@ -126,20 +127,25 @@ def get_stream_path(
     :param stream_uri: Stream URI. If provided, it will be used instead of the one from the project's secret.
     :param secret_provider: Optional secret provider to get the connection string secret.
                             If not set, the env vars are used.
+    :param profile: Optional datastore profile of the stream (V3IO/KafkaSource profile).
     :return: Monitoring stream path to the relevant application.
     """
 
-    try:
-        profile = _get_stream_profile(project=project, secret_provider=secret_provider)
-    except mlrun.errors.MLRunNotFoundError:
-        profile = None
+    profile = profile or _get_stream_profile(
+        project=project, secret_provider=secret_provider
+    )
 
     if isinstance(profile, mlrun.datastore.datastore_profile.DatastoreProfileV3io):
         stream_uri = "v3io"
-
-    stream_uri = stream_uri or mlrun.get_secret_or_env(
-        key=mm_constants.ProjectSecretKeys.STREAM_PATH, secret_provider=secret_provider
-    )
+    elif isinstance(
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource
+    ):
+        stream_uri = f"kafka://{profile.brokers[0]}"
+    else:
+        raise mlrun.errors.MLRunValueError(
+            f"Received an unexpected stream profile type: {type(profile)}\n"
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
+        )
 
     if not stream_uri or stream_uri == "v3io":
         stream_uri = mlrun.mlconf.get_model_monitoring_file_target_path(
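A minimal sketch of the new dispatch, with stand-in profile classes (the real ones live in mlrun.datastore.datastore_profile; this standalone version is illustrative only):

    from dataclasses import dataclass

    @dataclass
    class V3ioProfile:  # stand-in for DatastoreProfileV3io
        pass

    @dataclass
    class KafkaSourceProfile:  # stand-in for DatastoreProfileKafkaSource
        brokers: list

    def resolve_stream_uri(profile) -> str:
        if isinstance(profile, V3ioProfile):
            return "v3io"
        if isinstance(profile, KafkaSourceProfile):
            return f"kafka://{profile.brokers[0]}"
        raise ValueError(f"Unexpected stream profile type: {type(profile)}")

    print(resolve_stream_uri(KafkaSourceProfile(brokers=["broker-0:9092"])))
    # kafka://broker-0:9092

The key behavioral change: an unknown or missing profile now fails loudly instead of silently falling back to a project-secret stream path.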
@@ -273,7 +279,7 @@ def _get_profile(
     )
     if not profile_name:
         raise mlrun.errors.MLRunNotFoundError(
-            f"Not found `{profile_name_key}` profile name"
+            f"Not found `{profile_name_key}` profile name for project '{project}'"
         )
     return mlrun.datastore.datastore_profile.datastore_profile_read(
         url=f"ds://{profile_name}", project_name=project, secrets=secret_provider
mlrun/model_monitoring/stream_processing.py

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import collections
 import datetime
 import os
 import typing
@@ -29,11 +28,14 @@ import mlrun.model_monitoring.db
 import mlrun.serving.states
 import mlrun.utils
 from mlrun.common.schemas.model_monitoring.constants import (
+    ControllerEvent,
+    ControllerEventKind,
     EndpointType,
     EventFieldType,
     FileTargetKind,
     ProjectSecretKeys,
 )
+from mlrun.datastore import parse_kafka_url
 from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.utils import logger
 
@@ -88,7 +90,9 @@ class EventStreamProcessor:
         self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
         self.v3io_api = v3io_api or mlrun.mlconf.v3io_api
 
-        self.v3io_access_key = v3io_access_key or os.environ.get("V3IO_ACCESS_KEY")
+        self.v3io_access_key = v3io_access_key or mlrun.get_secret_or_env(
+            "V3IO_ACCESS_KEY"
+        )
         self.model_monitoring_access_key = (
             model_monitoring_access_key
             or os.environ.get(ProjectSecretKeys.ACCESS_KEY)
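The switch from `os.environ.get` to `mlrun.get_secret_or_env` means a configured secret provider takes precedence over the environment. A simplified re-implementation for illustration (not mlrun's actual code):

    import os
    from typing import Callable, Optional

    def get_secret_or_env(
        key: str,
        secret_provider: Optional[Callable[[str], str]] = None,
        default: str = "",
    ) -> str:
        # Prefer the secret store when a provider is given, else fall back to env vars.
        if secret_provider:
            value = secret_provider(key)
            if value:
                return value
        return os.environ.get(key, default)

    os.environ["V3IO_ACCESS_KEY"] = "env-key"
    print(get_secret_or_env("V3IO_ACCESS_KEY"))                      # env-key
    print(get_secret_or_env("V3IO_ACCESS_KEY", lambda k: "secret"))  # secret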
@@ -118,6 +122,7 @@ class EventStreamProcessor:
         self,
         fn: mlrun.runtimes.ServingRuntime,
         tsdb_connector: TSDBConnector,
+        controller_stream_uri: str,
     ) -> None:
         """
         Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
@@ -146,6 +151,8 @@ class EventStreamProcessor:
 
         :param fn:                    A serving function.
         :param tsdb_connector:        Time series database connector.
+        :param controller_stream_uri: The controller stream URI. The controller runs on the API server pod,
+                                      so this must be provided as an input.
         """
 
         graph = typing.cast(
@@ -209,6 +216,20 @@ class EventStreamProcessor:
         )
 
         apply_map_feature_names()
+        # split the graph between nop (control) events and valid events
+        graph.add_step(
+            "storey.Filter",
+            "FilterNOP",
+            after="MapFeatureNames",
+            _fn="(event.get('kind', '') != 'nop_event')",
+        )
+        graph.add_step(
+            "storey.Filter",
+            "ForwardNOP",
+            after="MapFeatureNames",
+            _fn="(event.get('kind', '') == 'nop_event')",
+        )
+
         tsdb_connector.apply_monitoring_stream_steps(
             graph=graph,
             aggregate_windows=self.aggregate_windows,
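Both filter steps hang off the same `MapFeatureNames` parent, so real inference events and `nop_event` control events flow down separate branches. A plain-Python stand-in for the two predicates (no storey dependency, for illustration only):

    events = [
        {"kind": "nop_event", "endpoint_id": "ep-1"},
        {"endpoint_id": "ep-2", "request": {"inputs": [[1, 2]]}},
    ]

    filter_nop = [e for e in events if e.get("kind", "") != "nop_event"]   # FilterNOP branch
    forward_nop = [e for e in events if e.get("kind", "") == "nop_event"]  # ForwardNOP branch
    print(len(filter_nop), len(forward_nop))  # 1 1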
@@ -221,7 +242,7 @@ class EventStreamProcessor:
         graph.add_step(
             "ProcessBeforeParquet",
             name="ProcessBeforeParquet",
-            after="MapFeatureNames",
+            after="FilterNOP",
             _fn="(event)",
         )
@@ -248,6 +269,44 @@ class EventStreamProcessor:
 
         apply_parquet_target()
 
+        # controller branch
+        def apply_push_controller_stream(stream_uri: str):
+            if stream_uri.startswith("v3io://"):
+                graph.add_step(
+                    ">>",
+                    "controller_stream_v3io",
+                    path=stream_uri,
+                    sharding_func=ControllerEvent.ENDPOINT_ID,
+                    access_key=self.v3io_access_key,
+                    after="ForwardNOP",
+                )
+            elif stream_uri.startswith("kafka://"):
+                topic, brokers = parse_kafka_url(stream_uri)
+                logger.info(
+                    "Controller stream uri for kafka",
+                    stream_uri=stream_uri,
+                    topic=topic,
+                    brokers=brokers,
+                )
+                if isinstance(brokers, list):
+                    path = f"kafka://{brokers[0]}/{topic}"
+                elif isinstance(brokers, str):
+                    path = f"kafka://{brokers}/{topic}"
+                else:
+                    raise mlrun.errors.MLRunInvalidArgumentError(
+                        "Brokers must be a list or str; check the controller stream uri"
+                    )
+                graph.add_step(
+                    ">>",
+                    "controller_stream_kafka",
+                    path=path,
+                    kafka_brokers=brokers,
+                    _sharding_func=ControllerEvent.ENDPOINT_ID,
+                    after="ForwardNOP",
+                )
+
+        apply_push_controller_stream(controller_stream_uri)
+
 
 class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
     def __init__(self, **kwargs):
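A hedged sketch of how a kafka:// controller-stream URI resolves to a concrete target path, mirroring the branch above. This is a simplified stand-in for mlrun.datastore.parse_kafka_url, for illustration only:

    from urllib.parse import urlparse

    def parse_kafka_url(url: str):
        # Split "kafka://broker[,broker...]/topic" into (topic, brokers).
        parsed = urlparse(url)
        brokers = parsed.netloc.split(",") if parsed.netloc else []
        topic = parsed.path.strip("/")
        return topic, brokers

    topic, brokers = parse_kafka_url("kafka://broker-0:9092/monitoring_controller")
    path = f"kafka://{brokers[0]}/{topic}"
    print(path)  # kafka://broker-0:9092/monitoring_controller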
@@ -313,14 +372,14 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         self.first_request: dict[str, str] = dict()
         self.last_request: dict[str, str] = dict()
 
-        # Number of errors (value) per endpoint (key)
-        self.error_count: dict[str, int] = collections.defaultdict(int)
-
         # Set of endpoints in the current events
         self.endpoints: set[str] = set()
 
     def do(self, full_event):
         event = full_event.body
+        if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
+            logger.info("Skipped nop event inside of ProcessEndpointEvent", event=event)
+            return storey.Event(body=[event])
         # Getting model version and function uri from event
         # and use them for retrieving the endpoint_id
         function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
@@ -354,10 +413,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         predictions = event.get("resp", {}).get("outputs")
 
         if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            timestamp,
-            ["when"],
+            validation_function=is_not_none,
+            field=timestamp,
+            dict_path=["when"],
         ):
             return None
 
@@ -369,31 +427,27 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         self.last_request[endpoint_id] = timestamp
 
         if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            request_id,
-            ["request", "id"],
+            validation_function=is_not_none,
+            field=request_id,
+            dict_path=["request", "id"],
         ):
             return None
         if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            latency,
-            ["microsec"],
+            validation_function=is_not_none,
+            field=latency,
+            dict_path=["microsec"],
         ):
             return None
         if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            features,
-            ["request", "inputs"],
+            validation_function=is_not_none,
+            field=features,
+            dict_path=["request", "inputs"],
         ):
             return None
         if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            predictions,
-            ["resp", "outputs"],
+            validation_function=is_not_none,
+            field=predictions,
+            dict_path=["resp", "outputs"],
         ):
             return None
@@ -430,6 +484,10 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             if not isinstance(feature, list):
                 feature = [feature]
 
+            effective_sample_count, estimated_prediction_count = (
+                self._get_effective_and_estimated_counts(event=event)
+            )
+
             events.append(
                 {
                     EventFieldType.FUNCTION_URI: function_uri,
@@ -447,12 +505,13 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                     EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
                         self.last_request[endpoint_id]
                     ).timestamp(),
-                    EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
                     EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
                     EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
                     EventFieldType.ENTITIES: event.get("request", {}).get(
                         EventFieldType.ENTITIES, {}
                     ),
+                    EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
+                    EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
                 }
             )
@@ -476,7 +535,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             .flat_dict()
         )
 
-        # If model endpoint found, get first_request, last_request and error_count values
+        # If model endpoint found, get first_request & last_request values
         if endpoint_record:
             first_request = endpoint_record.get(EventFieldType.FIRST_REQUEST)
 
@@ -487,26 +546,34 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             if last_request:
                 self.last_request[endpoint_id] = last_request
 
-            error_count = endpoint_record.get(EventFieldType.ERROR_COUNT)
-
-            if error_count:
-                self.error_count[endpoint_id] = int(error_count)
-
         # add endpoint to endpoints set
         self.endpoints.add(endpoint_id)
 
     def is_valid(
         self,
-        endpoint_id: str,
         validation_function,
         field: typing.Any,
         dict_path: list[str],
     ):
         if validation_function(field, dict_path):
             return True
-        self.error_count[endpoint_id] += 1
+
         return False
 
+    @staticmethod
+    def _get_effective_and_estimated_counts(event):
+        """
+        Calculate the `effective_sample_count` and the `estimated_prediction_count` based on the event's
+        sampling percentage. These values will be stored in the TSDB target.
+        Note that in non-batch serving, the `effective_sample_count` is always set to 1. In addition, when the
+        sampling percentage is 100%, the `estimated_prediction_count` is equal to the `effective_sample_count`.
+        """
+        effective_sample_count = event.get(EventFieldType.EFFECTIVE_SAMPLE_COUNT, 1)
+        estimated_prediction_count = effective_sample_count * (
+            100 / event.get(EventFieldType.SAMPLING_PERCENTAGE, 100)
+        )
+        return effective_sample_count, estimated_prediction_count
+
 
 def is_not_none(field: typing.Any, dict_path: list[str]):
     if field is not None:
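The sampling arithmetic in `_get_effective_and_estimated_counts` scales the observed sample count by the inverse of the sampling rate. A worked example (standalone, illustrative):

    def estimate(effective_sample_count=1, sampling_percentage=100):
        return effective_sample_count * (100 / sampling_percentage)

    print(estimate())        # 1.0  -- at 100% sampling the estimate equals the observed count
    print(estimate(5, 20))   # 25.0 -- 5 sampled events at a 20% sampling rate imply ~25 real predictions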
@@ -569,6 +636,9 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             return None
 
     def do(self, event: dict):
+        if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
+            logger.info("Skipped nop event inside of MapFeatureNames", event=event)
+            return event
         endpoint_id = event[EventFieldType.ENDPOINT_ID]
 
         feature_values = event[EventFieldType.FEATURES]
@@ -672,6 +742,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 )
             )
             self.first_request[endpoint_id] = True
+
         if attributes_to_update:
             logger.info(
                 "Updating endpoint record",
mlrun/package/context_handler.py

@@ -50,7 +50,7 @@ class ContextHandler:
         "numpy",
     ]
     # Optional packagers to be collected at initialization time:
-    _EXTENDED_PACKAGERS = []  # TODO: Create "matplotlib", "plotly", "bokeh" packagers.
+    _EXTENDED_PACKAGERS = []  # TODO: Create "matplotlib", "plotly" packagers.
     # Optional packagers from the `mlrun.frameworks` package:
     _MLRUN_FRAMEWORKS_PACKAGERS = []  # TODO: Create frameworks packagers.
     # Default priority values for packagers:
mlrun/package/packagers_manager.py

@@ -667,16 +667,9 @@ class PackagersManager:
                     data_item=data_item,
                     instructions={},
                 )
-            except Exception as exception:
+            except Exception:
                 # Could not unpack as the reduced type hint, collect the exception and go to the next one:
-                exception_string = "".join(
-                    traceback.format_exception(
-                        etype=type(exception),
-                        value=exception,
-                        tb=exception.__traceback__,
-                    )
-                )
-                found_packagers.append((packager, exception_string))
+                found_packagers.append((packager, traceback.format_exc()))
             # Reduce the type hint list and continue:
             possible_type_hints = TypeHintUtils.reduce_type_hint(
                 type_hint=possible_type_hints
@@ -692,15 +685,8 @@ class PackagersManager:
                 artifact_type=None,
                 instructions={},
             )
-        except Exception as exception:
-            exception_string = "".join(
-                traceback.format_exception(
-                    etype=type(exception),
-                    value=exception,
-                    tb=exception.__traceback__,
-                )
-            )
-            found_packagers.append((self._default_packager, exception_string))
+        except Exception:
+            found_packagers.append((self._default_packager, traceback.format_exc()))
 
         # The method did not return until this point, raise an error:
         raise MLRunPackageUnpackingError(
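This simplification is safe because, inside an `except` block, `traceback.format_exc()` returns the same formatted traceback the removed code assembled by hand (and the `etype` keyword it relied on was removed from `traceback.format_exception` in Python 3.10). A quick demonstration:

    import traceback

    try:
        raise ValueError("boom")
    except Exception:
        formatted = traceback.format_exc()

    print(formatted.splitlines()[-1])  # ValueError: boom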
mlrun/projects/pipelines.py

@@ -31,7 +31,7 @@ import mlrun_pipelines.patcher
 import mlrun_pipelines.utils
 from mlrun.errors import err_to_str
 from mlrun.utils import (
-    get_ui_url,
+    get_workflow_url,
     logger,
     normalize_workflow_name,
     retry_until_successful,
@@ -523,11 +523,12 @@ class _PipelineRunner(abc.ABC):
         text = _PipelineRunner._generate_workflow_finished_message(
             run.run_id, errors_counter, run._state
         )
-
         notifiers = notifiers or project.notifiers
         if notifiers:
             notifiers.push(text, "info", runs)
 
+        project.push_pipeline_notification_kfp_runner(run.run_id, run._state, text)
+
         if raise_error:
             raise raise_error
         return state or run._state, errors_counter, text
@@ -620,6 +621,8 @@ class _KFPRunner(_PipelineRunner):
             params.update(notification.secret_params)
             project.notifiers.add_notification(notification.kind, params)
 
+        project.spec.notifications = notifications
+
         run_id = _run_pipeline(
             workflow_handler,
             project=project.metadata.name,
@@ -647,13 +650,23 @@ class _KFPRunner(_PipelineRunner):
                 exc_info=err_to_str(exc),
             )
 
-        # TODO: we should check how can we get the run uid when we don't have the context (for example on
-        # mlrun.load_project() and later call directly to project.run)
+        # Pushing only the notifications relevant to the client (ipython and console)
+        project.notifiers.push_pipeline_start_message_from_client(
+            project.metadata.name, pipeline_id=run_id
+        )
+
         if context:
             project.notifiers.push_pipeline_start_message(
                 project.metadata.name,
                 context.uid,
             )
+        else:
+            project.push_pipeline_notification_kfp_runner(
+                run_id,
+                mlrun_pipelines.common.models.RunStatuses.running,
+                f"Workflow {run_id} started in project {project.metadata.name}",
+                notifications,
+            )
         pipeline_context.clear()
         return _PipelineRunStatus(run_id, cls, project=project, workflow=workflow_spec)
 
@@ -1212,7 +1225,7 @@ def notify_scheduled_workflow_failure(
     notification_pusher = mlrun.utils.notifications.CustomNotificationPusher(
         ["slack"]
     )
-    url = get_ui_url(project_name, context_uid)
+    url = get_workflow_url(project_name, context_uid)
     link = f"<{url}|*view workflow job details*>"
     message = (
         f":x: Failed to run scheduled workflow {workflow_name} "