mlrun 1.10.0rc11__py3-none-any.whl → 1.10.0rc13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. See the package registry's advisory page for more details.

Files changed (59)
  1. mlrun/__init__.py +2 -1
  2. mlrun/__main__.py +7 -1
  3. mlrun/artifacts/base.py +9 -3
  4. mlrun/artifacts/dataset.py +2 -1
  5. mlrun/artifacts/llm_prompt.py +6 -2
  6. mlrun/artifacts/model.py +2 -2
  7. mlrun/common/constants.py +1 -0
  8. mlrun/common/runtimes/constants.py +10 -1
  9. mlrun/common/schemas/__init__.py +1 -1
  10. mlrun/common/schemas/model_monitoring/model_endpoints.py +1 -1
  11. mlrun/common/schemas/serving.py +7 -0
  12. mlrun/config.py +21 -2
  13. mlrun/datastore/__init__.py +3 -1
  14. mlrun/datastore/alibaba_oss.py +1 -1
  15. mlrun/datastore/azure_blob.py +1 -1
  16. mlrun/datastore/base.py +6 -31
  17. mlrun/datastore/datastore.py +109 -33
  18. mlrun/datastore/datastore_profile.py +31 -0
  19. mlrun/datastore/dbfs_store.py +1 -1
  20. mlrun/datastore/google_cloud_storage.py +2 -2
  21. mlrun/datastore/model_provider/__init__.py +13 -0
  22. mlrun/datastore/model_provider/model_provider.py +160 -0
  23. mlrun/datastore/model_provider/openai_provider.py +144 -0
  24. mlrun/datastore/remote_client.py +65 -0
  25. mlrun/datastore/s3.py +1 -1
  26. mlrun/datastore/storeytargets.py +1 -1
  27. mlrun/datastore/utils.py +22 -0
  28. mlrun/datastore/v3io.py +1 -1
  29. mlrun/db/base.py +1 -1
  30. mlrun/db/httpdb.py +9 -4
  31. mlrun/db/nopdb.py +1 -1
  32. mlrun/execution.py +28 -7
  33. mlrun/launcher/base.py +23 -13
  34. mlrun/launcher/local.py +3 -1
  35. mlrun/launcher/remote.py +4 -2
  36. mlrun/model.py +65 -0
  37. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +175 -8
  38. mlrun/package/packagers_manager.py +2 -0
  39. mlrun/projects/operations.py +8 -1
  40. mlrun/projects/pipelines.py +40 -18
  41. mlrun/projects/project.py +28 -5
  42. mlrun/run.py +42 -2
  43. mlrun/runtimes/__init__.py +6 -0
  44. mlrun/runtimes/base.py +24 -6
  45. mlrun/runtimes/daskjob.py +1 -0
  46. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  47. mlrun/runtimes/local.py +1 -6
  48. mlrun/serving/server.py +1 -2
  49. mlrun/serving/states.py +438 -23
  50. mlrun/serving/system_steps.py +27 -29
  51. mlrun/utils/helpers.py +13 -2
  52. mlrun/utils/notifications/notification_pusher.py +15 -0
  53. mlrun/utils/version/version.json +2 -2
  54. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/METADATA +2 -2
  55. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/RECORD +59 -55
  56. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/WHEEL +0 -0
  57. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/entry_points.txt +0 -0
  58. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/licenses/LICENSE +0 -0
  59. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/top_level.txt +0 -0
mlrun/execution.py CHANGED
@@ -26,6 +26,7 @@ from dateutil import parser
26
26
  import mlrun
27
27
  import mlrun.common.constants as mlrun_constants
28
28
  import mlrun.common.formatters
29
+ import mlrun.common.runtimes.constants
29
30
  from mlrun.artifacts import (
30
31
  Artifact,
31
32
  DatasetArtifact,
@@ -91,6 +92,8 @@ class MLClientCtx:
91
92
  self._autocommit = autocommit
92
93
  self._notifications = []
93
94
  self._state_thresholds = {}
95
+ self._retry_spec = {}
96
+ self._retry_count = None
94
97
 
95
98
  self._labels = {}
96
99
  self._annotations = {}
@@ -432,6 +435,7 @@ class MLClientCtx:
432
435
  self._tolerations = spec.get("tolerations", self._tolerations)
433
436
  self._affinity = spec.get("affinity", self._affinity)
434
437
  self._reset_on_run = spec.get("reset_on_run", self._reset_on_run)
438
+ self._retry_spec = spec.get("retry", self._retry_spec)
435
439
 
436
440
  self._init_dbs(rundb)
437
441
 
@@ -450,10 +454,11 @@ class MLClientCtx:
450
454
  if start:
451
455
  start = parser.parse(start) if isinstance(start, str) else start
452
456
  self._start_time = start
453
- self._state = "running"
457
+ self._state = mlrun.common.runtimes.constants.RunStates.running
454
458
 
455
459
  status = attrs.get("status")
456
- if include_status and status:
460
+ retry_configured = self._retry_spec and self._retry_spec.get("count")
461
+ if (include_status or retry_configured) and status:
457
462
  self._results = status.get("results", self._results)
458
463
  for artifact in status.get("artifacts", []):
459
464
  artifact_obj = dict_to_artifact(artifact)
@@ -462,7 +467,10 @@ class MLClientCtx:
462
467
  )
463
468
  for key, uri in status.get("artifact_uris", {}).items():
464
469
  self._artifacts_manager.artifact_uris[key] = uri
465
- self._state = status.get("state", self._state)
470
+ self._retry_count = status.get("retry_count", self._retry_count)
471
+ # if run is a retry, the state needs to move to running
472
+ if include_status:
473
+ self._state = status.get("state", self._state)
466
474
 
467
475
  # No need to store the run for every worker
468
476
  if store_run and self.is_logging_worker():
@@ -953,6 +961,11 @@ class MLClientCtx:
953
961
  :returns: The logged `LLMPromptArtifact` object.
954
962
  """
955
963
 
964
+ if not prompt_string and not prompt_path:
965
+ raise mlrun.errors.MLRunInvalidArgumentError(
966
+ "Either 'prompt_string' or 'prompt_path' must be provided"
967
+ )
968
+
956
969
  llm_prompt = LLMPromptArtifact(
957
970
  key=key,
958
971
  project=self.project or "",
@@ -1107,13 +1120,13 @@ class MLClientCtx:
1107
1120
  :param completed: Mark run as completed
1108
1121
  """
1109
1122
  # Changing state to completed is allowed only when the execution is in running state
1110
- if self._state != "running":
1123
+ if self._state != mlrun.common.runtimes.constants.RunStates.running:
1111
1124
  completed = False
1112
1125
 
1113
1126
  if message:
1114
1127
  self._annotations["message"] = message
1115
1128
  if completed:
1116
- self._state = "completed"
1129
+ self._state = mlrun.common.runtimes.constants.RunStates.completed
1117
1130
 
1118
1131
  if self._parent:
1119
1132
  self._parent.update_child_iterations()
@@ -1147,9 +1160,15 @@ class MLClientCtx:
1147
1160
  updates = {"status.last_update": now_date().isoformat()}
1148
1161
 
1149
1162
  if error is not None:
1150
- self._state = "error"
1163
+ state = mlrun.common.runtimes.constants.RunStates.error
1164
+ max_retries = self._retry_spec.get("count", 0)
1165
+ self._retry_count = self._retry_count or 0
1166
+ if max_retries and self._retry_count < max_retries:
1167
+ state = mlrun.common.runtimes.constants.RunStates.pending_retry
1168
+
1169
+ self._state = state
1151
1170
  self._error = str(error)
1152
- updates["status.state"] = "error"
1171
+ updates["status.state"] = state
1153
1172
  updates["status.error"] = error
1154
1173
  elif (
1155
1174
  execution_state
@@ -1241,11 +1260,13 @@ class MLClientCtx:
1241
1260
  "node_selector": self._node_selector,
1242
1261
  "tolerations": self._tolerations,
1243
1262
  "affinity": self._affinity,
1263
+ "retry": self._retry_spec,
1244
1264
  },
1245
1265
  "status": {
1246
1266
  "results": self._results,
1247
1267
  "start_time": to_date_str(self._start_time),
1248
1268
  "last_update": to_date_str(self._last_update),
1269
+ "retry_count": self._retry_count,
1249
1270
  },
1250
1271
  }
1251
1272
 
mlrun/launcher/base.py CHANGED
@@ -18,6 +18,8 @@ import os
18
18
  import uuid
19
19
  from typing import Any, Callable, Optional, Union
20
20
 
21
+ import mlrun.common.constants
22
+ import mlrun.common.runtimes.constants
21
23
  import mlrun.common.schemas
22
24
  import mlrun.config
23
25
  import mlrun.errors
@@ -72,6 +74,7 @@ class BaseLauncher(abc.ABC):
72
74
  notifications: Optional[list[mlrun.model.Notification]] = None,
73
75
  returns: Optional[list[Union[str, dict[str, str]]]] = None,
74
76
  state_thresholds: Optional[dict[str, int]] = None,
77
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
75
78
  ) -> "mlrun.run.RunObject":
76
79
  """run the function from the server/client[local/remote]"""
77
80
  pass
@@ -133,7 +136,7 @@ class BaseLauncher(abc.ABC):
133
136
  """Check if the runtime requires to build the image and updates the spec accordingly"""
134
137
  pass
135
138
 
136
- def _validate_runtime(
139
+ def _validate_run(
137
140
  self,
138
141
  runtime: "mlrun.runtimes.BaseRuntime",
139
142
  run: "mlrun.run.RunObject",
@@ -194,7 +197,7 @@ class BaseLauncher(abc.ABC):
194
197
  )
195
198
 
196
199
  @classmethod
197
- def _validate_run_single_param(cls, param_name, param_value):
200
+ def _validate_run_single_param(cls, param_name: str, param_value: int):
198
201
  # verify that integer parameters don't exceed a int64
199
202
  if isinstance(param_value, int) and abs(param_value) >= 2**63:
200
203
  raise mlrun.errors.MLRunInvalidArgumentError(
@@ -203,8 +206,6 @@ class BaseLauncher(abc.ABC):
203
206
 
204
207
  @staticmethod
205
208
  def _create_run_object(task):
206
- valid_task_types = (dict, mlrun.run.RunTemplate, mlrun.run.RunObject)
207
-
208
209
  if not task:
209
210
  # if task passed generate default RunObject
210
211
  return mlrun.run.RunObject.from_dict(task)
@@ -215,18 +216,18 @@ class BaseLauncher(abc.ABC):
215
216
  if isinstance(task, str):
216
217
  task = ast.literal_eval(task)
217
218
 
218
- if not isinstance(task, valid_task_types):
219
- raise mlrun.errors.MLRunInvalidArgumentError(
220
- f"Task is not a valid object, type={type(task)}, expected types={valid_task_types}"
221
- )
222
-
219
+ valid_task_types = (dict, mlrun.run.RunTemplate, mlrun.run.RunObject)
220
+ if isinstance(task, mlrun.run.RunObject):
221
+ # if task is already a RunObject, we can return it as is
222
+ return task
223
223
  if isinstance(task, mlrun.run.RunTemplate):
224
224
  return mlrun.run.RunObject.from_template(task)
225
225
  elif isinstance(task, dict):
226
226
  return mlrun.run.RunObject.from_dict(task)
227
227
 
228
- # task is already a RunObject
229
- return task
228
+ raise mlrun.errors.MLRunInvalidArgumentError(
229
+ f"Task is not a valid object, type={type(task)}, expected types={valid_task_types}"
230
+ )
230
231
 
231
232
  @staticmethod
232
233
  def _enrich_run(
@@ -246,6 +247,7 @@ class BaseLauncher(abc.ABC):
246
247
  workdir=None,
247
248
  notifications: Optional[list[mlrun.model.Notification]] = None,
248
249
  state_thresholds: Optional[dict[str, int]] = None,
250
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
249
251
  ):
250
252
  run.spec.handler = (
251
253
  handler or run.spec.handler or runtime.spec.default_handler or ""
@@ -364,6 +366,7 @@ class BaseLauncher(abc.ABC):
364
366
  | state_thresholds
365
367
  )
366
368
  run.spec.state_thresholds = state_thresholds or run.spec.state_thresholds
369
+ run.spec.retry = retry or run.spec.retry
367
370
  return run
368
371
 
369
372
  @staticmethod
@@ -410,7 +413,7 @@ class BaseLauncher(abc.ABC):
410
413
  )
411
414
  if (
412
415
  run.status.state
413
- in mlrun.common.runtimes.constants.RunStates.error_and_abortion_states()
416
+ in mlrun.common.runtimes.constants.RunStates.error_states()
414
417
  ):
415
418
  if runtime._is_remote and not runtime.is_child:
416
419
  logger.error(
@@ -418,7 +421,14 @@ class BaseLauncher(abc.ABC):
418
421
  state=run.status.state,
419
422
  status=run.status.to_dict(),
420
423
  )
421
- raise mlrun.runtimes.utils.RunError(run.error)
424
+
425
+ error = run.error
426
+ if (
427
+ run.status.state
428
+ == mlrun.common.runtimes.constants.RunStates.pending_retry
429
+ ):
430
+ error = f"Run is pending retry, error: {run.error}"
431
+ raise mlrun.runtimes.utils.RunError(error)
422
432
  return run
423
433
 
424
434
  return None
mlrun/launcher/local.py CHANGED
@@ -72,6 +72,7 @@ class ClientLocalLauncher(launcher.ClientBaseLauncher):
72
72
  returns: Optional[list[Union[str, dict[str, str]]]] = None,
73
73
  state_thresholds: Optional[dict[str, int]] = None,
74
74
  reset_on_run: Optional[bool] = None,
75
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
75
76
  ) -> "mlrun.run.RunObject":
76
77
  # do not allow local function to be scheduled
77
78
  if schedule is not None:
@@ -122,8 +123,9 @@ class ClientLocalLauncher(launcher.ClientBaseLauncher):
122
123
  workdir=workdir,
123
124
  notifications=notifications,
124
125
  state_thresholds=state_thresholds,
126
+ retry=retry,
125
127
  )
126
- self._validate_runtime(runtime, run)
128
+ self._validate_run(runtime, run)
127
129
  result = self._execute(
128
130
  runtime=runtime,
129
131
  run=run,
mlrun/launcher/remote.py CHANGED
@@ -61,6 +61,7 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
61
61
  returns: Optional[list[Union[str, dict[str, str]]]] = None,
62
62
  state_thresholds: Optional[dict[str, int]] = None,
63
63
  reset_on_run: Optional[bool] = None,
64
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
64
65
  ) -> "mlrun.run.RunObject":
65
66
  self.enrich_runtime(runtime, project)
66
67
  run = self._create_run_object(task)
@@ -82,8 +83,9 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
82
83
  workdir=workdir,
83
84
  notifications=notifications,
84
85
  state_thresholds=state_thresholds,
86
+ retry=retry,
85
87
  )
86
- self._validate_runtime(runtime, run)
88
+ self._validate_run(runtime, run)
87
89
 
88
90
  if not runtime.is_deployed():
89
91
  if runtime.spec.build.auto_build or auto_build:
@@ -190,7 +192,7 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
190
192
  return self._wrap_run_result(runtime, resp, run, schedule=schedule)
191
193
 
192
194
  @classmethod
193
- def _validate_run_single_param(cls, param_name, param_value):
195
+ def _validate_run_single_param(cls, param_name: str, param_value: int):
194
196
  if isinstance(param_value, pd.DataFrame):
195
197
  raise mlrun.errors.MLRunInvalidArgumentTypeError(
196
198
  f"Parameter '{param_name}' has an unsupported value of type"
mlrun/model.py CHANGED
@@ -935,6 +935,41 @@ class HyperParamOptions(ModelObj):
935
935
  )
936
936
 
937
937
 
938
+ class RetryBackoff(ModelObj):
939
+ """Backoff strategy for retries."""
940
+
941
+ def __init__(self, base_delay: Optional[str] = None):
942
+ # The base_delay time string must conform to timelength python package standards and be at least
943
+ # mlrun.mlconf.function.spec.retry.backoff.min_base_delay (e.g. 1000s, 1 hour 30m, 1h etc.).
944
+ self.base_delay = (
945
+ base_delay or mlrun.mlconf.function.spec.retry.backoff.default_base_delay
946
+ )
947
+
948
+
949
+ class Retry(ModelObj):
950
+ """Retry configuration"""
951
+
952
+ def __init__(
953
+ self,
954
+ count: int = 0,
955
+ backoff: typing.Union[RetryBackoff, dict] = None,
956
+ ):
957
+ # Set to None if count is 0 to eliminate the retry configuration from the dictionary representation.
958
+ self.count = count or None
959
+ self.backoff = backoff
960
+
961
+ @property
962
+ def backoff(self) -> Optional[RetryBackoff]:
963
+ if not self.count:
964
+ # Retry is not configured, return None
965
+ return None
966
+ return self._backoff
967
+
968
+ @backoff.setter
969
+ def backoff(self, backoff):
970
+ self._backoff = self._verify_dict(backoff, "backoff", RetryBackoff)
971
+
972
+
938
973
  class RunSpec(ModelObj):
939
974
  """Run specification"""
940
975
 
@@ -971,6 +1006,7 @@ class RunSpec(ModelObj):
971
1006
  node_selector=None,
972
1007
  tolerations=None,
973
1008
  affinity=None,
1009
+ retry=None,
974
1010
  ):
975
1011
  # A dictionary of parsing configurations that will be read from the inputs the user set. The keys are the inputs
976
1012
  # keys (parameter names) and the values are the type hint given in the input keys after the colon.
@@ -1011,6 +1047,7 @@ class RunSpec(ModelObj):
1011
1047
  self.node_selector = node_selector or {}
1012
1048
  self.tolerations = tolerations or {}
1013
1049
  self.affinity = affinity or {}
1050
+ self.retry = retry or {}
1014
1051
 
1015
1052
  def _serialize_field(
1016
1053
  self, struct: dict, field_name: Optional[str] = None, strip: bool = False
@@ -1212,6 +1249,14 @@ class RunSpec(ModelObj):
1212
1249
  self._verify_dict(state_thresholds, "state_thresholds")
1213
1250
  self._state_thresholds = state_thresholds
1214
1251
 
1252
+ @property
1253
+ def retry(self) -> Retry:
1254
+ return self._retry
1255
+
1256
+ @retry.setter
1257
+ def retry(self, retry: typing.Union[Retry, dict]):
1258
+ self._retry = self._verify_dict(retry, "retry", Retry)
1259
+
1215
1260
  def extract_type_hints_from_inputs(self):
1216
1261
  """
1217
1262
  This method extracts the type hints from the input keys in the input dictionary.
@@ -1329,6 +1374,7 @@ class RunStatus(ModelObj):
1329
1374
  reason: Optional[str] = None,
1330
1375
  notifications: Optional[dict[str, Notification]] = None,
1331
1376
  artifact_uris: Optional[dict[str, str]] = None,
1377
+ retry_count: Optional[int] = None,
1332
1378
  ):
1333
1379
  self.state = state or "created"
1334
1380
  self.status_text = status_text
@@ -1346,6 +1392,7 @@ class RunStatus(ModelObj):
1346
1392
  self.notifications = notifications or {}
1347
1393
  # Artifact key -> URI mapping, since the full artifacts are not stored in the runs DB table
1348
1394
  self._artifact_uris = artifact_uris or {}
1395
+ self._retry_count = retry_count or None
1349
1396
 
1350
1397
  @classmethod
1351
1398
  def from_dict(
@@ -1399,6 +1446,21 @@ class RunStatus(ModelObj):
1399
1446
 
1400
1447
  self._artifact_uris = resolved_artifact_uris
1401
1448
 
1449
+ @property
1450
+ def retry_count(self) -> Optional[int]:
1451
+ """
1452
+ The number of retries that were made for this run.
1453
+ """
1454
+ return self._retry_count
1455
+
1456
+ @retry_count.setter
1457
+ def retry_count(self, retry_count: int):
1458
+ """
1459
+ Set the number of retries that were made for this run.
1460
+ :param retry_count: The number of retries.
1461
+ """
1462
+ self._retry_count = retry_count
1463
+
1402
1464
  def is_failed(self) -> Optional[bool]:
1403
1465
  """
1404
1466
  This method returns whether a run has failed.
@@ -2026,6 +2088,7 @@ def new_task(
2026
2088
  secrets=None,
2027
2089
  base=None,
2028
2090
  returns=None,
2091
+ retry=None,
2029
2092
  ) -> RunTemplate:
2030
2093
  """Creates a new task
2031
2094
 
@@ -2061,6 +2124,7 @@ def new_task(
2061
2124
  * A dictionary of configurations to use when logging. Further info per object type and
2062
2125
  artifact type can be given there. The artifact key must appear in the dictionary as
2063
2126
  "key": "the_key".
2127
+ :param retry: Retry configuration for the run, can be a dict or an instance of mlrun.model.Retry.
2064
2128
  """
2065
2129
 
2066
2130
  if base:
@@ -2086,6 +2150,7 @@ def new_task(
2086
2150
  run.spec.hyper_param_options.selector = (
2087
2151
  selector or run.spec.hyper_param_options.selector
2088
2152
  )
2153
+ run.spec.retry = retry or run.spec.retry
2089
2154
  return run
2090
2155
 
2091
2156
 
@@ -804,25 +804,45 @@ class V3IOTSDBConnector(TSDBConnector):
804
804
  @staticmethod
805
805
  def _get_sql_query(
806
806
  *,
807
- endpoint_id: str,
808
807
  table_path: str,
808
+ endpoint_id: Optional[str] = None,
809
+ application_names: Optional[list[str]] = None,
809
810
  name: str = mm_schemas.ResultData.RESULT_NAME,
810
811
  metric_and_app_names: Optional[list[tuple[str, str]]] = None,
811
812
  columns: Optional[list[str]] = None,
813
+ group_by_columns: Optional[list[str]] = None,
812
814
  ) -> str:
813
815
  """Get the SQL query for the results/metrics table"""
816
+
817
+ if metric_and_app_names and not endpoint_id:
818
+ raise mlrun.errors.MLRunInvalidArgumentError(
819
+ "If metric_and_app_names is provided, endpoint_id must also be provided"
820
+ )
821
+
822
+ if metric_and_app_names and application_names:
823
+ raise mlrun.errors.MLRunInvalidArgumentError(
824
+ "Cannot provide both metric_and_app_names and application_names"
825
+ )
826
+
814
827
  if columns:
815
828
  selection = ",".join(columns)
816
829
  else:
817
830
  selection = "*"
818
831
 
819
832
  with StringIO() as query:
820
- query.write(
821
- f"SELECT {selection} FROM '{table_path}' "
822
- f"WHERE {mm_schemas.WriterEvent.ENDPOINT_ID}='{endpoint_id}'"
823
- )
833
+ where_added = False
834
+ query.write(f"SELECT {selection} FROM '{table_path}'")
835
+ if endpoint_id:
836
+ query.write(
837
+ f" WHERE {mm_schemas.WriterEvent.ENDPOINT_ID}='{endpoint_id}'"
838
+ )
839
+ where_added = True
824
840
  if metric_and_app_names:
825
- query.write(" AND (")
841
+ if where_added:
842
+ query.write(" AND (")
843
+ else:
844
+ query.write(" WHERE (")
845
+ where_added = True
826
846
 
827
847
  for i, (app_name, result_name) in enumerate(metric_and_app_names):
828
848
  sub_cond = (
@@ -835,6 +855,22 @@ class V3IOTSDBConnector(TSDBConnector):
835
855
 
836
856
  query.write(")")
837
857
 
858
+ if application_names:
859
+ if where_added:
860
+ query.write(" AND (")
861
+ else:
862
+ query.write(" WHERE (")
863
+ for i, app_name in enumerate(application_names):
864
+ sub_cond = f"{mm_schemas.WriterEvent.APPLICATION_NAME}='{app_name}'"
865
+ if i != 0: # not first sub condition
866
+ query.write(" OR ")
867
+ query.write(sub_cond)
868
+ query.write(")")
869
+
870
+ if group_by_columns:
871
+ query.write(" GROUP BY ")
872
+ query.write(",".join(group_by_columns))
873
+
838
874
  query.write(";")
839
875
  return query.getvalue()
840
876
 
@@ -1272,7 +1308,49 @@ class V3IOTSDBConnector(TSDBConnector):
1272
1308
  end: Optional[Union[datetime, str]] = None,
1273
1309
  application_names: Optional[Union[str, list[str]]] = None,
1274
1310
  ) -> dict[str, int]:
1275
- raise NotImplementedError
1311
+ start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
1312
+ group_by_columns = [
1313
+ mm_schemas.ApplicationEvent.APPLICATION_NAME,
1314
+ mm_schemas.ApplicationEvent.ENDPOINT_ID,
1315
+ ]
1316
+
1317
+ def get_application_endpoints_records(
1318
+ record_type: Literal["metrics", "results"],
1319
+ ):
1320
+ if record_type == "results":
1321
+ table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
1322
+ else:
1323
+ table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
1324
+ sql_query = self._get_sql_query(
1325
+ table_path=table_path,
1326
+ columns=[mm_schemas.WriterEvent.START_INFER_TIME],
1327
+ group_by_columns=group_by_columns,
1328
+ application_names=application_names,
1329
+ )
1330
+ return self.frames_client.read(
1331
+ backend=_TSDB_BE,
1332
+ start=start,
1333
+ end=end,
1334
+ query=sql_query,
1335
+ )
1336
+
1337
+ df_results = get_application_endpoints_records("results")
1338
+ df_metrics = get_application_endpoints_records("metrics")
1339
+
1340
+ if df_results.empty and df_metrics.empty:
1341
+ return {}
1342
+
1343
+ # Combine the two dataframes and count unique endpoints per application
1344
+ combined_df = pd.concat([df_results, df_metrics], ignore_index=True)
1345
+ if combined_df.empty:
1346
+ return {}
1347
+ combined_df.drop_duplicates(subset=group_by_columns, inplace=True)
1348
+
1349
+ grouped_df = combined_df.groupby(
1350
+ mm_schemas.WriterEvent.APPLICATION_NAME
1351
+ ).count()
1352
+
1353
+ return grouped_df[mm_schemas.WriterEvent.ENDPOINT_ID].to_dict()
1276
1354
 
1277
1355
  def calculate_latest_metrics(
1278
1356
  self,
@@ -1282,4 +1360,93 @@ class V3IOTSDBConnector(TSDBConnector):
1282
1360
  ) -> list[
1283
1361
  Union[mm_schemas.ApplicationResultRecord, mm_schemas.ApplicationMetricRecord]
1284
1362
  ]:
1285
- raise NotImplementedError
1363
+ metric_list = []
1364
+ start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
1365
+
1366
+ # Get the latest results
1367
+ def get_latest_metrics_records(
1368
+ record_type: Literal["metrics", "results"],
1369
+ ) -> pd.DataFrame:
1370
+ group_by_columns = [mm_schemas.ApplicationEvent.APPLICATION_NAME]
1371
+ if record_type == "results":
1372
+ table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
1373
+ columns = [
1374
+ f"last({mm_schemas.ResultData.RESULT_STATUS})",
1375
+ f"last({mm_schemas.ResultData.RESULT_VALUE})",
1376
+ f"last({mm_schemas.ResultData.RESULT_KIND})",
1377
+ ]
1378
+ group_by_columns += [
1379
+ mm_schemas.ResultData.RESULT_NAME,
1380
+ ]
1381
+ else:
1382
+ table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
1383
+ columns = [f"last({mm_schemas.MetricData.METRIC_VALUE})"]
1384
+ group_by_columns += [
1385
+ mm_schemas.MetricData.METRIC_NAME,
1386
+ ]
1387
+ sql_query = self._get_sql_query(
1388
+ table_path=table_path,
1389
+ columns=columns,
1390
+ group_by_columns=group_by_columns,
1391
+ application_names=application_names,
1392
+ )
1393
+
1394
+ return self.frames_client.read(
1395
+ backend=_TSDB_BE,
1396
+ start=start,
1397
+ end=end,
1398
+ query=sql_query,
1399
+ )
1400
+
1401
+ df_results = get_latest_metrics_records("results")
1402
+ df_metrics = get_latest_metrics_records("metrics")
1403
+
1404
+ if df_results.empty and df_metrics.empty:
1405
+ return metric_list
1406
+
1407
+ # Convert the results DataFrame to a list of ApplicationResultRecord
1408
+ def build_metric_objects() -> (
1409
+ list[
1410
+ Union[
1411
+ mm_schemas.ApplicationResultRecord,
1412
+ mm_schemas.ApplicationMetricRecord,
1413
+ ]
1414
+ ]
1415
+ ):
1416
+ metric_objects = []
1417
+ if not df_results.empty:
1418
+ df_results.rename(
1419
+ columns={
1420
+ f"last({mm_schemas.ResultData.RESULT_VALUE})": mm_schemas.ResultData.RESULT_VALUE,
1421
+ f"last({mm_schemas.ResultData.RESULT_STATUS})": mm_schemas.ResultData.RESULT_STATUS,
1422
+ f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND,
1423
+ },
1424
+ inplace=True,
1425
+ )
1426
+ for _, row in df_results.iterrows():
1427
+ metric_objects.append(
1428
+ mm_schemas.ApplicationResultRecord(
1429
+ result_name=row[mm_schemas.ResultData.RESULT_NAME],
1430
+ kind=row[mm_schemas.ResultData.RESULT_KIND],
1431
+ status=row[mm_schemas.ResultData.RESULT_STATUS],
1432
+ value=row[mm_schemas.ResultData.RESULT_VALUE],
1433
+ )
1434
+ )
1435
+ if not df_metrics.empty:
1436
+ df_metrics.rename(
1437
+ columns={
1438
+ f"last({mm_schemas.MetricData.METRIC_VALUE})": mm_schemas.MetricData.METRIC_VALUE,
1439
+ },
1440
+ inplace=True,
1441
+ )
1442
+
1443
+ for _, row in df_metrics.iterrows():
1444
+ metric_objects.append(
1445
+ mm_schemas.ApplicationMetricRecord(
1446
+ metric_name=row[mm_schemas.MetricData.METRIC_NAME],
1447
+ value=row[mm_schemas.MetricData.METRIC_VALUE],
1448
+ )
1449
+ )
1450
+ return metric_objects
1451
+
1452
+ return build_metric_objects()
@@ -21,6 +21,7 @@ from typing import Any, Optional, Union
21
21
 
22
22
  import mlrun.errors
23
23
  from mlrun.artifacts import Artifact
24
+ from mlrun.artifacts.base import verify_target_path
24
25
  from mlrun.datastore import DataItem, get_store_resource, store_manager
25
26
  from mlrun.errors import MLRunInvalidArgumentError
26
27
  from mlrun.utils import logger
@@ -276,6 +277,7 @@ class PackagersManager:
276
277
  if data_item.get_artifact_type():
277
278
  # Get the artifact object in the data item:
278
279
  artifact, _ = store_manager.get_store_artifact(url=data_item.artifact_url)
280
+ verify_target_path(artifact)
279
281
  # Get the key from the artifact's metadata and instructions from the artifact's spec:
280
282
  artifact_key = artifact.metadata.key
281
283
  packaging_instructions = artifact.spec.unpackaging_instructions
@@ -20,7 +20,6 @@ import mlrun
20
20
  import mlrun.common.constants as mlrun_constants
21
21
  import mlrun.common.schemas.function
22
22
  import mlrun.common.schemas.workflow
23
- import mlrun_pipelines.common.models
24
23
  import mlrun_pipelines.models
25
24
  from mlrun.utils import hub_prefix
26
25
 
@@ -82,6 +81,7 @@ def run_function(
82
81
  builder_env: Optional[list] = None,
83
82
  reset_on_run: Optional[bool] = None,
84
83
  output_path: Optional[str] = None,
84
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
85
85
  ) -> Union[mlrun.model.RunObject, mlrun_pipelines.models.PipelineNodeWrapper]:
86
86
  """Run a local or remote task as part of a local/kubeflow pipeline
87
87
 
@@ -177,6 +177,7 @@ def run_function(
177
177
  This ensures latest code changes are executed. This argument must be used in
178
178
  conjunction with the local=True argument.
179
179
  :param output_path: path to store artifacts, when running in a workflow this will be set automatically
180
+ :param retry: Retry configuration for the run, can be a dict or an instance of mlrun.model.Retry.
180
181
  :return: MLRun RunObject or PipelineNodeWrapper
181
182
  """
182
183
  if artifact_path:
@@ -197,6 +198,7 @@ def run_function(
197
198
  returns=returns,
198
199
  base=base_task,
199
200
  selector=selector,
201
+ retry=retry,
200
202
  )
201
203
  task.spec.verbose = task.spec.verbose or verbose
202
204
 
@@ -205,6 +207,11 @@ def run_function(
205
207
  raise mlrun.errors.MLRunInvalidArgumentError(
206
208
  "Scheduling jobs is not supported when running a workflow with the kfp engine."
207
209
  )
210
+ if retry:
211
+ raise mlrun.errors.MLRunInvalidArgumentError(
212
+ "Retrying jobs is not supported when running a workflow with the kfp engine. "
213
+ "Use KFP set_retry instead."
214
+ )
208
215
  return function.as_step(
209
216
  name=name, runspec=task, workdir=workdir, outputs=outputs, labels=labels
210
217
  )