mlrun 1.10.0rc9__py3-none-any.whl → 1.10.0rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)
  1. mlrun/artifacts/manager.py +1 -1
  2. mlrun/common/constants.py +12 -0
  3. mlrun/common/schemas/__init__.py +1 -0
  4. mlrun/common/schemas/model_monitoring/__init__.py +2 -0
  5. mlrun/common/schemas/model_monitoring/functions.py +2 -0
  6. mlrun/common/schemas/model_monitoring/model_endpoints.py +19 -1
  7. mlrun/common/schemas/serving.py +1 -0
  8. mlrun/common/schemas/workflow.py +8 -0
  9. mlrun/datastore/azure_blob.py +1 -1
  10. mlrun/datastore/base.py +4 -2
  11. mlrun/datastore/datastore.py +46 -14
  12. mlrun/datastore/google_cloud_storage.py +1 -1
  13. mlrun/datastore/s3.py +16 -5
  14. mlrun/datastore/sources.py +2 -2
  15. mlrun/datastore/targets.py +2 -2
  16. mlrun/db/__init__.py +0 -1
  17. mlrun/db/base.py +29 -0
  18. mlrun/db/httpdb.py +35 -0
  19. mlrun/db/nopdb.py +19 -0
  20. mlrun/execution.py +12 -0
  21. mlrun/frameworks/tf_keras/mlrun_interface.py +8 -19
  22. mlrun/frameworks/tf_keras/model_handler.py +21 -12
  23. mlrun/launcher/base.py +1 -0
  24. mlrun/launcher/client.py +1 -0
  25. mlrun/launcher/local.py +4 -0
  26. mlrun/model.py +15 -4
  27. mlrun/model_monitoring/applications/base.py +74 -56
  28. mlrun/model_monitoring/db/tsdb/base.py +52 -19
  29. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +179 -11
  30. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +26 -11
  31. mlrun/model_monitoring/helpers.py +48 -0
  32. mlrun/projects/__init__.py +1 -0
  33. mlrun/projects/pipelines.py +44 -1
  34. mlrun/projects/project.py +30 -0
  35. mlrun/runtimes/daskjob.py +2 -0
  36. mlrun/runtimes/kubejob.py +4 -0
  37. mlrun/runtimes/mpijob/abstract.py +2 -0
  38. mlrun/runtimes/mpijob/v1.py +2 -0
  39. mlrun/runtimes/nuclio/function.py +2 -0
  40. mlrun/runtimes/nuclio/serving.py +59 -0
  41. mlrun/runtimes/pod.py +3 -0
  42. mlrun/runtimes/remotesparkjob.py +2 -0
  43. mlrun/runtimes/sparkjob/spark3job.py +2 -0
  44. mlrun/serving/routers.py +17 -13
  45. mlrun/serving/server.py +97 -3
  46. mlrun/serving/states.py +146 -38
  47. mlrun/serving/system_steps.py +2 -1
  48. mlrun/serving/v2_serving.py +2 -2
  49. mlrun/utils/version/version.json +2 -2
  50. {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/METADATA +13 -7
  51. {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/RECORD +55 -57
  52. {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/licenses/LICENSE +1 -1
  53. mlrun/db/sql_types.py +0 -160
  54. mlrun/utils/db.py +0 -71
  55. {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/WHEEL +0 -0
  56. {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/entry_points.txt +0 -0
  57. {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/top_level.txt +0 -0
mlrun/artifacts/manager.py CHANGED
@@ -413,8 +413,8 @@ class ArtifactManager:
  self.artifact_db.del_artifact(
  key=item.db_key,
  project=item.project,
- tag=item.tag,
  tree=item.tree,
+ uid=item.uid,
  iter=item.iter,
  deletion_strategy=deletion_strategy,
  secrets=secrets,
mlrun/common/constants.py CHANGED
@@ -27,6 +27,10 @@ DASK_LABEL_PREFIX = "dask.org/"
  NUCLIO_LABEL_PREFIX = "nuclio.io/"
  RESERVED_TAG_NAME_LATEST = "latest"

+ JOB_TYPE_WORKFLOW_RUNNER = "workflow-runner"
+ JOB_TYPE_PROJECT_LOADER = "project-loader"
+ JOB_TYPE_RERUN_WORKFLOW_RUNNER = "rerun-workflow-runner"
+

  class MLRunInternalLabels:
  ### dask
@@ -76,6 +80,9 @@ class MLRunInternalLabels:
  kind = "kind"
  component = "component"
  mlrun_type = "mlrun__type"
+ rerun_of = "rerun-of"
+ original_workflow_id = "original-workflow-id"
+ workflow_id = "workflow-id"

  owner = "owner"
  v3io_user = "v3io_user"
@@ -101,3 +108,8 @@
  class DeployStatusTextKind(mlrun.common.types.StrEnum):
  logs = "logs"
  events = "events"
+
+
+ class WorkflowSubmitMode(mlrun.common.types.StrEnum):
+ direct = "direct" # call KFP retry API directly
+ rerun = "rerun" # launch a RerunRunner function
mlrun/common/schemas/__init__.py CHANGED
@@ -218,6 +218,7 @@ from .serving import ModelRunnerStepData, MonitoringData
  from .tag import Tag, TagObjects
  from .workflow import (
  GetWorkflowResponse,
+ RerunWorkflowRequest,
  WorkflowRequest,
  WorkflowResponse,
  WorkflowSpec,
mlrun/common/schemas/model_monitoring/__init__.py CHANGED
@@ -54,6 +54,8 @@ from .grafana import (
  GrafanaTable,
  )
  from .model_endpoints import (
+ ApplicationMetricRecord,
+ ApplicationResultRecord,
  Features,
  FeatureValues,
  ModelEndpoint,
mlrun/common/schemas/model_monitoring/functions.py CHANGED
@@ -34,6 +34,7 @@ class FunctionSummary(BaseModel):
  type: FunctionsType
  name: str
  application_class: str
+ project_name: str
  updated_time: datetime
  status: Optional[str] = None
  base_period: Optional[int] = None
@@ -59,6 +60,7 @@ class FunctionSummary(BaseModel):
  else func_dict["spec"]["graph"]["steps"]["PushToMonitoringWriter"]["after"][
  0
  ],
+ project_name=func_dict["metadata"]["project"],
  updated_time=func_dict["metadata"].get("updated"),
  status=func_dict["status"].get("state"),
  base_period=base_period,
mlrun/common/schemas/model_monitoring/model_endpoints.py CHANGED
@@ -14,7 +14,7 @@
  import abc
  import json
  from datetime import datetime
- from typing import Any, NamedTuple, Optional, TypeVar
+ from typing import Any, Literal, NamedTuple, Optional, TypeVar
  from uuid import UUID

  from pydantic import validator # use `validator` if you’re still on Pydantic v1
@@ -334,6 +334,24 @@ class ModelEndpointMonitoringMetricNoData(_ModelEndpointMonitoringMetricValuesBa
  data: bool = False


+ class ApplicationBaseRecord(BaseModel):
+ type: Literal["metric", "result"]
+ time: datetime
+ value: float
+
+
+ class ApplicationResultRecord(ApplicationBaseRecord):
+ kind: ResultKindApp
+ status: ResultStatusApp
+ result_name: str
+ type: Literal["result"] = "result"
+
+
+ class ApplicationMetricRecord(ApplicationBaseRecord):
+ metric_name: str
+ type: Literal["metric"] = "metric"
+
+
  def _mapping_attributes(
  model_class: type[Model],
  flattened_dictionary: dict,
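The new records form a small discriminated union keyed on the `type` literal, so metric and result rows can be told apart after serialization. A hedged sketch of building a metric record, assuming the re-export added to the model_monitoring __init__ hunk above (field values are made up):

from datetime import datetime, timezone
from mlrun.common.schemas.model_monitoring import ApplicationMetricRecord

rec = ApplicationMetricRecord(
    time=datetime.now(timezone.utc),
    value=0.93,
    metric_name="f1_score",  # hypothetical metric name
)
assert rec.type == "metric"  # fixed by the Literal default, distinguishes it from result records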
mlrun/common/schemas/serving.py CHANGED
@@ -26,6 +26,7 @@ class DeployResponse(BaseModel):

  class ModelRunnerStepData(StrEnum):
  MODELS = "models"
+ MODEL_TO_EXECUTION_MECHANISM = "execution_mechanism_by_model_name"
  MONITORING_DATA = "monitoring_data"

mlrun/common/schemas/workflow.py CHANGED
@@ -46,6 +46,14 @@ class WorkflowRequest(pydantic.v1.BaseModel):
  notifications: typing.Optional[list[Notification]] = None


+ class RerunWorkflowRequest(pydantic.v1.BaseModel):
+ run_name: typing.Optional[str] = None
+ run_id: typing.Optional[str] = None
+ original_workflow_id: typing.Optional[str] = None
+ notifications: typing.Optional[list[Notification]] = None
+ workflow_runner_node_selector: typing.Optional[dict[str, str]] = None
+
+
  class WorkflowResponse(pydantic.v1.BaseModel):
  project: str = None
  name: str = None
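Every field on the new request schema is optional, so a rerun payload can carry only what the caller knows. A hedged sketch of building one, assuming the schema is re-exported from mlrun.common.schemas as the __init__ hunk above suggests (all values are illustrative):

import mlrun.common.schemas as schemas

rerun_request = schemas.RerunWorkflowRequest(
    run_name="nightly-training",                                      # hypothetical workflow run name
    original_workflow_id="8f14e45f-ceea-4672-9d7a-0b3c1c1e2f10",      # hypothetical KFP run id
    workflow_runner_node_selector={"node-role.kubernetes.io/worker": "true"},
)
print(rerun_request.dict(exclude_none=True))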
mlrun/datastore/azure_blob.py CHANGED
@@ -224,7 +224,7 @@ class AzureBlobStore(DataStore):
  path = self._convert_key_to_remote_path(key=path)
  super().rm(path=path, recursive=recursive, maxdepth=maxdepth)

- def get_spark_options(self):
+ def get_spark_options(self, path=None):
  res = {}
  st = self.storage_options
  service = "blob"
mlrun/datastore/base.py CHANGED
@@ -48,7 +48,9 @@ class FileStats:
  class DataStore:
  using_bucket = False

- def __init__(self, parent, name, kind, endpoint="", secrets: Optional[dict] = None):
+ def __init__(
+ self, parent, name, kind, endpoint="", secrets: Optional[dict] = None, **kwargs
+ ):
  self._parent = parent
  self.kind = kind
  self.name = name
@@ -176,7 +178,7 @@ class DataStore:
  def upload(self, key, src_path):
  pass

- def get_spark_options(self):
+ def get_spark_options(self, path=None):
  return {}

  @staticmethod
mlrun/datastore/datastore.py CHANGED
@@ -11,6 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import warnings
  from typing import Optional
  from urllib.parse import urlparse

@@ -105,8 +106,7 @@ def schema_to_store(schema) -> DataStore.__subclasses__():
  from .alibaba_oss import OSSStore

  return OSSStore
- else:
- raise ValueError(f"unsupported store scheme ({schema})")
+ raise ValueError(f"unsupported store scheme ({schema})")


  def uri_to_ipython(link):
@@ -210,12 +210,20 @@ class StoreManager:
  artifact_url=artifact_url,
  )

- def get_or_create_store(
- self, url, secrets: Optional[dict] = None, project_name=""
+ def _get_or_create_remote_client(
+ self,
+ url,
+ secrets: Optional[dict] = None,
+ project_name="",
+ cache: Optional[dict] = None,
+ schema_to_class: callable = schema_to_store,
+ **kwargs,
  ) -> (DataStore, str, str):
+ # The cache can be an empty dictionary ({}), even if it is a _stores object
+ cache = cache if cache is not None else {}
  schema, endpoint, parsed_url = parse_url(url)
  subpath = parsed_url.path
- store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"
+ cache_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"

  if schema == "ds":
  datastore_profile = datastore_profile_read(url, project_name, secrets)
@@ -237,24 +245,48 @@ class StoreManager:
  subpath = url.replace("file://", "", 1)

  if not schema and endpoint:
- if endpoint in self._stores.keys():
- return self._stores[endpoint], subpath, url
+ if endpoint in cache.keys():
+ return cache[endpoint], subpath, url
  else:
  raise ValueError(f"no such store ({endpoint})")

  if not secrets and not mlrun.config.is_running_as_api():
- if store_key in self._stores.keys():
- return self._stores[store_key], subpath, url
+ if cache_key in cache.keys():
+ return cache[cache_key], subpath, url

  # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
  # when running on server we don't cache the datastore, because there are multiple users and we don't want to
  # cache the credentials, so for each new request we create a new store
- store = schema_to_store(schema)(
- self, schema, store_key, parsed_url.netloc, secrets=secrets
+ remote_client_class = schema_to_class(schema)
+ remote_client = None
+ if remote_client_class:
+ remote_client = remote_client_class(
+ self, schema, cache_key, parsed_url.netloc, secrets=secrets, **kwargs
+ )
+ if not secrets and not mlrun.config.is_running_as_api():
+ cache[cache_key] = remote_client
+ else:
+ warnings.warn("scheme not found. Returning None")
+ return remote_client, subpath, url
+
+ def get_or_create_store(
+ self,
+ url,
+ secrets: Optional[dict] = None,
+ project_name="",
+ ) -> (DataStore, str, str):
+ datastore, sub_path, url = self._get_or_create_remote_client(
+ url=url,
+ secrets=secrets,
+ project_name=project_name,
+ cache=self._stores,
+ schema_to_class=schema_to_store,
  )
- if not secrets and not mlrun.config.is_running_as_api():
- self._stores[store_key] = store
- return store, subpath, url
+ if not isinstance(datastore, DataStore):
+ raise mlrun.errors.MLRunInvalidArgumentError(
+ "remote client by url is not datastore"
+ )
+ return datastore, sub_path, url

  def reset_secrets(self):
  self._secrets = {}
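get_or_create_store keeps its public signature; it now delegates to the generic _get_or_create_remote_client with the manager's own cache and scheme resolver, then verifies the result is a DataStore. A hedged sketch of the unchanged public entry point (illustrative URL; real use needs S3 credentials configured):

import mlrun

# Resolve a datastore object plus the in-store subpath for a URL
store, subpath, url = mlrun.store_manager.get_or_create_store("s3://my-bucket/datasets/iris.csv")
print(type(store).__name__, subpath)  # e.g. S3Store /datasets/iris.csv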
mlrun/datastore/google_cloud_storage.py CHANGED
@@ -194,7 +194,7 @@ class GoogleCloudStorageStore(DataStore):
  self.filesystem.exists(path)
  super().rm(path, recursive=recursive, maxdepth=maxdepth)

- def get_spark_options(self):
+ def get_spark_options(self, path=None):
  res = {}
  st = self._get_credentials()
  if "token" in st:
mlrun/datastore/s3.py CHANGED
@@ -14,6 +14,7 @@

  import time
  from typing import Optional
+ from urllib.parse import urlparse

  import boto3
  from boto3.s3.transfer import TransferConfig
@@ -115,17 +116,27 @@ class S3Store(DataStore):
  byterange += str(offset + size - 1)
  return byterange

- def get_spark_options(self):
+ def get_spark_options(self, path=None):
  res = {}
+ bucket_str = ""
+ if path:
+ parsed = urlparse(path)
+ if parsed.scheme: # s3:// or s3a://
+ bucket = parsed.hostname
+ else:
+ # drop a leading slash, if any and take 1st segment
+ bucket = path.lstrip("/").split("/", 1)[0]
+ bucket_str = f".bucket.{bucket}"
+
  st = self.get_storage_options()
  if st.get("key"):
- res["spark.hadoop.fs.s3a.access.key"] = st.get("key")
+ res[f"spark.hadoop.fs.s3a{bucket_str}.access.key"] = st.get("key")
  if st.get("secret"):
- res["spark.hadoop.fs.s3a.secret.key"] = st.get("secret")
+ res[f"spark.hadoop.fs.s3a{bucket_str}.secret.key"] = st.get("secret")
  if st.get("endpoint_url"):
- res["spark.hadoop.fs.s3a.endpoint"] = st.get("endpoint_url")
+ res[f"spark.hadoop.fs.s3a{bucket_str}.endpoint"] = st.get("endpoint_url")
  if st.get("profile"):
- res["spark.hadoop.fs.s3a.aws.profile"] = st.get("profile")
+ res[f"spark.hadoop.fs.s3a{bucket_str}.aws.profile"] = st.get("profile")
  return res

  @property
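When a path is passed, S3Store now emits per-bucket Hadoop keys (spark.hadoop.fs.s3a.bucket.<bucket>.*) instead of global ones, so two S3 sources with different credentials can coexist in one Spark session. A rough illustration of the resulting options dict, with a hypothetical bucket and placeholder credentials:

# Shape of the options produced for path = "s3a://my-bucket/data/train.parquet"
expected = {
    "spark.hadoop.fs.s3a.bucket.my-bucket.access.key": "<access-key>",
    "spark.hadoop.fs.s3a.bucket.my-bucket.secret.key": "<secret-key>",
    "spark.hadoop.fs.s3a.bucket.my-bucket.endpoint": "https://s3.example.com",
}
# Without a path argument, bucket_str stays empty and the previous global keys
# ("spark.hadoop.fs.s3a.access.key", ...) are produced, preserving old behavior.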
mlrun/datastore/sources.py CHANGED
@@ -220,7 +220,7 @@ class CSVSource(BaseSourceDriver):

  def get_spark_options(self):
  store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
- spark_options = store.get_spark_options()
+ spark_options = store.get_spark_options(store.spark_url + path)
  spark_options.update(
  {
  "path": store.spark_url + path,
@@ -407,7 +407,7 @@ class ParquetSource(BaseSourceDriver):

  def get_spark_options(self):
  store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
- spark_options = store.get_spark_options()
+ spark_options = store.get_spark_options(store.spark_url + path)
  spark_options.update(
  {
  "path": store.spark_url + path,
mlrun/datastore/targets.py CHANGED
@@ -970,7 +970,7 @@ class ParquetTarget(BaseStoreTarget):
  break

  store, path, url = self._get_store_and_path()
- spark_options = store.get_spark_options()
+ spark_options = store.get_spark_options(store.spark_url + path)
  spark_options.update(
  {
  "path": store.spark_url + path,
@@ -1104,7 +1104,7 @@ class CSVTarget(BaseStoreTarget):

  def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
  store, path, url = self._get_store_and_path()
- spark_options = store.get_spark_options()
+ spark_options = store.get_spark_options(store.spark_url + path)
  spark_options.update(
  {
  "path": store.spark_url + path,
mlrun/db/__init__.py CHANGED
@@ -14,7 +14,6 @@
  from os import environ

  from ..config import config
- from . import sql_types
  from .base import RunDBError, RunDBInterface # noqa

mlrun/db/base.py CHANGED
@@ -638,6 +638,17 @@ class RunDBInterface(ABC):
  ):
  pass

+ @abstractmethod
+ def retry_pipeline(
+ self,
+ run_id: str,
+ project: str,
+ namespace: Optional[str] = None,
+ timeout: int = 30,
+ submit_mode: str = "",
+ ):
+ pass
+
  @abstractmethod
  def list_project_secrets(
  self,
@@ -1034,6 +1045,13 @@ class RunDBInterface(ABC):
  ):
  pass

+ def get_project_background_task(
+ self,
+ project: str,
+ name: str,
+ ) -> mlrun.common.schemas.BackgroundTask:
+ pass
+
  @abstractmethod
  def submit_workflow(
  self,
@@ -1113,6 +1131,17 @@ class RunDBInterface(ABC):
  ) -> list[mlrun.common.schemas.model_monitoring.FunctionSummary]:
  pass

+ @abstractmethod
+ def get_monitoring_function_summary(
+ self,
+ project: str,
+ function_name: str,
+ start: Optional[datetime.datetime] = None,
+ end: Optional[datetime.datetime] = None,
+ include_latest_metrics: bool = False,
+ ) -> mlrun.common.schemas.model_monitoring.FunctionSummary:
+ pass
+
  @abstractmethod
  def get_project_summary(self, project: str) -> mlrun.common.schemas.ProjectSummary:
  pass
mlrun/db/httpdb.py CHANGED
@@ -2350,6 +2350,7 @@ class HTTPRunDB(RunDBInterface):
  project: str,
  namespace: Optional[str] = None,
  timeout: int = 30,
+ submit_mode: str = "",
  ):
  """
  Retry a specific pipeline run using its run ID. This function sends an API request
@@ -2359,6 +2360,7 @@
  :param namespace: Kubernetes namespace where the pipeline is running. Optional.
  :param timeout: Timeout (in seconds) for the API call. Defaults to 30 seconds.
  :param project: Name of the MLRun project associated with the pipeline.
+ :param submit_mode: Whether to submit the pipeline directly to the API.

  :raises ValueError: Raised if the API response is not successful or contains an
  error.
@@ -2370,6 +2372,9 @@
  if namespace:
  params["namespace"] = namespace

+ if submit_mode:
+ params["submit-mode"] = submit_mode
+
  resp_text = ""
  resp_code = None
  try:
@@ -4188,6 +4193,36 @@
  results.append(FunctionSummary(**item))
  return results

+ def get_monitoring_function_summary(
+ self,
+ project: str,
+ function_name: str,
+ start: Optional[datetime] = None,
+ end: Optional[datetime] = None,
+ include_latest_metrics: bool = False,
+ ) -> FunctionSummary:
+ """
+ Get a monitoring function summary for the specified project and function.
+ :param project: The name of the project.
+ :param function_name: The name of the function.
+ :param start: Start time for filtering the results (optional).
+ :param end: End time for filtering the results (optional).
+ :param include_latest_metrics: Whether to include the latest metrics in the response (default is False).
+
+ :return: A FunctionSummary object containing information about the monitoring function.
+ """
+
+ response = self.api_call(
+ method=mlrun.common.types.HTTPMethod.GET,
+ path=f"projects/{project}/model-monitoring/function-summaries/{function_name}",
+ params={
+ "start": datetime_to_iso(start),
+ "end": datetime_to_iso(end),
+ "include-latest-metrics": include_latest_metrics,
+ },
+ )
+ return FunctionSummary(**response.json())
+
  def create_hub_source(
  self, source: Union[dict, mlrun.common.schemas.IndexedHubSource]
  ):
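Taken together with the WorkflowSubmitMode enum added to constants.py, the client can now ask the API either to retry a KFP run directly or to launch a rerun runner, and can fetch a single monitoring function summary instead of listing them all. A hedged usage sketch (project, run id, and function names are made up):

import mlrun
from mlrun.common.constants import WorkflowSubmitMode

db = mlrun.get_run_db()

# Retry a pipeline run; submit_mode is forwarded as the "submit-mode" query param
db.retry_pipeline(
    run_id="0f8b3c2d-example",        # hypothetical KFP run id
    project="fraud-detection",
    submit_mode=WorkflowSubmitMode.rerun.value,
)

# Fetch the summary of one monitoring app function by name
summary = db.get_monitoring_function_summary(
    project="fraud-detection",
    function_name="my-drift-app",     # hypothetical monitoring function name
    include_latest_metrics=True,
)
print(summary.project_name, summary.status)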
mlrun/db/nopdb.py CHANGED
@@ -524,6 +524,15 @@ class NopDB(RunDBInterface):
  ):
  pass

+ def retry_pipeline(
+ self,
+ run_id: str,
+ project: str,
+ namespace: Optional[str] = None,
+ timeout: int = 30,
+ ):
+ pass
+
  def list_pipelines(
  self,
  project: str,
@@ -893,6 +902,16 @@ class NopDB(RunDBInterface):
  ) -> [mlrun.common.schemas.model_monitoring.FunctionSummary]:
  pass

+ def get_monitoring_function_summary(
+ self,
+ project: str,
+ function_name: str,
+ start: Optional[datetime.datetime] = None,
+ end: Optional[datetime.datetime] = None,
+ include_latest_metrics: bool = False,
+ ) -> mlrun.common.schemas.model_monitoring.FunctionSummary:
+ pass
+
  def generate_event(
  self, name: str, event_data: Union[dict, mlrun.common.schemas.Event], project=""
  ):
mlrun/execution.py CHANGED
@@ -1286,6 +1286,18 @@ class MLClientCtx:
  self.to_dict(), self._uid, self.project, iter=self._iteration
  )

+ def update_run(self):
+ """
+ Store the run object in the DB - removes missing fields.
+ Use _update_run for coherent updates.
+ Should be called by the logging worker only (see is_logging_worker()).
+ """
+ self._write_tmpfile()
+ if self._rundb:
+ self._rundb.update_run(
+ self.to_dict(), self._uid, self.project, iter=self._iteration
+ )
+
  def is_logging_worker(self):
  """
  Check if the current worker is the logging worker.
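A hedged sketch of how a job handler might call the new context method; the docstring above restricts it to the logging worker, so the guard mirrors that (the handler itself is hypothetical):

from mlrun.execution import MLClientCtx


def handler(context: MLClientCtx):
    context.log_result("accuracy", 0.91)
    # Push the current run state to the DB from the logging worker only,
    # as the update_run docstring above recommends
    if context.is_logging_worker():
        context.update_run()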
mlrun/frameworks/tf_keras/mlrun_interface.py CHANGED
@@ -107,14 +107,10 @@ class TFKerasMLRunInterface(MLRunInterface, ABC):
  )

  # Call the pre compile method:
- (optimizer, experimental_run_tf_function) = self._pre_compile(
- optimizer=kwargs["optimizer"]
- )
+ optimizer = self._pre_compile(optimizer=kwargs["optimizer"])

  # Assign parameters:
  kwargs["optimizer"] = optimizer
- if experimental_run_tf_function is not None:
- kwargs["experimental_run_tf_function"] = experimental_run_tf_function

  # Call the original compile method:
  return self.original_compile(*args, **kwargs)
@@ -235,23 +231,20 @@ class TFKerasMLRunInterface(MLRunInterface, ABC):
  """
  self._RANK_0_ONLY_CALLBACKS.add(callback_name)

- def _pre_compile(self, optimizer: Optimizer) -> tuple[Optimizer, Union[bool, None]]:
+ def _pre_compile(self, optimizer: Optimizer) -> Optimizer:
  """
  Method to call before calling 'compile' to setup the run and inputs for using horovod.

  :param optimizer: The optimzier to compile. It will be wrapped in horovod's distributed optimizer:
  'hvd.DistributedOptimizer'.

- :return: The updated parameters:
- [0] = Wrapped optimizer.
- [1] = The 'experimental_run_tf_function' parameter for 'compile' kwargs or 'None' if horovod should not
- be used.
+ :return: The updated Wrapped optimizer.

  :raise MLRunInvalidArgumentError: In case the optimizer was passed as a string.
  """
  # Check if needed to run with horovod:
  if self._hvd is None:
- return optimizer, None
+ return optimizer

  # Validate the optimizer input:
  if isinstance(optimizer, str):
@@ -280,19 +273,15 @@ class TFKerasMLRunInterface(MLRunInterface, ABC):
  print(f"Horovod worker #{self._hvd.rank()} is using CPU")

  # Adjust learning rate based on the number of GPUs:
- if hasattr(self.optimizer, "lr"):
- optimizer.lr *= self._hvd.size()
+ if hasattr(optimizer, "lr"):
+ optimizer.lr = optimizer.lr * self._hvd.size()
  else:
- optimizer.learning_rate *= self._hvd.size()
+ optimizer.learning_rate = optimizer.learning_rate * self._hvd.size()

  # Wrap the optimizer in horovod's distributed optimizer: 'hvd.DistributedOptimizer'.
  optimizer = self._hvd.DistributedOptimizer(optimizer)

- # Compile the model with `experimental_run_tf_function=False` to ensure Tensorflow uses the distributed
- # optimizer to compute the gradients:
- experimental_run_tf_function = False
-
- return optimizer, experimental_run_tf_function
+ return optimizer

  def _pre_fit(
  self,
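With the experimental_run_tf_function kwarg gone, the horovod handling reduces to the usual pattern: scale the learning rate by the worker count, then wrap the optimizer. A standalone sketch of that pattern outside MLRun, assuming horovod.tensorflow.keras is installed (not the interface's exact code path):

import horovod.tensorflow.keras as hvd
import tensorflow as tf

hvd.init()

# Scale the base learning rate by the number of workers, then wrap the optimizer
# so gradients are averaged across workers before each update.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001 * hvd.size())
optimizer = hvd.DistributedOptimizer(optimizer)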
mlrun/frameworks/tf_keras/model_handler.py CHANGED
@@ -518,7 +518,6 @@ class TFKerasModelHandler(DLModelHandler):
  )

  # Read additional files according to the model format used:
- # # ModelFormats.SAVED_MODEL - Unzip the SavedModel archive:
  if self._model_format == TFKerasModelHandler.ModelFormats.SAVED_MODEL:
  # Unzip the SavedModel directory:
  with zipfile.ZipFile(self._model_file, "r") as zip_file:
@@ -528,21 +527,17 @@
  os.path.dirname(self._model_file), self._model_name
  )
  elif self._model_format == TFKerasModelHandler.ModelFormats.KERAS:
- # When keras tried to load it, it validates the suffix. The `artifacts.model.get_model` function is
- # downloading the keras file to a temp file with a `pkl` suffix, so it needs to be replaced:
- self._model_file = self._model_file.rsplit(".pkl", 1)[0] + ".keras"
+ # Rename the model file suffix:
+ self._rename_model_file_suffix(suffix="keras")
  elif self._model_format == TFKerasModelHandler.ModelFormats.H5:
- # When keras tried to load it, it validates the suffix. The `artifacts.model.get_model` function is
- # downloading the keras file to a temp file with a `pkl` suffix, so it needs to be replaced:
- self._model_file = self._model_file.rsplit(".pkl", 1)[0] + ".h5"
- # # ModelFormats.JSON_ARCHITECTURE_H5_WEIGHTS - Get the weights file:
- elif (
+ # Rename the model file suffix:
+ self._rename_model_file_suffix(suffix="h5")
+ elif ( # ModelFormats.JSON_ARCHITECTURE_H5_WEIGHTS
  self._model_format
  == TFKerasModelHandler.ModelFormats.JSON_ARCHITECTURE_H5_WEIGHTS
  ):
- # When keras tried to load it, it validates the suffix. The `artifacts.model.get_model` function is
- # downloading the keras file to a temp file with a `pkl` suffix, so it needs to be replaced:
- self._model_file = self._model_file.rsplit(".pkl", 1)[0] + ".json"
+ # Rename the model file suffix:
+ self._rename_model_file_suffix(suffix="json")
  # Get the weights file:
  self._weights_file = self._extra_data[
  self._get_weights_file_artifact_name()
@@ -551,6 +546,20 @@
  # Continue collecting from abstract class:
  super()._collect_files_from_store_object()

+ def _rename_model_file_suffix(self, suffix: str):
+ """
+ Rename the model file suffix to the given one.
+
+ This is used for the case of loading a model from a store object that was saved with a different suffix, as when
+ keras tries to load it, it validates the suffix. The `artifacts.model.get_model` function is downloading the
+ file to a temp file with a `pkl` suffix, so it needs to be replaced with the one keras expects.
+
+ :param suffix: The suffix to rename the model file to (without the trailing dot).
+ """
+ new_name = self._model_file.rsplit(".", 1)[0] + f".{suffix}"
+ os.rename(self._model_file, new_name)
+ self._model_file = new_name
+
  def _collect_files_from_local_path(self):
  """
  If the model path given is of a local path, search for the needed model files and collect them into this handler
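Unlike the old string rewrite, the new helper renames the file on disk with os.rename, so the path the handler keeps actually exists when keras opens it. A self-contained sketch of the same idea, not the handler's code (the example path is hypothetical):

import os


def rename_suffix(path: str, suffix: str) -> str:
    """Replace the file extension and rename the file on disk, returning the new path."""
    new_path = path.rsplit(".", 1)[0] + f".{suffix}"
    os.rename(path, new_path)
    return new_path


# e.g. a model downloaded to a temp file with a .pkl suffix:
# model_file = rename_suffix("/tmp/model_abc123.pkl", "h5")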
mlrun/launcher/base.py CHANGED
@@ -82,6 +82,7 @@ class BaseLauncher(abc.ABC):
  runtime: "mlrun.runtimes.base.BaseRuntime",
  project_name: Optional[str] = "",
  full: bool = True,
+ client_version: str = "",
  ):
  pass