mlrun 1.10.0rc9__py3-none-any.whl → 1.10.0rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/artifacts/manager.py +1 -1
- mlrun/common/constants.py +12 -0
- mlrun/common/schemas/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/__init__.py +2 -0
- mlrun/common/schemas/model_monitoring/functions.py +2 -0
- mlrun/common/schemas/model_monitoring/model_endpoints.py +19 -1
- mlrun/common/schemas/serving.py +1 -0
- mlrun/common/schemas/workflow.py +8 -0
- mlrun/datastore/azure_blob.py +1 -1
- mlrun/datastore/base.py +4 -2
- mlrun/datastore/datastore.py +46 -14
- mlrun/datastore/google_cloud_storage.py +1 -1
- mlrun/datastore/s3.py +16 -5
- mlrun/datastore/sources.py +2 -2
- mlrun/datastore/targets.py +2 -2
- mlrun/db/__init__.py +0 -1
- mlrun/db/base.py +29 -0
- mlrun/db/httpdb.py +35 -0
- mlrun/db/nopdb.py +19 -0
- mlrun/execution.py +12 -0
- mlrun/frameworks/tf_keras/mlrun_interface.py +8 -19
- mlrun/frameworks/tf_keras/model_handler.py +21 -12
- mlrun/launcher/base.py +1 -0
- mlrun/launcher/client.py +1 -0
- mlrun/launcher/local.py +4 -0
- mlrun/model.py +15 -4
- mlrun/model_monitoring/applications/base.py +74 -56
- mlrun/model_monitoring/db/tsdb/base.py +52 -19
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +179 -11
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +26 -11
- mlrun/model_monitoring/helpers.py +48 -0
- mlrun/projects/__init__.py +1 -0
- mlrun/projects/pipelines.py +44 -1
- mlrun/projects/project.py +30 -0
- mlrun/runtimes/daskjob.py +2 -0
- mlrun/runtimes/kubejob.py +4 -0
- mlrun/runtimes/mpijob/abstract.py +2 -0
- mlrun/runtimes/mpijob/v1.py +2 -0
- mlrun/runtimes/nuclio/function.py +2 -0
- mlrun/runtimes/nuclio/serving.py +59 -0
- mlrun/runtimes/pod.py +3 -0
- mlrun/runtimes/remotesparkjob.py +2 -0
- mlrun/runtimes/sparkjob/spark3job.py +2 -0
- mlrun/serving/routers.py +17 -13
- mlrun/serving/server.py +97 -3
- mlrun/serving/states.py +146 -38
- mlrun/serving/system_steps.py +2 -1
- mlrun/serving/v2_serving.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/METADATA +13 -7
- {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/RECORD +55 -57
- {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/licenses/LICENSE +1 -1
- mlrun/db/sql_types.py +0 -160
- mlrun/utils/db.py +0 -71
- {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc9.dist-info → mlrun-1.10.0rc11.dist-info}/top_level.txt +0 -0
mlrun/artifacts/manager.py
CHANGED
mlrun/common/constants.py
CHANGED
@@ -27,6 +27,10 @@ DASK_LABEL_PREFIX = "dask.org/"
 NUCLIO_LABEL_PREFIX = "nuclio.io/"
 RESERVED_TAG_NAME_LATEST = "latest"
 
+JOB_TYPE_WORKFLOW_RUNNER = "workflow-runner"
+JOB_TYPE_PROJECT_LOADER = "project-loader"
+JOB_TYPE_RERUN_WORKFLOW_RUNNER = "rerun-workflow-runner"
+
 
 class MLRunInternalLabels:
     ### dask
@@ -76,6 +80,9 @@ class MLRunInternalLabels:
     kind = "kind"
     component = "component"
    mlrun_type = "mlrun__type"
+    rerun_of = "rerun-of"
+    original_workflow_id = "original-workflow-id"
+    workflow_id = "workflow-id"
 
     owner = "owner"
     v3io_user = "v3io_user"
@@ -101,3 +108,8 @@ class MLRunInternalLabels:
 class DeployStatusTextKind(mlrun.common.types.StrEnum):
     logs = "logs"
     events = "events"
+
+
+class WorkflowSubmitMode(mlrun.common.types.StrEnum):
+    direct = "direct"  # call KFP retry API directly
+    rerun = "rerun"  # launch a RerunRunner function
mlrun/common/schemas/__init__.py
CHANGED
mlrun/common/schemas/model_monitoring/functions.py
CHANGED

@@ -34,6 +34,7 @@ class FunctionSummary(BaseModel)
     type: FunctionsType
     name: str
     application_class: str
+    project_name: str
     updated_time: datetime
     status: Optional[str] = None
     base_period: Optional[int] = None
@@ -59,6 +60,7 @@ class FunctionSummary(BaseModel)
             else func_dict["spec"]["graph"]["steps"]["PushToMonitoringWriter"]["after"][
                 0
             ],
+            project_name=func_dict["metadata"]["project"],
             updated_time=func_dict["metadata"].get("updated"),
             status=func_dict["status"].get("state"),
             base_period=base_period,

mlrun/common/schemas/model_monitoring/model_endpoints.py
CHANGED

@@ -14,7 +14,7 @@
 import abc
 import json
 from datetime import datetime
-from typing import Any, NamedTuple, Optional, TypeVar
+from typing import Any, Literal, NamedTuple, Optional, TypeVar
 from uuid import UUID
 
 from pydantic import validator
@@ -334,6 +334,24 @@ class ModelEndpointMonitoringMetricNoData(_ModelEndpointMonitoringMetricValuesBa
     data: bool = False
 
 
+class ApplicationBaseRecord(BaseModel):
+    type: Literal["metric", "result"]
+    time: datetime
+    value: float
+
+
+class ApplicationResultRecord(ApplicationBaseRecord):
+    kind: ResultKindApp
+    status: ResultStatusApp
+    result_name: str
+    type: Literal["result"] = "result"
+
+
+class ApplicationMetricRecord(ApplicationBaseRecord):
+    metric_name: str
+    type: Literal["metric"] = "metric"
+
+
 def _mapping_attributes(
     model_class: type[Model],
     flattened_dictionary: dict,
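A brief sketch of how the new record models might be constructed, assuming they are importable from the model_endpoints module shown above (the exact re-export path is not part of this diff, and the metric name and value are illustrative):

from datetime import datetime, timezone

from mlrun.common.schemas.model_monitoring.model_endpoints import ApplicationMetricRecord

record = ApplicationMetricRecord(
    metric_name="latency_avg",  # hypothetical metric name
    time=datetime.now(tz=timezone.utc),
    value=12.5,
)
# `type` is fixed by the Literal default, which lets consumers discriminate
# metric records from result records when both are stored in one list.
assert record.type == "metric"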
mlrun/common/schemas/serving.py
CHANGED
mlrun/common/schemas/workflow.py
CHANGED
@@ -46,6 +46,14 @@ class WorkflowRequest(pydantic.v1.BaseModel):
     notifications: typing.Optional[list[Notification]] = None
 
 
+class RerunWorkflowRequest(pydantic.v1.BaseModel):
+    run_name: typing.Optional[str] = None
+    run_id: typing.Optional[str] = None
+    original_workflow_id: typing.Optional[str] = None
+    notifications: typing.Optional[list[Notification]] = None
+    workflow_runner_node_selector: typing.Optional[dict[str, str]] = None
+
+
 class WorkflowResponse(pydantic.v1.BaseModel):
     project: str = None
     name: str = None
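A short sketch of building the new request model; the field names come straight from the diff, while the values are illustrative:

from mlrun.common.schemas.workflow import RerunWorkflowRequest

# All fields are optional; only what the rerun actually needs has to be set.
request = RerunWorkflowRequest(
    run_id="8f14e45f-kfp-run",              # hypothetical KFP run ID
    original_workflow_id="workflow-1234",   # hypothetical workflow ID
    workflow_runner_node_selector={"kubernetes.io/arch": "amd64"},
)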
mlrun/datastore/azure_blob.py
CHANGED
@@ -224,7 +224,7 @@ class AzureBlobStore(DataStore):
         path = self._convert_key_to_remote_path(key=path)
         super().rm(path=path, recursive=recursive, maxdepth=maxdepth)
 
-    def get_spark_options(self):
+    def get_spark_options(self, path=None):
         res = {}
         st = self.storage_options
         service = "blob"
mlrun/datastore/base.py
CHANGED
@@ -48,7 +48,9 @@ class FileStats:
 class DataStore:
     using_bucket = False
 
-    def __init__(self, parent, name, kind, endpoint="", secrets: Optional[dict] = None):
+    def __init__(
+        self, parent, name, kind, endpoint="", secrets: Optional[dict] = None, **kwargs
+    ):
         self._parent = parent
         self.kind = kind
         self.name = name
@@ -176,7 +178,7 @@ class DataStore:
     def upload(self, key, src_path):
         pass
 
-    def get_spark_options(self):
+    def get_spark_options(self, path=None):
         return {}
 
     @staticmethod
mlrun/datastore/datastore.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings
 from typing import Optional
 from urllib.parse import urlparse
 
@@ -105,8 +106,7 @@ def schema_to_store(schema) -> DataStore.__subclasses__():
         from .alibaba_oss import OSSStore
 
         return OSSStore
-
-    raise ValueError(f"unsupported store scheme ({schema})")
+    raise ValueError(f"unsupported store scheme ({schema})")
 
 
 def uri_to_ipython(link):
@@ -210,12 +210,20 @@ class StoreManager:
             artifact_url=artifact_url,
         )
 
-    def get_or_create_store(
-        self, url, secrets: Optional[dict] = None, project_name=""
+    def _get_or_create_remote_client(
+        self,
+        url,
+        secrets: Optional[dict] = None,
+        project_name="",
+        cache: Optional[dict] = None,
+        schema_to_class: callable = schema_to_store,
+        **kwargs,
     ) -> (DataStore, str, str):
+        # The cache can be an empty dictionary ({}), even if it is a _stores object
+        cache = cache if cache is not None else {}
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
-
+        cache_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"
 
         if schema == "ds":
             datastore_profile = datastore_profile_read(url, project_name, secrets)
@@ -237,24 +245,48 @@ class StoreManager:
             subpath = url.replace("file://", "", 1)
 
         if not schema and endpoint:
-            if endpoint in
-                return
+            if endpoint in cache.keys():
+                return cache[endpoint], subpath, url
             else:
                 raise ValueError(f"no such store ({endpoint})")
 
         if not secrets and not mlrun.config.is_running_as_api():
-            if
-                return
+            if cache_key in cache.keys():
+                return cache[cache_key], subpath, url
 
         # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
         # when running on server we don't cache the datastore, because there are multiple users and we don't want to
         # cache the credentials, so for each new request we create a new store
-
-
+        remote_client_class = schema_to_class(schema)
+        remote_client = None
+        if remote_client_class:
+            remote_client = remote_client_class(
+                self, schema, cache_key, parsed_url.netloc, secrets=secrets, **kwargs
+            )
+            if not secrets and not mlrun.config.is_running_as_api():
+                cache[cache_key] = remote_client
+        else:
+            warnings.warn("scheme not found. Returning None")
+        return remote_client, subpath, url
+
+    def get_or_create_store(
+        self,
+        url,
+        secrets: Optional[dict] = None,
+        project_name="",
+    ) -> (DataStore, str, str):
+        datastore, sub_path, url = self._get_or_create_remote_client(
+            url=url,
+            secrets=secrets,
+            project_name=project_name,
+            cache=self._stores,
+            schema_to_class=schema_to_store,
         )
-        if not
-
-
+        if not isinstance(datastore, DataStore):
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "remote client by url is not datastore"
+            )
+        return datastore, sub_path, url
 
     def reset_secrets(self):
         self._secrets = {}

mlrun/datastore/google_cloud_storage.py
CHANGED

@@ -194,7 +194,7 @@ class GoogleCloudStorageStore(DataStore):
         self.filesystem.exists(path)
         super().rm(path, recursive=recursive, maxdepth=maxdepth)
 
-    def get_spark_options(self):
+    def get_spark_options(self, path=None):
         res = {}
         st = self._get_credentials()
         if "token" in st:
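For reference, the public entry point keeps its original shape after the refactor; a minimal usage sketch with an illustrative URL:

import mlrun

# get_or_create_store still returns a (DataStore, subpath, url) tuple, which the
# CSVSource/ParquetSource code below relies on; a remote client that is not a
# DataStore now raises MLRunInvalidArgumentError instead of being returned.
store, subpath, url = mlrun.store_manager.get_or_create_store(
    "s3://my-bucket/data/set.parquet"  # hypothetical URL
)
spark_options = store.get_spark_options(store.spark_url + subpath)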
mlrun/datastore/s3.py
CHANGED
@@ -14,6 +14,7 @@
 
 import time
 from typing import Optional
+from urllib.parse import urlparse
 
 import boto3
 from boto3.s3.transfer import TransferConfig
@@ -115,17 +116,27 @@ class S3Store(DataStore):
         byterange += str(offset + size - 1)
         return byterange
 
-    def get_spark_options(self):
+    def get_spark_options(self, path=None):
         res = {}
+        bucket_str = ""
+        if path:
+            parsed = urlparse(path)
+            if parsed.scheme:  # s3:// or s3a://
+                bucket = parsed.hostname
+            else:
+                # drop a leading slash, if any and take 1st segment
+                bucket = path.lstrip("/").split("/", 1)[0]
+            bucket_str = f".bucket.{bucket}"
+
         st = self.get_storage_options()
         if st.get("key"):
-            res["spark.hadoop.fs.s3a.access.key"] = st.get("key")
+            res[f"spark.hadoop.fs.s3a{bucket_str}.access.key"] = st.get("key")
         if st.get("secret"):
-            res["spark.hadoop.fs.s3a.secret.key"] = st.get("secret")
+            res[f"spark.hadoop.fs.s3a{bucket_str}.secret.key"] = st.get("secret")
         if st.get("endpoint_url"):
-            res["spark.hadoop.fs.s3a.endpoint"] = st.get("endpoint_url")
+            res[f"spark.hadoop.fs.s3a{bucket_str}.endpoint"] = st.get("endpoint_url")
         if st.get("profile"):
-            res["spark.hadoop.fs.s3a.aws.profile"] = st.get("profile")
+            res[f"spark.hadoop.fs.s3a{bucket_str}.aws.profile"] = st.get("profile")
         return res
 
     @property
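To illustrate the effect of the new path argument: when a URL such as s3://my-bucket/data/set.parquet is passed, the Hadoop S3A options become bucket-scoped instead of global. A hedged sketch of the expected shape of the result (the actual values depend on the configured storage options):

# Assuming key/secret are present in the store's storage options,
# get_spark_options("s3://my-bucket/data/set.parquet") would return
# per-bucket keys such as:
expected = {
    "spark.hadoop.fs.s3a.bucket.my-bucket.access.key": "<access-key>",
    "spark.hadoop.fs.s3a.bucket.my-bucket.secret.key": "<secret-key>",
}
# Calling it with no path keeps the previous global form,
# e.g. "spark.hadoop.fs.s3a.access.key".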
mlrun/datastore/sources.py
CHANGED
@@ -220,7 +220,7 @@ class CSVSource(BaseSourceDriver):
 
     def get_spark_options(self):
         store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
-        spark_options = store.get_spark_options()
+        spark_options = store.get_spark_options(store.spark_url + path)
         spark_options.update(
             {
                 "path": store.spark_url + path,
@@ -407,7 +407,7 @@ class ParquetSource(BaseSourceDriver):
 
     def get_spark_options(self):
         store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
-        spark_options = store.get_spark_options()
+        spark_options = store.get_spark_options(store.spark_url + path)
         spark_options.update(
             {
                 "path": store.spark_url + path,
mlrun/datastore/targets.py
CHANGED
@@ -970,7 +970,7 @@ class ParquetTarget(BaseStoreTarget):
                 break
 
         store, path, url = self._get_store_and_path()
-        spark_options = store.get_spark_options()
+        spark_options = store.get_spark_options(store.spark_url + path)
         spark_options.update(
             {
                 "path": store.spark_url + path,
@@ -1104,7 +1104,7 @@ class CSVTarget(BaseStoreTarget):
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         store, path, url = self._get_store_and_path()
-        spark_options = store.get_spark_options()
+        spark_options = store.get_spark_options(store.spark_url + path)
         spark_options.update(
             {
                 "path": store.spark_url + path,
mlrun/db/__init__.py
CHANGED
mlrun/db/base.py
CHANGED
@@ -638,6 +638,17 @@ class RunDBInterface(ABC):
     ):
         pass
 
+    @abstractmethod
+    def retry_pipeline(
+        self,
+        run_id: str,
+        project: str,
+        namespace: Optional[str] = None,
+        timeout: int = 30,
+        submit_mode: str = "",
+    ):
+        pass
+
     @abstractmethod
     def list_project_secrets(
         self,
@@ -1034,6 +1045,13 @@
     ):
         pass
 
+    def get_project_background_task(
+        self,
+        project: str,
+        name: str,
+    ) -> mlrun.common.schemas.BackgroundTask:
+        pass
+
     @abstractmethod
     def submit_workflow(
         self,
@@ -1113,6 +1131,17 @@
     ) -> list[mlrun.common.schemas.model_monitoring.FunctionSummary]:
         pass
 
+    @abstractmethod
+    def get_monitoring_function_summary(
+        self,
+        project: str,
+        function_name: str,
+        start: Optional[datetime.datetime] = None,
+        end: Optional[datetime.datetime] = None,
+        include_latest_metrics: bool = False,
+    ) -> mlrun.common.schemas.model_monitoring.FunctionSummary:
+        pass
+
     @abstractmethod
     def get_project_summary(self, project: str) -> mlrun.common.schemas.ProjectSummary:
         pass
mlrun/db/httpdb.py
CHANGED
@@ -2350,6 +2350,7 @@ class HTTPRunDB(RunDBInterface):
         project: str,
         namespace: Optional[str] = None,
         timeout: int = 30,
+        submit_mode: str = "",
     ):
         """
         Retry a specific pipeline run using its run ID. This function sends an API request
@@ -2359,6 +2360,7 @@
         :param namespace: Kubernetes namespace where the pipeline is running. Optional.
         :param timeout: Timeout (in seconds) for the API call. Defaults to 30 seconds.
         :param project: Name of the MLRun project associated with the pipeline.
+        :param submit_mode: Whether to submit the pipeline directly to the API.
 
         :raises ValueError: Raised if the API response is not successful or contains an
             error.
@@ -2370,6 +2372,9 @@
         if namespace:
             params["namespace"] = namespace
 
+        if submit_mode:
+            params["submit-mode"] = submit_mode
+
         resp_text = ""
         resp_code = None
         try:
@@ -4188,6 +4193,36 @@
             results.append(FunctionSummary(**item))
         return results
 
+    def get_monitoring_function_summary(
+        self,
+        project: str,
+        function_name: str,
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        include_latest_metrics: bool = False,
+    ) -> FunctionSummary:
+        """
+        Get a monitoring function summary for the specified project and function.
+        :param project: The name of the project.
+        :param function_name: The name of the function.
+        :param start: Start time for filtering the results (optional).
+        :param end: End time for filtering the results (optional).
+        :param include_latest_metrics: Whether to include the latest metrics in the response (default is False).
+
+        :return: A FunctionSummary object containing information about the monitoring function.
+        """
+
+        response = self.api_call(
+            method=mlrun.common.types.HTTPMethod.GET,
+            path=f"projects/{project}/model-monitoring/function-summaries/{function_name}",
+            params={
+                "start": datetime_to_iso(start),
+                "end": datetime_to_iso(end),
+                "include-latest-metrics": include_latest_metrics,
+            },
+        )
+        return FunctionSummary(**response.json())
+
     def create_hub_source(
         self, source: Union[dict, mlrun.common.schemas.IndexedHubSource]
     ):
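Taken together with the new WorkflowSubmitMode constant, a brief sketch of how a client might call the extended client APIs; the project, run, and function names are illustrative:

import mlrun
from mlrun.common.constants import WorkflowSubmitMode

db = mlrun.get_run_db()

# Retry a failed KFP run, asking the backend to go through the rerun runner
# rather than the direct KFP retry API.
db.retry_pipeline(
    run_id="8a1b2c3d",                 # hypothetical pipeline run ID
    project="my-project",              # hypothetical project name
    submit_mode=WorkflowSubmitMode.rerun,
)

# Fetch the summary of a single monitoring function added in this release.
summary = db.get_monitoring_function_summary(
    project="my-project",
    function_name="histogram-data-drift",  # hypothetical application name
    include_latest_metrics=True,
)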
mlrun/db/nopdb.py
CHANGED
@@ -524,6 +524,15 @@ class NopDB(RunDBInterface):
     ):
         pass
 
+    def retry_pipeline(
+        self,
+        run_id: str,
+        project: str,
+        namespace: Optional[str] = None,
+        timeout: int = 30,
+    ):
+        pass
+
     def list_pipelines(
         self,
         project: str,
@@ -893,6 +902,16 @@
     ) -> [mlrun.common.schemas.model_monitoring.FunctionSummary]:
         pass
 
+    def get_monitoring_function_summary(
+        self,
+        project: str,
+        function_name: str,
+        start: Optional[datetime.datetime] = None,
+        end: Optional[datetime.datetime] = None,
+        include_latest_metrics: bool = False,
+    ) -> mlrun.common.schemas.model_monitoring.FunctionSummary:
+        pass
+
     def generate_event(
         self, name: str, event_data: Union[dict, mlrun.common.schemas.Event], project=""
     ):
mlrun/execution.py
CHANGED
@@ -1286,6 +1286,18 @@ class MLClientCtx:
             self.to_dict(), self._uid, self.project, iter=self._iteration
         )
 
+    def update_run(self):
+        """
+        Store the run object in the DB - removes missing fields.
+        Use _update_run for coherent updates.
+        Should be called by the logging worker only (see is_logging_worker()).
+        """
+        self._write_tmpfile()
+        if self._rundb:
+            self._rundb.update_run(
+                self.to_dict(), self._uid, self.project, iter=self._iteration
+            )
+
     def is_logging_worker(self):
         """
         Check if the current worker is the logging worker.
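A minimal sketch of the new MLClientCtx.update_run helper inside a job handler; per its docstring it should only be called from the logging worker, and the logged result here is illustrative:

def handler(context):
    context.log_result("accuracy", 0.93)  # hypothetical result
    if context.is_logging_worker():
        # Push the in-memory run object to the DB (removes missing fields).
        context.update_run()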
mlrun/frameworks/tf_keras/mlrun_interface.py
CHANGED

@@ -107,14 +107,10 @@ class TFKerasMLRunInterface(MLRunInterface, ABC):
         )
 
         # Call the pre compile method:
-
-            optimizer=kwargs["optimizer"]
-        )
+        optimizer = self._pre_compile(optimizer=kwargs["optimizer"])
 
         # Assign parameters:
         kwargs["optimizer"] = optimizer
-        if experimental_run_tf_function is not None:
-            kwargs["experimental_run_tf_function"] = experimental_run_tf_function
 
         # Call the original compile method:
         return self.original_compile(*args, **kwargs)
@@ -235,23 +231,20 @@ class TFKerasMLRunInterface(MLRunInterface, ABC):
         """
         self._RANK_0_ONLY_CALLBACKS.add(callback_name)
 
-    def _pre_compile(self, optimizer: Optimizer) ->
+    def _pre_compile(self, optimizer: Optimizer) -> Optimizer:
         """
         Method to call before calling 'compile' to setup the run and inputs for using horovod.
 
         :param optimizer: The optimzier to compile. It will be wrapped in horovod's distributed optimizer:
                           'hvd.DistributedOptimizer'.
 
-        :return: The updated
-                 [0] = Wrapped optimizer.
-                 [1] = The 'experimental_run_tf_function' parameter for 'compile' kwargs or 'None' if horovod should not
-                       be used.
+        :return: The updated Wrapped optimizer.
 
         :raise MLRunInvalidArgumentError: In case the optimizer was passed as a string.
         """
         # Check if needed to run with horovod:
         if self._hvd is None:
-            return optimizer
+            return optimizer
 
         # Validate the optimizer input:
         if isinstance(optimizer, str):
@@ -280,19 +273,15 @@ class TFKerasMLRunInterface(MLRunInterface, ABC):
             print(f"Horovod worker #{self._hvd.rank()} is using CPU")
 
         # Adjust learning rate based on the number of GPUs:
-        if hasattr(
-            optimizer.lr
+        if hasattr(optimizer, "lr"):
+            optimizer.lr = optimizer.lr * self._hvd.size()
         else:
-            optimizer.learning_rate
+            optimizer.learning_rate = optimizer.learning_rate * self._hvd.size()
 
         # Wrap the optimizer in horovod's distributed optimizer: 'hvd.DistributedOptimizer'.
         optimizer = self._hvd.DistributedOptimizer(optimizer)
 
-
-        # optimizer to compute the gradients:
-        experimental_run_tf_function = False
-
-        return optimizer, experimental_run_tf_function
+        return optimizer
 
     def _pre_fit(
         self,
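A standalone sketch of the learning-rate scaling branch above: Keras optimizers expose either an lr or a learning_rate attribute, so the attribute is probed before scaling by the Horovod world size. The function name and parameters here are illustrative, not part of the mlrun API:

def scale_learning_rate(optimizer, world_size: int):
    # Mirror the diff: prefer `lr` when present, otherwise fall back to `learning_rate`.
    if hasattr(optimizer, "lr"):
        optimizer.lr = optimizer.lr * world_size
    else:
        optimizer.learning_rate = optimizer.learning_rate * world_size
    return optimizer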
mlrun/frameworks/tf_keras/model_handler.py
CHANGED

@@ -518,7 +518,6 @@ class TFKerasModelHandler(DLModelHandler):
         )
 
         # Read additional files according to the model format used:
-        # # ModelFormats.SAVED_MODEL - Unzip the SavedModel archive:
         if self._model_format == TFKerasModelHandler.ModelFormats.SAVED_MODEL:
             # Unzip the SavedModel directory:
             with zipfile.ZipFile(self._model_file, "r") as zip_file:
@@ -528,21 +527,17 @@
                 os.path.dirname(self._model_file), self._model_name
             )
         elif self._model_format == TFKerasModelHandler.ModelFormats.KERAS:
-            #
-
-            self._model_file = self._model_file.rsplit(".pkl", 1)[0] + ".keras"
+            # Rename the model file suffix:
+            self._rename_model_file_suffix(suffix="keras")
         elif self._model_format == TFKerasModelHandler.ModelFormats.H5:
-            #
-
-
-        # # ModelFormats.JSON_ARCHITECTURE_H5_WEIGHTS - Get the weights file:
-        elif (
+            # Rename the model file suffix:
+            self._rename_model_file_suffix(suffix="h5")
+        elif (  # ModelFormats.JSON_ARCHITECTURE_H5_WEIGHTS
             self._model_format
             == TFKerasModelHandler.ModelFormats.JSON_ARCHITECTURE_H5_WEIGHTS
         ):
-            #
-
-            self._model_file = self._model_file.rsplit(".pkl", 1)[0] + ".json"
+            # Rename the model file suffix:
+            self._rename_model_file_suffix(suffix="json")
             # Get the weights file:
             self._weights_file = self._extra_data[
                 self._get_weights_file_artifact_name()
@@ -551,6 +546,20 @@
         # Continue collecting from abstract class:
         super()._collect_files_from_store_object()
 
+    def _rename_model_file_suffix(self, suffix: str):
+        """
+        Rename the model file suffix to the given one.
+
+        This is used for the case of loading a model from a store object that was saved with a different suffix than
+        the one keras expects: when keras tries to load it, it validates the suffix. The `artifacts.model.get_model`
+        function is downloading the file to a temp file with a `pkl` suffix, so it needs to be replaced.
+
+        :param suffix: The suffix to rename the model file to (without the trailing dot).
+        """
+        new_name = self._model_file.rsplit(".", 1)[0] + f".{suffix}"
+        os.rename(self._model_file, new_name)
+        self._model_file = new_name
+
     def _collect_files_from_local_path(self):
         """
         If the model path given is of a local path, search for the needed model files and collect them into this handler
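A standalone sketch of the suffix-rename helper added above: get_model() downloads the model to a temp file with a ".pkl" suffix, and Keras validates suffixes on load, so the file is renamed first. The function name and paths here are illustrative, not the handler's private API:

import os

def rename_model_file_suffix(model_file: str, suffix: str) -> str:
    # Replace the last suffix (e.g. ".pkl") with the one Keras expects, e.g. ".keras".
    new_name = model_file.rsplit(".", 1)[0] + f".{suffix}"
    os.rename(model_file, new_name)
    return new_name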
|