mlrun 1.5.0rc1__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__init__.py +2 -35
- mlrun/__main__.py +1 -40
- mlrun/api/api/api.py +6 -0
- mlrun/api/api/endpoints/feature_store.py +0 -4
- mlrun/api/api/endpoints/files.py +14 -2
- mlrun/api/api/endpoints/functions.py +6 -1
- mlrun/api/api/endpoints/logs.py +17 -3
- mlrun/api/api/endpoints/pipelines.py +1 -5
- mlrun/api/api/endpoints/projects.py +88 -0
- mlrun/api/api/endpoints/runs.py +48 -6
- mlrun/api/api/endpoints/workflows.py +355 -0
- mlrun/api/api/utils.py +1 -1
- mlrun/api/crud/__init__.py +1 -0
- mlrun/api/crud/client_spec.py +3 -0
- mlrun/api/crud/model_monitoring/deployment.py +36 -7
- mlrun/api/crud/model_monitoring/grafana.py +1 -1
- mlrun/api/crud/model_monitoring/helpers.py +32 -2
- mlrun/api/crud/model_monitoring/model_endpoints.py +27 -5
- mlrun/api/crud/notifications.py +9 -4
- mlrun/api/crud/pipelines.py +4 -9
- mlrun/api/crud/runtime_resources.py +4 -3
- mlrun/api/crud/secrets.py +21 -0
- mlrun/api/crud/workflows.py +352 -0
- mlrun/api/db/base.py +16 -1
- mlrun/api/db/sqldb/db.py +97 -16
- mlrun/api/launcher.py +26 -7
- mlrun/api/main.py +3 -4
- mlrun/{mlutils → api/rundb}/__init__.py +2 -6
- mlrun/{db → api/rundb}/sqldb.py +35 -83
- mlrun/api/runtime_handlers/__init__.py +56 -0
- mlrun/api/runtime_handlers/base.py +1247 -0
- mlrun/api/runtime_handlers/daskjob.py +209 -0
- mlrun/api/runtime_handlers/kubejob.py +37 -0
- mlrun/api/runtime_handlers/mpijob.py +147 -0
- mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
- mlrun/api/runtime_handlers/sparkjob.py +148 -0
- mlrun/api/utils/builder.py +1 -4
- mlrun/api/utils/clients/chief.py +14 -0
- mlrun/api/utils/scheduler.py +98 -15
- mlrun/api/utils/singletons/db.py +4 -0
- mlrun/artifacts/manager.py +1 -2
- mlrun/common/schemas/__init__.py +6 -0
- mlrun/common/schemas/auth.py +4 -1
- mlrun/common/schemas/client_spec.py +1 -1
- mlrun/common/schemas/model_monitoring/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/constants.py +11 -0
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/schemas/runs.py +1 -8
- mlrun/common/schemas/schedule.py +1 -8
- mlrun/common/schemas/workflow.py +54 -0
- mlrun/config.py +42 -40
- mlrun/datastore/sources.py +1 -1
- mlrun/db/__init__.py +4 -68
- mlrun/db/base.py +12 -0
- mlrun/db/factory.py +65 -0
- mlrun/db/httpdb.py +175 -19
- mlrun/db/nopdb.py +4 -2
- mlrun/execution.py +4 -2
- mlrun/feature_store/__init__.py +1 -0
- mlrun/feature_store/api.py +1 -2
- mlrun/feature_store/feature_set.py +0 -10
- mlrun/feature_store/feature_vector.py +340 -2
- mlrun/feature_store/ingestion.py +5 -10
- mlrun/feature_store/retrieval/base.py +118 -104
- mlrun/feature_store/retrieval/dask_merger.py +17 -10
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/local_merger.py +18 -18
- mlrun/feature_store/retrieval/spark_merger.py +21 -14
- mlrun/feature_store/retrieval/storey_merger.py +21 -15
- mlrun/kfpops.py +3 -9
- mlrun/launcher/base.py +3 -3
- mlrun/launcher/client.py +3 -2
- mlrun/launcher/factory.py +16 -13
- mlrun/lists.py +0 -11
- mlrun/model.py +9 -15
- mlrun/model_monitoring/helpers.py +15 -25
- mlrun/model_monitoring/model_monitoring_batch.py +72 -4
- mlrun/model_monitoring/prometheus.py +219 -0
- mlrun/model_monitoring/stores/__init__.py +15 -9
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +3 -1
- mlrun/model_monitoring/stream_processing.py +181 -29
- mlrun/package/packager.py +6 -8
- mlrun/package/packagers/default_packager.py +121 -10
- mlrun/platforms/__init__.py +0 -2
- mlrun/platforms/iguazio.py +0 -56
- mlrun/projects/pipelines.py +57 -158
- mlrun/projects/project.py +6 -32
- mlrun/render.py +1 -1
- mlrun/run.py +2 -124
- mlrun/runtimes/__init__.py +6 -42
- mlrun/runtimes/base.py +26 -1241
- mlrun/runtimes/daskjob.py +2 -198
- mlrun/runtimes/function.py +16 -5
- mlrun/runtimes/kubejob.py +5 -29
- mlrun/runtimes/mpijob/__init__.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +10 -1
- mlrun/runtimes/mpijob/v1.py +0 -76
- mlrun/runtimes/mpijob/v1alpha1.py +1 -74
- mlrun/runtimes/nuclio.py +3 -2
- mlrun/runtimes/pod.py +0 -10
- mlrun/runtimes/remotesparkjob.py +1 -15
- mlrun/runtimes/serving.py +1 -1
- mlrun/runtimes/sparkjob/__init__.py +0 -1
- mlrun/runtimes/sparkjob/abstract.py +4 -131
- mlrun/serving/states.py +1 -1
- mlrun/utils/db.py +0 -2
- mlrun/utils/helpers.py +19 -13
- mlrun/utils/notifications/notification_pusher.py +5 -25
- mlrun/utils/regex.py +7 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +24 -23
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +116 -107
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
- mlrun/mlutils/data.py +0 -160
- mlrun/mlutils/models.py +0 -78
- mlrun/mlutils/plots.py +0 -902
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/api/api/endpoints/workflows.py
ADDED
@@ -0,0 +1,355 @@
+# Copyright 2018 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import collections.abc
+import copy
+import traceback
+import typing
+from http import HTTPStatus
+from typing import Dict
+
+import fastapi
+from fastapi.concurrency import run_in_threadpool
+from sqlalchemy.orm import Session
+
+import mlrun
+import mlrun.api.api.deps
+import mlrun.api.api.utils
+import mlrun.api.crud
+import mlrun.api.utils.auth.verifier
+import mlrun.api.utils.clients.chief
+import mlrun.api.utils.singletons.db
+import mlrun.api.utils.singletons.project_member
+import mlrun.common.schemas
+import mlrun.projects.pipelines
+from mlrun.api.api.utils import log_and_raise
+from mlrun.utils.helpers import logger
+
+router = fastapi.APIRouter()
+
+
+@router.post(
+    "/projects/{project}/workflows/{name}/submit",
+    status_code=HTTPStatus.ACCEPTED.value,
+    response_model=mlrun.common.schemas.WorkflowResponse,
+)
+async def submit_workflow(
+    project: str,
+    name: str,
+    request: fastapi.Request,
+    workflow_request: mlrun.common.schemas.WorkflowRequest = mlrun.common.schemas.WorkflowRequest(),
+    auth_info: mlrun.common.schemas.AuthInfo = fastapi.Depends(
+        mlrun.api.api.deps.authenticate_request
+    ),
+    db_session: Session = fastapi.Depends(mlrun.api.api.deps.get_db_session),
+):
+    """
+    Submit a workflow of an existing project.
+    To support workflow scheduling, we use an auxiliary function called 'load_and_run'.
+    This function runs remotely (in a distinct pod), loads the project and then runs the workflow.
+    In this way we can run the workflow remotely with the workflow's engine, or
+    schedule this function, which loads the project and runs the workflow on every invocation.
+    Notice:
+    when simply running a workflow, the returned run_id value is the id of the run of the auxiliary function.
+    To get the id and status of the workflow, use the `get_workflow_id` endpoint with the returned run id.
+
+    :param project:          name of the project
+    :param name:             name of the workflow
+    :param request:          fastapi request, to support rerouting to the chief if needed
+    :param workflow_request: the request includes: the workflow spec, arguments for the workflow, an artifact path
+                             to use as the workflow's artifact target path, a source url of the project to override
+                             the existing one, a run name to override the default ('workflow-runner-<workflow name>'),
+                             and a kubernetes namespace if other than the default
+    :param auth_info:        auth info of the request
+    :param db_session:       session that manages the current dialog with the database
+
+    :returns: response that contains the project name, workflow name,
+              status, run id (in case of a single run) and schedule (in case of scheduling)
+    """
+    project = await run_in_threadpool(
+        mlrun.api.utils.singletons.project_member.get_project_member().get_project,
+        db_session=db_session,
+        name=project,
+        leader_session=auth_info.session,
+    )
+
+    # Check permission CREATE run
+    await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+        resource_type=mlrun.common.schemas.AuthorizationResourceTypes.run,
+        project_name=project.metadata.name,
+        resource_name=workflow_request.run_name or "",
+        action=mlrun.common.schemas.AuthorizationAction.create,
+        auth_info=auth_info,
+    )
+    # Check permission READ workflow on the project's workflow
+    await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+        resource_type=mlrun.common.schemas.AuthorizationResourceTypes.workflow,
+        project_name=project.metadata.name,
+        resource_name=name,
+        action=mlrun.common.schemas.AuthorizationAction.read,
+        auth_info=auth_info,
+    )
+    # Check permission CREATE workflow on the new workflow's name
+    await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+        resource_type=mlrun.common.schemas.AuthorizationResourceTypes.workflow,
+        project_name=project.metadata.name,
+        # If no workflow spec was passed, we need to create on the same name:
+        resource_name=getattr(workflow_request.spec, "name", name),
+        action=mlrun.common.schemas.AuthorizationAction.create,
+        auth_info=auth_info,
+    )
+    # Re-route to the chief in case of a schedule
+    if (
+        _is_requested_schedule(name, workflow_request.spec, project)
+        and mlrun.mlconf.httpdb.clusterization.role
+        != mlrun.common.schemas.ClusterizationRole.chief
+    ):
+        chief_client = mlrun.api.utils.clients.chief.Client()
+        return await chief_client.submit_workflow(
+            project=project.metadata.name,
+            name=name,
+            request=request,
+            json=workflow_request.dict(),
+        )
+
+    workflow_spec = _fill_workflow_missing_fields_from_project(
+        project=project,
+        workflow_name=name,
+        spec=workflow_request.spec,
+        arguments=workflow_request.arguments,
+    )
+    updated_request = workflow_request.copy()
+    updated_request.spec = workflow_spec
+
+    # This function loads the project and runs the workflow remotely.
+    # In this way we can schedule workflows (by scheduling a job that runs the workflow)
+    workflow_runner = await run_in_threadpool(
+        mlrun.api.crud.WorkflowRunners().create_runner,
+        run_name=updated_request.run_name
+        or mlrun.mlconf.workflows.default_workflow_runner_name.format(
+            workflow_spec.name
+        ),
+        project=project.metadata.name,
+        db_session=db_session,
+        auth_info=auth_info,
+        image=workflow_spec.image
+        or project.spec.default_image
+        or mlrun.mlconf.default_base_image,
+    )
+
+    logger.debug(
+        "Saved function for running workflow",
+        project_name=workflow_runner.metadata.project,
+        function_name=workflow_runner.metadata.name,
+        workflow_name=workflow_spec.name,
+        arguments=workflow_spec.args,
+        source=updated_request.source or project.spec.source,
+        kind=workflow_runner.kind,
+        image=workflow_runner.spec.image,
+    )
+
+    run_uid = None
+    status = None
+    workflow_action = "schedule" if workflow_spec.schedule else "run"
+    try:
+        if workflow_spec.schedule:
+            await run_in_threadpool(
+                mlrun.api.crud.WorkflowRunners().schedule,
+                runner=workflow_runner,
+                project=project,
+                workflow_request=updated_request,
+                db_session=db_session,
+                auth_info=auth_info,
+            )
+            status = "scheduled"
+
+        else:
+            run = await run_in_threadpool(
+                mlrun.api.crud.WorkflowRunners().run,
+                runner=workflow_runner,
+                project=project,
+                workflow_request=updated_request,
+            )
+            status = mlrun.run.RunStatuses.running
+            run_uid = run.uid()
+    except Exception as error:
+        logger.error(traceback.format_exc())
+        log_and_raise(
+            reason="Workflow failed",
+            workflow_name=workflow_spec.name,
+            workflow_action=workflow_action,
+            error=mlrun.errors.err_to_str(error),
+        )
+
+    return mlrun.common.schemas.WorkflowResponse(
+        project=project.metadata.name,
+        name=workflow_spec.name,
+        status=status,
+        run_id=run_uid,
+        schedule=workflow_spec.schedule,
+    )
+
+
+def _is_requested_schedule(
+    name: str,
+    workflow_spec: mlrun.common.schemas.WorkflowSpec,
+    project: mlrun.common.schemas.Project,
+) -> bool:
+    """
+    Check whether the workflow needs to be scheduled. This is the case either when the request itself
+    contains schedule information or when the workflow predefined in the project contains a schedule.
+
+    :param name:          workflow name
+    :param workflow_spec: workflow spec input
+    :param project:       MLRun project that contains the workflow
+
+    :return: True if the workflow needs to be scheduled, False otherwise.
+    """
+    if workflow_spec:
+        return workflow_spec.schedule is not None
+
+    project_workflow = _get_workflow_by_name(project, name)
+    return bool(project_workflow.get("schedule"))
+
+
+def _get_workflow_by_name(
+    project: mlrun.common.schemas.Project, name: str
+) -> typing.Optional[Dict]:
+    """
+    Get a workflow from a project.
+
+    :param project: MLRun project
+    :param name:    workflow name
+
+    :return: the workflow as a dict if the project has the workflow, otherwise raises a bad request exception
+    """
+    for workflow in project.spec.workflows:
+        if workflow["name"] == name:
+            return workflow
+    log_and_raise(
+        reason=f"workflow {name} not found in project",
+    )
+
+
+def _fill_workflow_missing_fields_from_project(
+    project: mlrun.common.schemas.Project,
+    workflow_name: str,
+    spec: mlrun.common.schemas.WorkflowSpec,
+    arguments: typing.Dict,
+) -> mlrun.common.schemas.WorkflowSpec:
+    """
+    Fill in the workflow spec details from the project object, with preference for the given spec.
+
+    :param project:       MLRun project that contains the workflow.
+    :param workflow_name: workflow name
+    :param spec:          workflow spec input
+    :param arguments:     arguments to the workflow
+
+    :return: completed workflow spec
+    """
+    # Verify the workflow exists in the project:
+    workflow = _get_workflow_by_name(project, workflow_name)
+
+    if spec:
+        # Merge the workflow spec provided in the request with the existing
+        # workflow, while the provided workflow takes precedence over the existing workflow params
+        workflow = copy.deepcopy(workflow)
+        workflow = _update_dict(workflow, spec.dict())
+
+    workflow_spec = mlrun.common.schemas.WorkflowSpec(**workflow)
+    # Override the arguments of the existing workflow:
+    if arguments:
+        workflow_spec.args = workflow_spec.args or {}
+        workflow_spec.args.update(arguments)
+
+    return workflow_spec
+
+
+def _update_dict(dict_1: dict, dict_2: dict):
+    """
+    Update two dictionaries, including nested dictionaries (recursively).
+    :param dict_1: The dict to update
+    :param dict_2: The values of this dict take precedence over dict_1.
+    :return: the updated dict
+    """
+    for key, val in dict_2.items():
+        if isinstance(val, collections.abc.Mapping):
+            dict_1[key] = _update_dict(dict_1.get(key, {}), val)
+        # It is necessary to update only if the value exists, because
+        # on initialization of the WorkflowSpec object all unfilled values get None values,
+        # and when converting to dict the keys keep those None values.
+        elif val:
+            dict_1[key] = val
+    return dict_1
+
+
+@router.get(
+    "/projects/{project}/workflows/{name}/runs/{uid}",
+    response_model=mlrun.common.schemas.GetWorkflowResponse,
+)
+async def get_workflow_id(
+    project: str,
+    name: str,
+    uid: str,
+    auth_info: mlrun.common.schemas.AuthInfo = fastapi.Depends(
+        mlrun.api.api.deps.authenticate_request
+    ),
+    db_session: Session = fastapi.Depends(mlrun.api.api.deps.get_db_session),
+    engine: str = "kfp",
+) -> mlrun.common.schemas.GetWorkflowResponse:
+    """
+    Retrieve the workflow id from the uid of the workflow runner.
+    When creating a remote workflow we create an auxiliary function
+    which is responsible for actually running the workflow.
+    As we don't know the workflow uid beforehand, but only the run uid of the auxiliary function we ran,
+    we have to wait until the running function logs the workflow id it created.
+    Because we don't know how long it will take for the run to create the workflow,
+    this is implemented as an asynchronous mechanism: the client first gets the run uid
+    and then pulls the workflow id from the run id,
+    much like using a background task to query whether it has finished.
+    Supports workflows executed by the remote engine **only**.
+
+    :param project:    name of the project
+    :param name:       name of the workflow
+    :param uid:        the id of the running job that runs the workflow
+    :param auth_info:  auth info of the request
+    :param db_session: session that manages the current dialog with the database
+    :param engine:     pipeline runner, for example: "kfp"
+
+    :returns: workflow id
+    """
+    # Check permission READ run:
+    await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+        mlrun.common.schemas.AuthorizationResourceTypes.run,
+        project,
+        uid,
+        mlrun.common.schemas.AuthorizationAction.read,
+        auth_info,
+    )
+    # Check permission READ workflow:
+    await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+        mlrun.common.schemas.AuthorizationResourceTypes.workflow,
+        project,
+        name,
+        mlrun.common.schemas.AuthorizationAction.read,
+        auth_info,
+    )
+
+    return await run_in_threadpool(
+        mlrun.api.crud.WorkflowRunners().get_workflow_id,
+        uid=uid,
+        project=project,
+        engine=engine,
+        db_session=db_session,
+    )
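The docstrings above describe a two-step flow: submission returns the run uid of the auxiliary runner function, and the workflow id only becomes available once that runner logs it. Below is a minimal client-side sketch of that flow, assuming an MLRun API server at a placeholder base URL; the `run_id` field is taken from the WorkflowResponse construction above, while the `workflow_id` response field is an assumption about GetWorkflowResponse.

import time

import requests

base_url = "http://mlrun-api:8080/api/v1"  # placeholder server address and prefix

# Submit: returns 202 with the run id of the auxiliary workflow-runner job.
resp = requests.post(f"{base_url}/projects/demo/workflows/main/submit", json={})
run_id = resp.json()["run_id"]

# Poll: the workflow id appears only after the runner has logged it.
while True:
    r = requests.get(f"{base_url}/projects/demo/workflows/main/runs/{run_id}")
    if r.ok and r.json().get("workflow_id"):  # assumed GetWorkflowResponse field
        print("workflow id:", r.json()["workflow_id"])
        break
    time.sleep(5)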
mlrun/api/api/utils.py
CHANGED
@@ -37,12 +37,12 @@ import mlrun.errors
 import mlrun.runtimes.pod
 import mlrun.utils.helpers
 from mlrun.api.db.sqldb.db import SQLDB
+from mlrun.api.rundb.sqldb import SQLRunDB
 from mlrun.api.utils.singletons.db import get_db
 from mlrun.api.utils.singletons.logs_dir import get_logs_dir
 from mlrun.api.utils.singletons.scheduler import get_scheduler
 from mlrun.common.helpers import parse_versioned_object_uri
 from mlrun.config import config
-from mlrun.db.sqldb import SQLDB as SQLRunDB
 from mlrun.errors import err_to_str
 from mlrun.run import import_function, new_function
 from mlrun.runtimes.utils import enrich_function_from_dict
mlrun/api/crud/__init__.py
CHANGED
mlrun/api/crud/client_spec.py
CHANGED
@@ -102,6 +102,9 @@ class ClientSpec(
             feature_store_data_prefixes=self._get_config_value_if_not_default(
                 "feature_store.data_prefixes"
             ),
+            model_endpoint_monitoring_store_type=self._get_config_value_if_not_default(
+                "model_endpoint_monitoring.store_type"
+            ),
         )

     @staticmethod
mlrun/api/crud/model_monitoring/deployment.py
CHANGED
@@ -28,6 +28,7 @@ import mlrun.model_monitoring.stream_processing
 import mlrun.model_monitoring.tracking_policy
 from mlrun import feature_store as fstore
 from mlrun.api.api import deps
+from mlrun.api.crud.model_monitoring.helpers import Seconds, seconds2minutes
 from mlrun.utils import logger

 _MODEL_MONITORING_COMMON_PATH = pathlib.Path(__file__).parents[3] / "model_monitoring"
@@ -40,6 +41,24 @@ _MONITORING_BATCH_FUNCTION_PATH = (


 class MonitoringDeployment:
+    def __init__(
+        self,
+        parquet_batching_max_events: int = mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
+        max_parquet_save_interval: int = mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
+    ) -> None:
+        self._parquet_batching_max_events = parquet_batching_max_events
+        self._max_parquet_save_interval = max_parquet_save_interval
+        """
+        Initialize a MonitoringDeployment object, which handles the deployment of both model monitoring stream nuclio
+        function and the scheduled batch drift job.
+
+        :param parquet_batching_max_events: Maximum number of events that will be used for writing the monitoring
+                                            parquet by the monitoring stream function.
+        :param max_parquet_save_interval:   Maximum number of seconds to hold events before they are written to the
+                                            monitoring parquet target. Note that this value will be used to handle the
+                                            offset by the scheduled batch job.
+        """
+
     def deploy_monitoring_functions(
         self,
         project: str,
@@ -70,6 +89,7 @@ class MonitoringDeployment:
             db_session=db_session,
             auth_info=auth_info,
             tracking_policy=tracking_policy,
+            tracking_offset=Seconds(self._max_parquet_save_interval),
         )

     def deploy_model_monitoring_stream_processing(
@@ -79,7 +99,7 @@ class MonitoringDeployment:
         db_session: sqlalchemy.orm.Session,
         auth_info: mlrun.common.schemas.AuthInfo,
         tracking_policy: mlrun.model_monitoring.tracking_policy.TrackingPolicy,
-    ):
+    ) -> None:
         """
         Deploying model monitoring stream real time nuclio function. The goal of this real time function is
         to monitor the log of the data stream. It is triggered when a new log entry is detected.
@@ -129,6 +149,9 @@ class MonitoringDeployment:
             parquet_target=parquet_target,
         )

+        # Adding label to the function - will be used to identify the stream pod
+        fn.metadata.labels = {"type": "model-monitoring-stream"}
+
         mlrun.api.api.endpoints.functions._build_function(
             db_session=db_session, auth_info=auth_info, function=fn
         )
@@ -140,6 +163,7 @@ class MonitoringDeployment:
         db_session: sqlalchemy.orm.Session,
         auth_info: mlrun.common.schemas.AuthInfo,
         tracking_policy: mlrun.model_monitoring.tracking_policy.TrackingPolicy,
+        tracking_offset: Seconds = Seconds(0),
     ):
         """
         Deploying model monitoring batch job. The goal of this job is to identify drift in the data
@@ -152,6 +176,7 @@ class MonitoringDeployment:
         :param db_session:      A session that manages the current dialog with the database.
         :param auth_info:       The auth info of the request.
         :param tracking_policy: Model monitoring configurations.
+        :param tracking_offset: Offset for the tracking policy (for synchronization with the stream)
         """

         logger.info(
@@ -210,7 +235,8 @@ class MonitoringDeployment:
         data = {
             "task": task.to_dict(),
             "schedule": mlrun.api.crud.model_monitoring.helpers.convert_to_cron_string(
-                tracking_policy.default_batch_intervals
+                tracking_policy.default_batch_intervals,
+                minute_delay=seconds2minutes(tracking_offset),
             ),
         }

@@ -247,11 +273,14 @@ class MonitoringDeployment:
         """

         # Initialize Stream Processor object
-        stream_processor = …
+        stream_processor = (
+            mlrun.model_monitoring.stream_processing.EventStreamProcessor(
+                project=project,
+                parquet_batching_max_events=self._parquet_batching_max_events,
+                parquet_batching_timeout_secs=self._max_parquet_save_interval,
+                parquet_target=parquet_target,
+                model_monitoring_access_key=model_monitoring_access_key,
+            )
         )

         # Create a new serving function for the streaming process
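The `tracking_offset` wiring above shifts the scheduled batch drift job by the stream's parquet flush interval, so the job does not fire before the events it should analyze are persisted. A small sketch of the resulting schedule, assuming the `Seconds`/`seconds2minutes`/`convert_to_cron_string` helpers added in mlrun/api/crud/model_monitoring/helpers.py below, and an illustrative 600-second flush interval:

from mlrun.api.crud.model_monitoring.helpers import (
    Seconds,
    convert_to_cron_string,
    seconds2minutes,
)
from mlrun.common.schemas.schedule import ScheduleCronTrigger

# A 600-second parquet flush interval becomes a 10-minute cron delay.
offset = seconds2minutes(Seconds(600))
trigger = ScheduleCronTrigger(minute="0", hour="*/1")  # hourly, on the hour
print(convert_to_cron_string(trigger, minute_delay=offset))  # "10 */1 * * *"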
mlrun/api/crud/model_monitoring/grafana.py
CHANGED
@@ -149,7 +149,7 @@ async def grafana_list_endpoints(
         if (
             filter_router
             and endpoint.status.endpoint_type
-            == mlrun.common.model_monitoring.EndpointType.ROUTER
+            == mlrun.common.schemas.model_monitoring.EndpointType.ROUTER
         ):
             continue
         row = [
mlrun/api/crud/model_monitoring/helpers.py
CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 #
 import json
+import math
 import typing

 import sqlalchemy.orm
@@ -23,6 +24,16 @@ import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas.schedule
 import mlrun.errors

+Seconds = typing.NewType("Seconds", int)
+Minutes = typing.NewType("Minutes", int)
+
+_SECONDS_IN_MINUTE: Seconds = Seconds(60)
+_MINUTES_IN_HOUR: Minutes = Minutes(60)
+
+
+def seconds2minutes(seconds: Seconds) -> Minutes:
+    return Minutes(math.ceil(seconds / _SECONDS_IN_MINUTE))
+

 def get_batching_interval_param(intervals_list: typing.List):
     """Convert each value in the intervals list into a float number. None
@@ -47,12 +58,31 @@ def get_batching_interval_param(intervals_list: typing.List):
     )


+def _add_minutes_offset(
+    minute: typing.Optional[typing.Union[int, str]],
+    offset: Minutes,
+) -> typing.Optional[typing.Union[int, str]]:
+    """
+    :param minute: the minute specification in the cron schedule, e.g. "0".
+    :param offset: the offset in minutes to add to the cron minute specification.
+    :return: the minute cron with the offset applied (if supported).
+    """
+    if minute and (
+        (isinstance(minute, str) and str.isdigit(minute)) or isinstance(minute, int)
+    ):
+        minute = (int(minute) + offset) % _MINUTES_IN_HOUR
+    return minute
+
+
 def convert_to_cron_string(
     cron_trigger: mlrun.common.schemas.schedule.ScheduleCronTrigger,
-)
+    minute_delay: Minutes = Minutes(0),
+) -> str:
     """Convert the batch interval `ScheduleCronTrigger` into a cron trigger expression"""
     return "{} {} {} * *".format(
-        cron_trigger.minute,
+        _add_minutes_offset(cron_trigger.minute, minute_delay),
+        cron_trigger.hour,
+        cron_trigger.day,
     ).replace("None", "*")
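Note that `_add_minutes_offset` shifts only plain numeric minute fields and wraps around the hour; non-numeric cron specifications pass through untouched, so the offset is silently skipped for them. A quick sketch of both cases, assuming the module above:

from mlrun.api.crud.model_monitoring.helpers import Minutes, _add_minutes_offset

# Numeric minutes are shifted modulo 60: 55 + 10 wraps around to 5.
print(_add_minutes_offset("55", Minutes(10)))    # 5
# Non-numeric specifications (e.g. step values) are returned unchanged.
print(_add_minutes_offset("*/15", Minutes(10)))  # "*/15"
# A missing minute spec also passes through.
print(_add_minutes_offset(None, Minutes(10)))    # None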
mlrun/api/crud/model_monitoring/model_endpoints.py
CHANGED
@@ -22,6 +22,8 @@ import sqlalchemy.orm
 import mlrun.api.api.utils
 import mlrun.api.crud.model_monitoring.deployment
 import mlrun.api.crud.model_monitoring.helpers
+import mlrun.api.crud.secrets
+import mlrun.api.rundb.sqldb
 import mlrun.artifacts
 import mlrun.common.helpers
 import mlrun.common.schemas.model_monitoring
@@ -155,6 +157,9 @@ class ModelEndpoints:
         # Write the new model endpoint
         model_endpoint_store = get_model_endpoint_store(
             project=model_endpoint.metadata.project,
+            secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                project=model_endpoint.metadata.project
+            ),
         )
         model_endpoint_store.write_model_endpoint(endpoint=model_endpoint.flat_dict())
@@ -184,6 +189,9 @@ class ModelEndpoints:
         # Generate a model endpoint store object and apply the update process
         model_endpoint_store = get_model_endpoint_store(
             project=project,
+            secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                project=project
+            ),
         )
         model_endpoint_store.update_model_endpoint(
             endpoint_id=endpoint_id, attributes=attributes
@@ -203,7 +211,7 @@ class ModelEndpoints:
         model_endpoint: mlrun.common.schemas.ModelEndpoint,
         model_obj: mlrun.artifacts.ModelArtifact,
         db_session: sqlalchemy.orm.Session,
-        run_db: mlrun.
+        run_db: mlrun.api.rundb.sqldb.SQLRunDB,
     ):
         """
         Create monitoring feature set with the relevant parquet target.
@@ -290,7 +298,6 @@ class ModelEndpoints:
         driver.update_resource_status("created")

         # Save the new feature set
-        feature_set._override_run_db(db_session)
         feature_set.save()
         logger.info(
             "Monitoring feature set created",
@@ -313,6 +320,9 @@ class ModelEndpoints:
         """
         model_endpoint_store = get_model_endpoint_store(
             project=project,
+            secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                project=project
+            ),
         )

         model_endpoint_store.delete_model_endpoint(endpoint_id=endpoint_id)
@@ -361,7 +371,11 @@ class ModelEndpoints:

         # Generate a model endpoint store object and get the model endpoint record as a dictionary
         model_endpoint_store = get_model_endpoint_store(
-            project=project,
+            project=project,
+            access_key=auth_info.data_session,
+            secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                project=project
+            ),
         )

         model_endpoint_record = model_endpoint_store.get_model_endpoint(
@@ -454,7 +468,11 @@ class ModelEndpoints:

         # Generate a model endpoint store object and get a list of model endpoint dictionaries
         endpoint_store = get_model_endpoint_store(
-            access_key=auth_info.data_session,
+            access_key=auth_info.data_session,
+            project=project,
+            secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                project=project
+            ),
         )

         endpoint_dictionary_list = endpoint_store.list_model_endpoints(
@@ -523,7 +541,11 @@ class ModelEndpoints:

         # Generate a model endpoint store object and get a list of model endpoint dictionaries
         endpoint_store = get_model_endpoint_store(
-            access_key=auth_info.data_session,
+            access_key=auth_info.data_session,
+            project=project_name,
+            secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                project=project_name
+            ),
         )
         endpoints = endpoint_store.list_model_endpoints()
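Threading `secret_provider` into `get_model_endpoint_store` means the endpoint store resolves its connection credentials through an injected, project-scoped callable instead of reading secrets itself, which keeps server-side secret access out of the store implementations. A hedged sketch of the shape such a provider takes; the factory, key, and value below are illustrative, not mlrun's actual secret names:

import typing

def make_secret_provider(
    project: str, secrets: typing.Dict[str, str]
) -> typing.Callable[[str], typing.Optional[str]]:
    """Illustrative stand-in for mlrun.api.crud.secrets.get_project_secret_provider."""

    def provider(key: str) -> typing.Optional[str]:
        # The real provider looks the key up in the project's secret store.
        return secrets.get(f"{project}.{key}")

    return provider

provider = make_secret_provider(
    "demo", {"demo.ENDPOINT_STORE_CONNECTION": "mysql+pymysql://..."}
)
print(provider("ENDPOINT_STORE_CONNECTION"))  # "mysql+pymysql://..."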
mlrun/api/crud/notifications.py
CHANGED
@@ -34,13 +34,18 @@ class Notifications(
         notification_objects: typing.List[mlrun.model.Notification],
         run_uid: str,
         project: str = None,
+        mask_params: bool = True,
     ):
         project = project or mlrun.mlconf.default_project
-        notification_objects_to_store = (
-            mlrun.api.api.utils.validate_and_mask_notification_list(
-                notification_objects, run_uid, project
+
+        # we don't mask the notification params when it's a status update as they are already masked
+        notification_objects_to_store = notification_objects
+        if mask_params:
+            notification_objects_to_store = (
+                mlrun.api.api.utils.validate_and_mask_notification_list(
+                    notification_objects, run_uid, project
+                )
             )
-        )

         mlrun.api.utils.singletons.db.get_db().store_run_notifications(
             session, notification_objects_to_store, run_uid, project