mlrun 1.5.0rc1-py3-none-any.whl → 1.5.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (119)
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +1 -40
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/functions.py +6 -1
  7. mlrun/api/api/endpoints/logs.py +17 -3
  8. mlrun/api/api/endpoints/pipelines.py +1 -5
  9. mlrun/api/api/endpoints/projects.py +88 -0
  10. mlrun/api/api/endpoints/runs.py +48 -6
  11. mlrun/api/api/endpoints/workflows.py +355 -0
  12. mlrun/api/api/utils.py +1 -1
  13. mlrun/api/crud/__init__.py +1 -0
  14. mlrun/api/crud/client_spec.py +3 -0
  15. mlrun/api/crud/model_monitoring/deployment.py +36 -7
  16. mlrun/api/crud/model_monitoring/grafana.py +1 -1
  17. mlrun/api/crud/model_monitoring/helpers.py +32 -2
  18. mlrun/api/crud/model_monitoring/model_endpoints.py +27 -5
  19. mlrun/api/crud/notifications.py +9 -4
  20. mlrun/api/crud/pipelines.py +4 -9
  21. mlrun/api/crud/runtime_resources.py +4 -3
  22. mlrun/api/crud/secrets.py +21 -0
  23. mlrun/api/crud/workflows.py +352 -0
  24. mlrun/api/db/base.py +16 -1
  25. mlrun/api/db/sqldb/db.py +97 -16
  26. mlrun/api/launcher.py +26 -7
  27. mlrun/api/main.py +3 -4
  28. mlrun/{mlutils → api/rundb}/__init__.py +2 -6
  29. mlrun/{db → api/rundb}/sqldb.py +35 -83
  30. mlrun/api/runtime_handlers/__init__.py +56 -0
  31. mlrun/api/runtime_handlers/base.py +1247 -0
  32. mlrun/api/runtime_handlers/daskjob.py +209 -0
  33. mlrun/api/runtime_handlers/kubejob.py +37 -0
  34. mlrun/api/runtime_handlers/mpijob.py +147 -0
  35. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  36. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  37. mlrun/api/utils/builder.py +1 -4
  38. mlrun/api/utils/clients/chief.py +14 -0
  39. mlrun/api/utils/scheduler.py +98 -15
  40. mlrun/api/utils/singletons/db.py +4 -0
  41. mlrun/artifacts/manager.py +1 -2
  42. mlrun/common/schemas/__init__.py +6 -0
  43. mlrun/common/schemas/auth.py +4 -1
  44. mlrun/common/schemas/client_spec.py +1 -1
  45. mlrun/common/schemas/model_monitoring/__init__.py +1 -0
  46. mlrun/common/schemas/model_monitoring/constants.py +11 -0
  47. mlrun/common/schemas/project.py +1 -0
  48. mlrun/common/schemas/runs.py +1 -8
  49. mlrun/common/schemas/schedule.py +1 -8
  50. mlrun/common/schemas/workflow.py +54 -0
  51. mlrun/config.py +42 -40
  52. mlrun/datastore/sources.py +1 -1
  53. mlrun/db/__init__.py +4 -68
  54. mlrun/db/base.py +12 -0
  55. mlrun/db/factory.py +65 -0
  56. mlrun/db/httpdb.py +175 -19
  57. mlrun/db/nopdb.py +4 -2
  58. mlrun/execution.py +4 -2
  59. mlrun/feature_store/__init__.py +1 -0
  60. mlrun/feature_store/api.py +1 -2
  61. mlrun/feature_store/feature_set.py +0 -10
  62. mlrun/feature_store/feature_vector.py +340 -2
  63. mlrun/feature_store/ingestion.py +5 -10
  64. mlrun/feature_store/retrieval/base.py +118 -104
  65. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  66. mlrun/feature_store/retrieval/job.py +4 -1
  67. mlrun/feature_store/retrieval/local_merger.py +18 -18
  68. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  69. mlrun/feature_store/retrieval/storey_merger.py +21 -15
  70. mlrun/kfpops.py +3 -9
  71. mlrun/launcher/base.py +3 -3
  72. mlrun/launcher/client.py +3 -2
  73. mlrun/launcher/factory.py +16 -13
  74. mlrun/lists.py +0 -11
  75. mlrun/model.py +9 -15
  76. mlrun/model_monitoring/helpers.py +15 -25
  77. mlrun/model_monitoring/model_monitoring_batch.py +72 -4
  78. mlrun/model_monitoring/prometheus.py +219 -0
  79. mlrun/model_monitoring/stores/__init__.py +15 -9
  80. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +3 -1
  81. mlrun/model_monitoring/stream_processing.py +181 -29
  82. mlrun/package/packager.py +6 -8
  83. mlrun/package/packagers/default_packager.py +121 -10
  84. mlrun/platforms/__init__.py +0 -2
  85. mlrun/platforms/iguazio.py +0 -56
  86. mlrun/projects/pipelines.py +57 -158
  87. mlrun/projects/project.py +6 -32
  88. mlrun/render.py +1 -1
  89. mlrun/run.py +2 -124
  90. mlrun/runtimes/__init__.py +6 -42
  91. mlrun/runtimes/base.py +26 -1241
  92. mlrun/runtimes/daskjob.py +2 -198
  93. mlrun/runtimes/function.py +16 -5
  94. mlrun/runtimes/kubejob.py +5 -29
  95. mlrun/runtimes/mpijob/__init__.py +2 -2
  96. mlrun/runtimes/mpijob/abstract.py +10 -1
  97. mlrun/runtimes/mpijob/v1.py +0 -76
  98. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  99. mlrun/runtimes/nuclio.py +3 -2
  100. mlrun/runtimes/pod.py +0 -10
  101. mlrun/runtimes/remotesparkjob.py +1 -15
  102. mlrun/runtimes/serving.py +1 -1
  103. mlrun/runtimes/sparkjob/__init__.py +0 -1
  104. mlrun/runtimes/sparkjob/abstract.py +4 -131
  105. mlrun/serving/states.py +1 -1
  106. mlrun/utils/db.py +0 -2
  107. mlrun/utils/helpers.py +19 -13
  108. mlrun/utils/notifications/notification_pusher.py +5 -25
  109. mlrun/utils/regex.py +7 -2
  110. mlrun/utils/version/version.json +2 -2
  111. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +24 -23
  112. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +116 -107
  113. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  114. mlrun/mlutils/data.py +0 -160
  115. mlrun/mlutils/models.py +0 -78
  116. mlrun/mlutils/plots.py +0 -902
  117. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  118. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  119. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/api/api/endpoints/workflows.py ADDED
@@ -0,0 +1,355 @@
+ # Copyright 2018 Iguazio
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ import collections.abc
+ import copy
+ import traceback
+ import typing
+ from http import HTTPStatus
+ from typing import Dict
+
+ import fastapi
+ from fastapi.concurrency import run_in_threadpool
+ from sqlalchemy.orm import Session
+
+ import mlrun
+ import mlrun.api.api.deps
+ import mlrun.api.api.utils
+ import mlrun.api.crud
+ import mlrun.api.utils.auth.verifier
+ import mlrun.api.utils.clients.chief
+ import mlrun.api.utils.singletons.db
+ import mlrun.api.utils.singletons.project_member
+ import mlrun.common.schemas
+ import mlrun.projects.pipelines
+ from mlrun.api.api.utils import log_and_raise
+ from mlrun.utils.helpers import logger
+
+ router = fastapi.APIRouter()
+
+
+ @router.post(
+     "/projects/{project}/workflows/{name}/submit",
+     status_code=HTTPStatus.ACCEPTED.value,
+     response_model=mlrun.common.schemas.WorkflowResponse,
+ )
+ async def submit_workflow(
+     project: str,
+     name: str,
+     request: fastapi.Request,
+     workflow_request: mlrun.common.schemas.WorkflowRequest = mlrun.common.schemas.WorkflowRequest(),
+     auth_info: mlrun.common.schemas.AuthInfo = fastapi.Depends(
+         mlrun.api.api.deps.authenticate_request
+     ),
+     db_session: Session = fastapi.Depends(mlrun.api.api.deps.get_db_session),
+ ):
+     """
+     Submit a workflow of an existing project.
+     To support workflow scheduling, an auxiliary function called 'load_and_run' is used.
+     This function runs remotely (in a distinct pod), loads the project and then runs the workflow.
+     This way the workflow can either be run remotely with the workflow's engine, or the function
+     can be scheduled so that it loads the project and runs the workflow on each invocation.
+     Notice: when simply running a workflow, the returned run_id is the id of the auxiliary function's run.
+     To get the id and status of the workflow itself, use the `get_workflow_id` endpoint with the returned run id.
+
+     :param project:          name of the project
+     :param name:             name of the workflow
+     :param request:          fastapi request, for supporting rerouting to chief if needed
+     :param workflow_request: the request includes: the workflow spec, arguments for the workflow, an artifact
+                              path to use as the workflow's artifact target path, a source url of the project
+                              to override the existing one, a run name to override the default
+                              'workflow-runner-<workflow name>', and a kubernetes namespace if other than default
+     :param auth_info:        auth info of the request
+     :param db_session:       session that manages the current dialog with the database
+
+     :returns: response that contains the project name, workflow name, status,
+               run id (in case of a single run) and schedule (in case of scheduling)
+     """
+     project = await run_in_threadpool(
+         mlrun.api.utils.singletons.project_member.get_project_member().get_project,
+         db_session=db_session,
+         name=project,
+         leader_session=auth_info.session,
+     )
+
+     # check permission CREATE run
+     await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+         resource_type=mlrun.common.schemas.AuthorizationResourceTypes.run,
+         project_name=project.metadata.name,
+         resource_name=workflow_request.run_name or "",
+         action=mlrun.common.schemas.AuthorizationAction.create,
+         auth_info=auth_info,
+     )
+     # check permission READ workflow on project's workflow
+     await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+         resource_type=mlrun.common.schemas.AuthorizationResourceTypes.workflow,
+         project_name=project.metadata.name,
+         resource_name=name,
+         action=mlrun.common.schemas.AuthorizationAction.read,
+         auth_info=auth_info,
+     )
+     # check permission CREATE workflow on the new workflow's name
+     await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+         resource_type=mlrun.common.schemas.AuthorizationResourceTypes.workflow,
+         project_name=project.metadata.name,
+         # if no workflow spec was passed, create on the same name:
+         resource_name=getattr(workflow_request.spec, "name", name),
+         action=mlrun.common.schemas.AuthorizationAction.create,
+         auth_info=auth_info,
+     )
+     # re-route to chief in case of schedule
+     if (
+         _is_requested_schedule(name, workflow_request.spec, project)
+         and mlrun.mlconf.httpdb.clusterization.role
+         != mlrun.common.schemas.ClusterizationRole.chief
+     ):
+         chief_client = mlrun.api.utils.clients.chief.Client()
+         return await chief_client.submit_workflow(
+             project=project.metadata.name,
+             name=name,
+             request=request,
+             json=workflow_request.dict(),
+         )
+
+     workflow_spec = _fill_workflow_missing_fields_from_project(
+         project=project,
+         workflow_name=name,
+         spec=workflow_request.spec,
+         arguments=workflow_request.arguments,
+     )
+     updated_request = workflow_request.copy()
+     updated_request.spec = workflow_spec
+
+     # This function loads the project and runs the workflow remotely.
+     # In this way we can schedule workflows (by scheduling a job that runs the workflow)
+     workflow_runner = await run_in_threadpool(
+         mlrun.api.crud.WorkflowRunners().create_runner,
+         run_name=updated_request.run_name
+         or mlrun.mlconf.workflows.default_workflow_runner_name.format(
+             workflow_spec.name
+         ),
+         project=project.metadata.name,
+         db_session=db_session,
+         auth_info=auth_info,
+         image=workflow_spec.image
+         or project.spec.default_image
+         or mlrun.mlconf.default_base_image,
+     )
+
+     logger.debug(
+         "Saved function for running workflow",
+         project_name=workflow_runner.metadata.project,
+         function_name=workflow_runner.metadata.name,
+         workflow_name=workflow_spec.name,
+         arguments=workflow_spec.args,
+         source=updated_request.source or project.spec.source,
+         kind=workflow_runner.kind,
+         image=workflow_runner.spec.image,
+     )
+
+     run_uid = None
+     status = None
+     workflow_action = "schedule" if workflow_spec.schedule else "run"
+     try:
+         if workflow_spec.schedule:
+             await run_in_threadpool(
+                 mlrun.api.crud.WorkflowRunners().schedule,
+                 runner=workflow_runner,
+                 project=project,
+                 workflow_request=updated_request,
+                 db_session=db_session,
+                 auth_info=auth_info,
+             )
+             status = "scheduled"
+
+         else:
+             run = await run_in_threadpool(
+                 mlrun.api.crud.WorkflowRunners().run,
+                 runner=workflow_runner,
+                 project=project,
+                 workflow_request=updated_request,
+             )
+             status = mlrun.run.RunStatuses.running
+             run_uid = run.uid()
+     except Exception as error:
+         logger.error(traceback.format_exc())
+         log_and_raise(
+             reason="Workflow failed",
+             workflow_name=workflow_spec.name,
+             workflow_action=workflow_action,
+             error=mlrun.errors.err_to_str(error),
+         )
+
+     return mlrun.common.schemas.WorkflowResponse(
+         project=project.metadata.name,
+         name=workflow_spec.name,
+         status=status,
+         run_id=run_uid,
+         schedule=workflow_spec.schedule,
+     )
+
+
+ def _is_requested_schedule(
+     name: str,
+     workflow_spec: mlrun.common.schemas.WorkflowSpec,
+     project: mlrun.common.schemas.Project,
+ ) -> bool:
+     """
+     Check whether the workflow needs to be scheduled: either the request itself contains
+     schedule information, or the workflow predefined in the project contains a schedule.
+
+     :param name:          workflow name
+     :param workflow_spec: workflow spec input
+     :param project:       MLRun project that contains the workflow
+
+     :return: True if the workflow needs to be scheduled, False otherwise.
+     """
+     if workflow_spec:
+         return workflow_spec.schedule is not None
+
+     project_workflow = _get_workflow_by_name(project, name)
+     return bool(project_workflow.get("schedule"))
+
+
+ def _get_workflow_by_name(
+     project: mlrun.common.schemas.Project, name: str
+ ) -> typing.Optional[Dict]:
+     """
+     Get a workflow from a project.
+
+     :param project: MLRun project
+     :param name:    workflow name
+
+     :return: the workflow as a dict if the project has it, otherwise raises a bad request exception
+     """
+     for workflow in project.spec.workflows:
+         if workflow["name"] == name:
+             return workflow
+     log_and_raise(
+         reason=f"workflow {name} not found in project",
+     )
+
+
+ def _fill_workflow_missing_fields_from_project(
+     project: mlrun.common.schemas.Project,
+     workflow_name: str,
+     spec: mlrun.common.schemas.WorkflowSpec,
+     arguments: typing.Dict,
+ ) -> mlrun.common.schemas.WorkflowSpec:
+     """
+     Fill the workflow spec details from the project object, giving precedence to the spec.
+
+     :param project:       MLRun project that contains the workflow.
+     :param workflow_name: workflow name
+     :param spec:          workflow spec input
+     :param arguments:     arguments to the workflow
+
+     :return: completed workflow spec
+     """
+     # Verify that the workflow exists in the project:
+     workflow = _get_workflow_by_name(project, workflow_name)
+
+     if spec:
+         # Merge the workflow spec provided in the request with the existing workflow;
+         # the provided spec takes precedence over the existing workflow params
+         workflow = copy.deepcopy(workflow)
+         workflow = _update_dict(workflow, spec.dict())
+
+     workflow_spec = mlrun.common.schemas.WorkflowSpec(**workflow)
+     # Override arguments of the existing workflow:
+     if arguments:
+         workflow_spec.args = workflow_spec.args or {}
+         workflow_spec.args.update(arguments)
+
+     return workflow_spec
+
+
+ def _update_dict(dict_1: dict, dict_2: dict):
+     """
+     Update a dictionary with another, including nested dictionaries (recursively).
+
+     :param dict_1: The dict to update.
+     :param dict_2: The values of this dict take precedence over dict_1.
+     :return: the updated dict_1.
+     """
+     for key, val in dict_2.items():
+         if isinstance(val, collections.abc.Mapping):
+             dict_1[key] = _update_dict(dict_1.get(key, {}), val)
+         # Update only if the value exists, because on initialization of the
+         # WorkflowSpec object all unfilled fields get None values, and those
+         # None values are kept when converting the object to a dict.
+         elif val:
+             dict_1[key] = val
+     return dict_1
+
+
+ @router.get(
+     "/projects/{project}/workflows/{name}/runs/{uid}",
+     response_model=mlrun.common.schemas.GetWorkflowResponse,
+ )
+ async def get_workflow_id(
+     project: str,
+     name: str,
+     uid: str,
+     auth_info: mlrun.common.schemas.AuthInfo = fastapi.Depends(
+         mlrun.api.api.deps.authenticate_request
+     ),
+     db_session: Session = fastapi.Depends(mlrun.api.api.deps.get_db_session),
+     engine: str = "kfp",
+ ) -> mlrun.common.schemas.GetWorkflowResponse:
+     """
+     Retrieve the workflow id from the uid of the workflow runner.
+     When creating a remote workflow, an auxiliary function is created that is responsible for
+     actually running the workflow. Since the workflow uid is not known beforehand, only the run uid
+     of the auxiliary function, we have to wait until the running function logs the workflow id it created.
+     Because it is unknown how long the run will take to create the workflow, this is implemented
+     asynchronously: the client first gets the run uid and then pulls the workflow id from the run,
+     much like using a background task to query whether it has finished.
+     **Only** workflows executed by the remote engine are supported.
+
+     :param project:    name of the project
+     :param name:       name of the workflow
+     :param uid:        the id of the running job that runs the workflow
+     :param auth_info:  auth info of the request
+     :param db_session: session that manages the current dialog with the database
+     :param engine:     pipeline runner, for example: "kfp"
+
+     :returns: workflow id
+     """
+     # Check permission READ run:
+     await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+         mlrun.common.schemas.AuthorizationResourceTypes.run,
+         project,
+         uid,
+         mlrun.common.schemas.AuthorizationAction.read,
+         auth_info,
+     )
+     # Check permission READ workflow:
+     await mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
+         mlrun.common.schemas.AuthorizationResourceTypes.workflow,
+         project,
+         name,
+         mlrun.common.schemas.AuthorizationAction.read,
+         auth_info,
+     )
+
+     return await run_in_threadpool(
+         mlrun.api.crud.WorkflowRunners().get_workflow_id,
+         uid=uid,
+         project=project,
+         engine=engine,
+         db_session=db_session,
+     )
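
For orientation, here is a minimal client-side sketch of the two endpoints this new file adds. The base URL, project name, workflow name, and payload values are hypothetical and authentication is omitted; the body fields mirror the `WorkflowRequest` schema used above (spec, arguments, source, run_name).

```python
import requests

# hypothetical API base URL - depends on your MLRun deployment
MLRUN_API = "http://mlrun-api:8080/api/v1"

# fields mirror mlrun.common.schemas.WorkflowRequest; an empty body falls
# back to the workflow as predefined in the project
body = {"arguments": {"model_name": "my-model"}}

# POST .../workflows/{name}/submit returns the run id of the auxiliary
# "workflow runner" function, not the workflow id itself
resp = requests.post(
    f"{MLRUN_API}/projects/my-project/workflows/main/submit", json=body
)
resp.raise_for_status()
run_id = resp.json()["run_id"]

# GET .../workflows/{name}/runs/{uid} polls for the workflow id that the
# runner logs once the workflow is actually created (remote engine only)
workflow = requests.get(
    f"{MLRUN_API}/projects/my-project/workflows/main/runs/{run_id}",
    params={"engine": "kfp"},
)
print(workflow.json())
```
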
mlrun/api/api/utils.py CHANGED
@@ -37,12 +37,12 @@ import mlrun.errors
  import mlrun.runtimes.pod
  import mlrun.utils.helpers
  from mlrun.api.db.sqldb.db import SQLDB
+ from mlrun.api.rundb.sqldb import SQLRunDB
  from mlrun.api.utils.singletons.db import get_db
  from mlrun.api.utils.singletons.logs_dir import get_logs_dir
  from mlrun.api.utils.singletons.scheduler import get_scheduler
  from mlrun.common.helpers import parse_versioned_object_uri
  from mlrun.config import config
- from mlrun.db.sqldb import SQLDB as SQLRunDB
  from mlrun.errors import err_to_str
  from mlrun.run import import_function, new_function
  from mlrun.runtimes.utils import enrich_function_from_dict
mlrun/api/crud/__init__.py CHANGED
@@ -29,3 +29,4 @@ from .runs import Runs
  from .runtime_resources import RuntimeResources
  from .secrets import Secrets, SecretsClientType
  from .tags import Tags
+ from .workflows import WorkflowRunners
mlrun/api/crud/client_spec.py CHANGED
@@ -102,6 +102,9 @@ class ClientSpec(
              feature_store_data_prefixes=self._get_config_value_if_not_default(
                  "feature_store.data_prefixes"
              ),
+             model_endpoint_monitoring_store_type=self._get_config_value_if_not_default(
+                 "model_endpoint_monitoring.store_type"
+             ),
          )

      @staticmethod
mlrun/api/crud/model_monitoring/deployment.py CHANGED
@@ -28,6 +28,7 @@ import mlrun.model_monitoring.stream_processing
  import mlrun.model_monitoring.tracking_policy
  from mlrun import feature_store as fstore
  from mlrun.api.api import deps
+ from mlrun.api.crud.model_monitoring.helpers import Seconds, seconds2minutes
  from mlrun.utils import logger

  _MODEL_MONITORING_COMMON_PATH = pathlib.Path(__file__).parents[3] / "model_monitoring"
@@ -40,6 +41,24 @@ _MONITORING_BATCH_FUNCTION_PATH = (


  class MonitoringDeployment:
+     def __init__(
+         self,
+         parquet_batching_max_events: int = mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
+         max_parquet_save_interval: int = mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
+     ) -> None:
+         """
+         Initialize a MonitoringDeployment object, which handles the deployment of both the model monitoring
+         stream nuclio function and the scheduled batch drift job.
+
+         :param parquet_batching_max_events: Maximum number of events that will be used for writing the monitoring
+                                             parquet by the monitoring stream function.
+         :param max_parquet_save_interval:   Maximum number of seconds to hold events before they are written to the
+                                             monitoring parquet target. Note that this value is also used to offset
+                                             the schedule of the batch drift job.
+         """
+         self._parquet_batching_max_events = parquet_batching_max_events
+         self._max_parquet_save_interval = max_parquet_save_interval
+
      def deploy_monitoring_functions(
          self,
          project: str,
@@ -70,6 +89,7 @@ class MonitoringDeployment:
              db_session=db_session,
              auth_info=auth_info,
              tracking_policy=tracking_policy,
+             tracking_offset=Seconds(self._max_parquet_save_interval),
          )

      def deploy_model_monitoring_stream_processing(
@@ -79,7 +99,7 @@ class MonitoringDeployment:
          db_session: sqlalchemy.orm.Session,
          auth_info: mlrun.common.schemas.AuthInfo,
          tracking_policy: mlrun.model_monitoring.tracking_policy.TrackingPolicy,
-     ):
+     ) -> None:
          """
          Deploying model monitoring stream real time nuclio function. The goal of this real time function is
          to monitor the log of the data stream. It is triggered when a new log entry is detected.
@@ -129,6 +149,9 @@ class MonitoringDeployment:
              parquet_target=parquet_target,
          )

+         # Add a label to the function - it will be used to identify the stream pod
+         fn.metadata.labels = {"type": "model-monitoring-stream"}
+
          mlrun.api.api.endpoints.functions._build_function(
              db_session=db_session, auth_info=auth_info, function=fn
          )
@@ -140,6 +163,7 @@ class MonitoringDeployment:
          db_session: sqlalchemy.orm.Session,
          auth_info: mlrun.common.schemas.AuthInfo,
          tracking_policy: mlrun.model_monitoring.tracking_policy.TrackingPolicy,
+         tracking_offset: Seconds = Seconds(0),
      ):
          """
          Deploying model monitoring batch job. The goal of this job is to identify drift in the data
@@ -152,6 +176,7 @@ class MonitoringDeployment:
          :param db_session:      A session that manages the current dialog with the database.
          :param auth_info:       The auth info of the request.
          :param tracking_policy: Model monitoring configurations.
+         :param tracking_offset: Offset for the tracking policy (for synchronization with the stream).
          """

          logger.info(
@@ -210,7 +235,8 @@ class MonitoringDeployment:
          data = {
              "task": task.to_dict(),
              "schedule": mlrun.api.crud.model_monitoring.helpers.convert_to_cron_string(
-                 tracking_policy.default_batch_intervals
+                 tracking_policy.default_batch_intervals,
+                 minute_delay=seconds2minutes(tracking_offset),
              ),
          }

@@ -247,11 +273,14 @@ class MonitoringDeployment:
          """

          # Initialize Stream Processor object
-         stream_processor = mlrun.model_monitoring.stream_processing.EventStreamProcessor(
-             project=project,
-             parquet_batching_max_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
-             parquet_target=parquet_target,
-             model_monitoring_access_key=model_monitoring_access_key,
+         stream_processor = (
+             mlrun.model_monitoring.stream_processing.EventStreamProcessor(
+                 project=project,
+                 parquet_batching_max_events=self._parquet_batching_max_events,
+                 parquet_batching_timeout_secs=self._max_parquet_save_interval,
+                 parquet_target=parquet_target,
+                 model_monitoring_access_key=model_monitoring_access_key,
+             )
          )

          # Create a new serving function for the streaming process
mlrun/api/crud/model_monitoring/grafana.py CHANGED
@@ -149,7 +149,7 @@ async def grafana_list_endpoints(
          if (
              filter_router
              and endpoint.status.endpoint_type
-             == mlrun.common.model_monitoring.EndpointType.ROUTER
+             == mlrun.common.schemas.model_monitoring.EndpointType.ROUTER
          ):
              continue
          row = [
mlrun/api/crud/model_monitoring/helpers.py CHANGED
@@ -13,6 +13,7 @@
  # limitations under the License.
  #
  import json
+ import math
  import typing

  import sqlalchemy.orm
@@ -23,6 +24,16 @@ import mlrun.common.model_monitoring.helpers
  import mlrun.common.schemas.schedule
  import mlrun.errors

+ Seconds = typing.NewType("Seconds", int)
+ Minutes = typing.NewType("Minutes", int)
+
+ _SECONDS_IN_MINUTE: Seconds = Seconds(60)
+ _MINUTES_IN_HOUR: Minutes = Minutes(60)
+
+
+ def seconds2minutes(seconds: Seconds) -> Minutes:
+     return Minutes(math.ceil(seconds / _SECONDS_IN_MINUTE))
+

  def get_batching_interval_param(intervals_list: typing.List):
      """Convert each value in the intervals list into a float number. None
@@ -47,12 +58,31 @@ def get_batching_interval_param(intervals_list: typing.List):
      )


+ def _add_minutes_offset(
+     minute: typing.Optional[typing.Union[int, str]],
+     offset: Minutes,
+ ) -> typing.Optional[typing.Union[int, str]]:
+     """
+     :param minute: the minute specification in the cron schedule, e.g. "0".
+     :param offset: the offset in minutes to add to the cron minute specification.
+     :return: the minute cron with the offset applied (if supported).
+     """
+     if minute and (
+         (isinstance(minute, str) and str.isdigit(minute)) or isinstance(minute, int)
+     ):
+         minute = (int(minute) + offset) % _MINUTES_IN_HOUR
+     return minute
+
+
  def convert_to_cron_string(
      cron_trigger: mlrun.common.schemas.schedule.ScheduleCronTrigger,
-  ):
+     minute_delay: Minutes = Minutes(0),
+ ) -> str:
      """Convert the batch interval `ScheduleCronTrigger` into a cron trigger expression"""
      return "{} {} {} * *".format(
-         cron_trigger.minute, cron_trigger.hour, cron_trigger.day
+         _add_minutes_offset(cron_trigger.minute, minute_delay),
+         cron_trigger.hour,
+         cron_trigger.day,
      ).replace("None", "*")
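
Taken together, these helpers delay the batch drift job's cron schedule by the stream's parquet flush interval, rounded up to whole minutes. A small self-contained sketch of the arithmetic, re-deriving the behavior of the helpers above (the trigger values are hypothetical):

```python
import math

def seconds2minutes(seconds: int) -> int:
    # same ceiling division as the helper above: 90 seconds -> 2 minutes
    return math.ceil(seconds / 60)

def add_minutes_offset(minute, offset: int):
    # numeric cron minutes are shifted modulo 60; "*" and None pass through
    if minute and (isinstance(minute, int) or str(minute).isdigit()):
        return (int(minute) + offset) % 60
    return minute

# a batch trigger of minute="0", hour="*/1" with a 90-second parquet timeout:
delay = seconds2minutes(90)                           # -> 2
print(f"{add_minutes_offset('0', delay)} */1 * * *")  # -> "2 */1 * * *"
print(add_minutes_offset(59, delay))                  # wraps around: -> 1
print(add_minutes_offset("*", delay))                 # untouched: -> "*"
```

So the drift job runs two minutes past each hour, after the stream function has flushed its parquet window.
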
 
mlrun/api/crud/model_monitoring/model_endpoints.py CHANGED
@@ -22,6 +22,8 @@ import sqlalchemy.orm
  import mlrun.api.api.utils
  import mlrun.api.crud.model_monitoring.deployment
  import mlrun.api.crud.model_monitoring.helpers
+ import mlrun.api.crud.secrets
+ import mlrun.api.rundb.sqldb
  import mlrun.artifacts
  import mlrun.common.helpers
  import mlrun.common.schemas.model_monitoring
@@ -155,6 +157,9 @@ class ModelEndpoints:
          # Write the new model endpoint
          model_endpoint_store = get_model_endpoint_store(
              project=model_endpoint.metadata.project,
+             secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                 project=model_endpoint.metadata.project
+             ),
          )
          model_endpoint_store.write_model_endpoint(endpoint=model_endpoint.flat_dict())

@@ -184,6 +189,9 @@ class ModelEndpoints:
          # Generate a model endpoint store object and apply the update process
          model_endpoint_store = get_model_endpoint_store(
              project=project,
+             secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                 project=project
+             ),
          )
          model_endpoint_store.update_model_endpoint(
              endpoint_id=endpoint_id, attributes=attributes
@@ -203,7 +211,7 @@ class ModelEndpoints:
          model_endpoint: mlrun.common.schemas.ModelEndpoint,
          model_obj: mlrun.artifacts.ModelArtifact,
          db_session: sqlalchemy.orm.Session,
-         run_db: mlrun.db.sqldb.SQLDB,
+         run_db: mlrun.api.rundb.sqldb.SQLRunDB,
      ):
          """
          Create monitoring feature set with the relevant parquet target.
@@ -290,7 +298,6 @@ class ModelEndpoints:
          driver.update_resource_status("created")

          # Save the new feature set
-         feature_set._override_run_db(db_session)
          feature_set.save()
          logger.info(
              "Monitoring feature set created",
@@ -313,6 +320,9 @@ class ModelEndpoints:
          """
          model_endpoint_store = get_model_endpoint_store(
              project=project,
+             secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                 project=project
+             ),
          )

          model_endpoint_store.delete_model_endpoint(endpoint_id=endpoint_id)
@@ -361,7 +371,11 @@ class ModelEndpoints:

          # Generate a model endpoint store object and get the model endpoint record as a dictionary
          model_endpoint_store = get_model_endpoint_store(
-             project=project, access_key=auth_info.data_session
+             project=project,
+             access_key=auth_info.data_session,
+             secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                 project=project
+             ),
          )

          model_endpoint_record = model_endpoint_store.get_model_endpoint(
@@ -454,7 +468,11 @@ class ModelEndpoints:

          # Generate a model endpoint store object and get a list of model endpoint dictionaries
          endpoint_store = get_model_endpoint_store(
-             access_key=auth_info.data_session, project=project
+             access_key=auth_info.data_session,
+             project=project,
+             secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                 project=project
+             ),
          )

          endpoint_dictionary_list = endpoint_store.list_model_endpoints(
@@ -523,7 +541,11 @@ class ModelEndpoints:

          # Generate a model endpoint store object and get a list of model endpoint dictionaries
          endpoint_store = get_model_endpoint_store(
-             access_key=auth_info.data_session, project=project_name
+             access_key=auth_info.data_session,
+             project=project_name,
+             secret_provider=mlrun.api.crud.secrets.get_project_secret_provider(
+                 project=project_name
+             ),
          )
          endpoints = endpoint_store.list_model_endpoints()
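
The recurring change in this file threads a project-scoped `secret_provider` into every `get_model_endpoint_store` call, so the store factory can resolve per-project credentials (such as a store connection string) instead of relying on process-wide configuration. A minimal sketch of the pattern, assuming the provider is simply a callable from secret key to value; the function bodies and the secret key below are hypothetical stand-ins, not MLRun's actual implementation:

```python
from typing import Callable, Dict, Optional, Tuple

# hypothetical in-memory stand-in for the project secret store
_SECRETS: Dict[Tuple[str, str], str] = {
    ("my-project", "ENDPOINT_STORE_CONNECTION"): "sqlite:///model-endpoints.db",
}

def get_project_secret_provider(project: str) -> Callable[[str], Optional[str]]:
    """Return a closure that resolves secrets scoped to one project."""
    def provider(key: str) -> Optional[str]:
        return _SECRETS.get((project, key))
    return provider

def get_model_endpoint_store(project: str, secret_provider=None) -> dict:
    # the factory can now resolve credentials lazily, per project
    connection = secret_provider("ENDPOINT_STORE_CONNECTION") if secret_provider else None
    return {"project": project, "connection": connection}

store = get_model_endpoint_store(
    project="my-project",
    secret_provider=get_project_secret_provider(project="my-project"),
)
print(store["connection"])  # -> sqlite:///model-endpoints.db
```
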
 
mlrun/api/crud/notifications.py CHANGED
@@ -34,13 +34,18 @@ class Notifications(
          notification_objects: typing.List[mlrun.model.Notification],
          run_uid: str,
          project: str = None,
+         mask_params: bool = True,
      ):
          project = project or mlrun.mlconf.default_project
-         notification_objects_to_store = (
-             mlrun.api.api.utils.validate_and_mask_notification_list(
-                 notification_objects, run_uid, project
+
+         # we don't mask the notification params when it's a status update as they are already masked
+         notification_objects_to_store = notification_objects
+         if mask_params:
+             notification_objects_to_store = (
+                 mlrun.api.api.utils.validate_and_mask_notification_list(
+                     notification_objects, run_uid, project
+                 )
              )
-         )

          mlrun.api.utils.singletons.db.get_db().store_run_notifications(
              session, notification_objects_to_store, run_uid, project
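
The new `mask_params` flag lets a caller skip re-masking when the notification params were already masked, e.g. when a run's status is updated and its notifications are stored again. A runnable stand-in demonstrating the added control flow (the masking function and data shapes are hypothetical simplifications of the code above):

```python
from typing import Dict, List

def validate_and_mask(notifications: List[Dict]) -> List[Dict]:
    # stand-in for mlrun.api.api.utils.validate_and_mask_notification_list
    return [{**n, "params": {"webhook": "***"}} for n in notifications]

def store_run_notifications(notifications: List[Dict], mask_params: bool = True) -> List[Dict]:
    # mirrors the control flow added above: mask on first store,
    # skip masking on status updates where params are already masked
    to_store = notifications
    if mask_params:
        to_store = validate_and_mask(notifications)
    return to_store

first = store_run_notifications([{"name": "n1", "params": {"webhook": "secret"}}])
print(first)   # params masked on the initial store
update = store_run_notifications(first, mask_params=False)
print(update)  # stored as-is: no double masking
```
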