mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +2 -35
- mlrun/__main__.py +3 -41
- mlrun/api/api/api.py +6 -0
- mlrun/api/api/endpoints/feature_store.py +0 -4
- mlrun/api/api/endpoints/files.py +14 -2
- mlrun/api/api/endpoints/frontend_spec.py +2 -1
- mlrun/api/api/endpoints/functions.py +95 -59
- mlrun/api/api/endpoints/grafana_proxy.py +9 -9
- mlrun/api/api/endpoints/logs.py +17 -3
- mlrun/api/api/endpoints/model_endpoints.py +3 -2
- mlrun/api/api/endpoints/pipelines.py +1 -5
- mlrun/api/api/endpoints/projects.py +88 -0
- mlrun/api/api/endpoints/runs.py +48 -6
- mlrun/api/api/endpoints/submit.py +2 -1
- mlrun/api/api/endpoints/workflows.py +355 -0
- mlrun/api/api/utils.py +3 -4
- mlrun/api/crud/__init__.py +1 -0
- mlrun/api/crud/client_spec.py +6 -2
- mlrun/api/crud/feature_store.py +5 -0
- mlrun/api/crud/model_monitoring/__init__.py +1 -0
- mlrun/api/crud/model_monitoring/deployment.py +497 -0
- mlrun/api/crud/model_monitoring/grafana.py +96 -42
- mlrun/api/crud/model_monitoring/helpers.py +159 -0
- mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
- mlrun/api/crud/notifications.py +9 -4
- mlrun/api/crud/pipelines.py +6 -11
- mlrun/api/crud/projects.py +2 -2
- mlrun/api/crud/runtime_resources.py +4 -3
- mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
- mlrun/api/crud/secrets.py +21 -0
- mlrun/api/crud/workflows.py +352 -0
- mlrun/api/db/base.py +16 -1
- mlrun/api/db/init_db.py +2 -4
- mlrun/api/db/session.py +1 -1
- mlrun/api/db/sqldb/db.py +129 -31
- mlrun/api/db/sqldb/models/models_mysql.py +15 -1
- mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
- mlrun/api/launcher.py +38 -6
- mlrun/api/main.py +3 -2
- mlrun/api/rundb/__init__.py +13 -0
- mlrun/{db → api/rundb}/sqldb.py +36 -84
- mlrun/api/runtime_handlers/__init__.py +56 -0
- mlrun/api/runtime_handlers/base.py +1247 -0
- mlrun/api/runtime_handlers/daskjob.py +209 -0
- mlrun/api/runtime_handlers/kubejob.py +37 -0
- mlrun/api/runtime_handlers/mpijob.py +147 -0
- mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
- mlrun/api/runtime_handlers/sparkjob.py +148 -0
- mlrun/api/schemas/__init__.py +17 -6
- mlrun/api/utils/builder.py +1 -4
- mlrun/api/utils/clients/chief.py +14 -0
- mlrun/api/utils/clients/iguazio.py +33 -33
- mlrun/api/utils/clients/nuclio.py +2 -2
- mlrun/api/utils/periodic.py +9 -2
- mlrun/api/utils/projects/follower.py +14 -7
- mlrun/api/utils/projects/leader.py +2 -1
- mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
- mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
- mlrun/api/utils/runtimes/__init__.py +14 -0
- mlrun/api/utils/runtimes/nuclio.py +43 -0
- mlrun/api/utils/scheduler.py +98 -15
- mlrun/api/utils/singletons/db.py +5 -1
- mlrun/api/utils/singletons/project_member.py +4 -1
- mlrun/api/utils/singletons/scheduler.py +1 -1
- mlrun/artifacts/base.py +6 -6
- mlrun/artifacts/dataset.py +4 -4
- mlrun/artifacts/manager.py +2 -3
- mlrun/artifacts/model.py +2 -2
- mlrun/artifacts/plots.py +8 -8
- mlrun/common/db/__init__.py +14 -0
- mlrun/common/helpers.py +37 -0
- mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
- mlrun/common/model_monitoring/helpers.py +69 -0
- mlrun/common/schemas/__init__.py +13 -1
- mlrun/common/schemas/auth.py +4 -1
- mlrun/common/schemas/client_spec.py +1 -1
- mlrun/common/schemas/function.py +17 -0
- mlrun/common/schemas/model_monitoring/__init__.py +48 -0
- mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
- mlrun/common/schemas/model_monitoring/grafana.py +55 -0
- mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
- mlrun/common/schemas/notification.py +1 -0
- mlrun/common/schemas/object.py +4 -0
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/schemas/regex.py +1 -1
- mlrun/common/schemas/runs.py +1 -8
- mlrun/common/schemas/schedule.py +1 -8
- mlrun/common/schemas/workflow.py +54 -0
- mlrun/config.py +45 -42
- mlrun/datastore/__init__.py +21 -0
- mlrun/datastore/base.py +1 -1
- mlrun/datastore/datastore.py +9 -0
- mlrun/datastore/dbfs_store.py +168 -0
- mlrun/datastore/helpers.py +18 -0
- mlrun/datastore/sources.py +1 -0
- mlrun/datastore/store_resources.py +2 -5
- mlrun/datastore/v3io.py +1 -2
- mlrun/db/__init__.py +4 -68
- mlrun/db/base.py +12 -0
- mlrun/db/factory.py +65 -0
- mlrun/db/httpdb.py +175 -20
- mlrun/db/nopdb.py +4 -2
- mlrun/execution.py +4 -2
- mlrun/feature_store/__init__.py +1 -0
- mlrun/feature_store/api.py +1 -2
- mlrun/feature_store/common.py +2 -1
- mlrun/feature_store/feature_set.py +1 -11
- mlrun/feature_store/feature_vector.py +340 -2
- mlrun/feature_store/ingestion.py +5 -10
- mlrun/feature_store/retrieval/base.py +118 -104
- mlrun/feature_store/retrieval/dask_merger.py +17 -10
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/local_merger.py +18 -18
- mlrun/feature_store/retrieval/spark_merger.py +21 -14
- mlrun/feature_store/retrieval/storey_merger.py +22 -16
- mlrun/kfpops.py +3 -9
- mlrun/launcher/base.py +57 -53
- mlrun/launcher/client.py +5 -4
- mlrun/launcher/factory.py +24 -13
- mlrun/launcher/local.py +6 -6
- mlrun/launcher/remote.py +4 -4
- mlrun/lists.py +0 -11
- mlrun/model.py +11 -17
- mlrun/model_monitoring/__init__.py +2 -22
- mlrun/model_monitoring/features_drift_table.py +1 -1
- mlrun/model_monitoring/helpers.py +22 -210
- mlrun/model_monitoring/model_endpoint.py +1 -1
- mlrun/model_monitoring/model_monitoring_batch.py +127 -50
- mlrun/model_monitoring/prometheus.py +219 -0
- mlrun/model_monitoring/stores/__init__.py +16 -11
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
- mlrun/model_monitoring/stores/models/mysql.py +47 -29
- mlrun/model_monitoring/stores/models/sqlite.py +47 -29
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
- mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
- mlrun/model_monitoring/tracking_policy.py +104 -0
- mlrun/package/packager.py +6 -8
- mlrun/package/packagers/default_packager.py +121 -10
- mlrun/package/packagers/numpy_packagers.py +1 -1
- mlrun/platforms/__init__.py +0 -2
- mlrun/platforms/iguazio.py +0 -56
- mlrun/projects/pipelines.py +53 -159
- mlrun/projects/project.py +10 -37
- mlrun/render.py +1 -1
- mlrun/run.py +8 -124
- mlrun/runtimes/__init__.py +6 -42
- mlrun/runtimes/base.py +29 -1249
- mlrun/runtimes/daskjob.py +2 -198
- mlrun/runtimes/funcdoc.py +0 -9
- mlrun/runtimes/function.py +25 -29
- mlrun/runtimes/kubejob.py +5 -29
- mlrun/runtimes/local.py +1 -1
- mlrun/runtimes/mpijob/__init__.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +10 -1
- mlrun/runtimes/mpijob/v1.py +0 -76
- mlrun/runtimes/mpijob/v1alpha1.py +1 -74
- mlrun/runtimes/nuclio.py +3 -2
- mlrun/runtimes/pod.py +28 -18
- mlrun/runtimes/remotesparkjob.py +1 -15
- mlrun/runtimes/serving.py +14 -6
- mlrun/runtimes/sparkjob/__init__.py +0 -1
- mlrun/runtimes/sparkjob/abstract.py +4 -131
- mlrun/runtimes/utils.py +0 -26
- mlrun/serving/routers.py +7 -7
- mlrun/serving/server.py +11 -8
- mlrun/serving/states.py +7 -1
- mlrun/serving/v2_serving.py +6 -6
- mlrun/utils/helpers.py +23 -42
- mlrun/utils/notifications/notification/__init__.py +4 -0
- mlrun/utils/notifications/notification/webhook.py +61 -0
- mlrun/utils/notifications/notification_pusher.py +5 -25
- mlrun/utils/regex.py +7 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
- mlrun/mlutils/data.py +0 -160
- mlrun/mlutils/models.py +0 -78
- mlrun/mlutils/plots.py +0 -902
- mlrun/utils/model_monitoring.py +0 -249
- /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -12,232 +12,44 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
#
|
|
15
|
-
import pathlib
|
|
16
|
-
import typing
|
|
17
|
-
|
|
18
|
-
import sqlalchemy.orm
|
|
19
|
-
from fastapi import Depends
|
|
20
|
-
|
|
21
|
-
import mlrun
|
|
22
|
-
import mlrun.api.api.utils
|
|
23
|
-
import mlrun.api.crud.secrets
|
|
24
|
-
import mlrun.api.utils.singletons.db
|
|
25
|
-
import mlrun.api.utils.singletons.k8s
|
|
26
|
-
import mlrun.common.model_monitoring as model_monitoring_constants
|
|
27
|
-
import mlrun.common.schemas
|
|
28
|
-
import mlrun.config
|
|
29
|
-
import mlrun.feature_store as fstore
|
|
30
|
-
import mlrun.model_monitoring.stream_processing_fs
|
|
31
|
-
import mlrun.runtimes
|
|
32
|
-
import mlrun.utils.helpers
|
|
33
|
-
import mlrun.utils.model_monitoring
|
|
34
|
-
from mlrun.api.api import deps
|
|
35
|
-
|
|
36
|
-
_CURRENT_FILE_PATH = pathlib.Path(__file__)
|
|
37
|
-
_STREAM_PROCESSING_FUNCTION_PATH = _CURRENT_FILE_PATH.parent / "stream_processing_fs.py"
|
|
38
|
-
_MONIOTINRG_BATCH_FUNCTION_PATH = (
|
|
39
|
-
_CURRENT_FILE_PATH.parent / "model_monitoring_batch.py"
|
|
40
|
-
)
|
|
41
|
-
|
|
42
15
|
|
|
43
|
-
def initial_model_monitoring_stream_processing_function(
|
|
44
|
-
project: str,
|
|
45
|
-
model_monitoring_access_key: str,
|
|
46
|
-
tracking_policy: mlrun.utils.model_monitoring.TrackingPolicy,
|
|
47
|
-
auth_info: mlrun.common.schemas.AuthInfo,
|
|
48
|
-
parquet_target: str,
|
|
49
|
-
):
|
|
50
|
-
"""
|
|
51
|
-
Initialize model monitoring stream processing function.
|
|
52
|
-
|
|
53
|
-
:param project: Project name.
|
|
54
|
-
:param model_monitoring_access_key: Access key to apply the model monitoring process. Please note that in CE
|
|
55
|
-
deployments this parameter will be None.
|
|
56
|
-
:param tracking_policy: Model monitoring configurations.
|
|
57
|
-
:param auth_info: The auth info of the request.
|
|
58
|
-
:parquet_target: Path to model monitoring parquet file that will be generated by the monitoring
|
|
59
|
-
stream nuclio function.
|
|
60
|
-
|
|
61
|
-
:return: A function object from a mlrun runtime class
|
|
62
|
-
|
|
63
|
-
"""
|
|
64
16
|
|
|
65
|
-
|
|
66
|
-
stream_processor = mlrun.model_monitoring.stream_processing_fs.EventStreamProcessor(
|
|
67
|
-
project=project,
|
|
68
|
-
parquet_batching_max_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
|
|
69
|
-
parquet_target=parquet_target,
|
|
70
|
-
model_monitoring_access_key=model_monitoring_access_key,
|
|
71
|
-
)
|
|
17
|
+
import typing
|
|
72
18
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
name="model-monitoring-stream",
|
|
76
|
-
project=project,
|
|
77
|
-
filename=str(_STREAM_PROCESSING_FUNCTION_PATH),
|
|
78
|
-
kind="serving",
|
|
79
|
-
image=tracking_policy.stream_image,
|
|
80
|
-
)
|
|
19
|
+
import mlrun.common.model_monitoring.helpers
|
|
20
|
+
import mlrun.common.schemas
|
|
81
21
|
|
|
82
|
-
# Create monitoring serving graph
|
|
83
|
-
stream_processor.apply_monitoring_serving_graph(function)
|
|
84
22
|
|
|
85
|
-
|
|
86
|
-
|
|
23
|
+
def get_stream_path(project: str = None):
|
|
24
|
+
"""Get stream path from the project secret. If wasn't set, take it from the system configurations"""
|
|
87
25
|
|
|
88
|
-
|
|
89
|
-
|
|
26
|
+
stream_uri = mlrun.get_secret_or_env(
|
|
27
|
+
mlrun.common.schemas.model_monitoring.ProjectSecretKeys.STREAM_PATH
|
|
28
|
+
) or mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
90
29
|
project=project,
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
auth_info=auth_info,
|
|
30
|
+
kind=mlrun.common.schemas.model_monitoring.FileTargetKind.STREAM,
|
|
31
|
+
target="online",
|
|
94
32
|
)
|
|
95
33
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
function.spec.parameters = run_config.parameters
|
|
99
|
-
|
|
100
|
-
return function
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def get_model_monitoring_batch_function(
|
|
104
|
-
project: str,
|
|
105
|
-
model_monitoring_access_key: str,
|
|
106
|
-
db_session: sqlalchemy.orm.Session,
|
|
107
|
-
auth_info: mlrun.common.schemas.AuthInfo,
|
|
108
|
-
tracking_policy: mlrun.utils.model_monitoring.TrackingPolicy,
|
|
109
|
-
):
|
|
110
|
-
"""
|
|
111
|
-
Initialize model monitoring batch function.
|
|
112
|
-
|
|
113
|
-
:param project: project name.
|
|
114
|
-
:param model_monitoring_access_key: access key to apply the model monitoring process. Please note that in CE
|
|
115
|
-
deployments this parameter will be None.
|
|
116
|
-
:param db_session: A session that manages the current dialog with the database.
|
|
117
|
-
:param auth_info: The auth info of the request.
|
|
118
|
-
:param tracking_policy: Model monitoring configurations.
|
|
119
|
-
|
|
120
|
-
:return: A function object from a mlrun runtime class
|
|
121
|
-
|
|
122
|
-
"""
|
|
123
|
-
|
|
124
|
-
# Create job function runtime for the model monitoring batch
|
|
125
|
-
function: mlrun.runtimes.KubejobRuntime = mlrun.code_to_function(
|
|
126
|
-
name="model-monitoring-batch",
|
|
127
|
-
project=project,
|
|
128
|
-
filename=str(_MONIOTINRG_BATCH_FUNCTION_PATH),
|
|
129
|
-
kind="job",
|
|
130
|
-
image=tracking_policy.default_batch_image,
|
|
131
|
-
handler="handler",
|
|
34
|
+
return mlrun.common.model_monitoring.helpers.parse_monitoring_stream_path(
|
|
35
|
+
stream_uri=stream_uri, project=project
|
|
132
36
|
)
|
|
133
|
-
function.set_db_connection(mlrun.api.api.utils.get_run_db_instance(db_session))
|
|
134
37
|
|
|
135
|
-
# Set the project to the job function
|
|
136
|
-
function.metadata.project = project
|
|
137
38
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
function=function,
|
|
142
|
-
model_monitoring_access_key=model_monitoring_access_key,
|
|
143
|
-
auth_info=auth_info,
|
|
144
|
-
)
|
|
39
|
+
def get_connection_string(secret_provider: typing.Callable = None) -> str:
|
|
40
|
+
"""Get endpoint store connection string from the project secret. If wasn't set, take it from the system
|
|
41
|
+
configurations.
|
|
145
42
|
|
|
146
|
-
|
|
147
|
-
mlrun.api.api.utils.apply_enrichment_and_validation_on_function(function, auth_info)
|
|
43
|
+
:param secret_provider: An optional secret provider to get the connection string secret.
|
|
148
44
|
|
|
149
|
-
return
|
|
45
|
+
:return: Valid SQL connection string.
|
|
150
46
|
|
|
151
|
-
|
|
152
|
-
def _apply_stream_trigger(
|
|
153
|
-
project: str,
|
|
154
|
-
function: mlrun.runtimes.ServingRuntime,
|
|
155
|
-
model_monitoring_access_key: str = None,
|
|
156
|
-
auth_info: mlrun.common.schemas.AuthInfo = Depends(deps.authenticate_request),
|
|
157
|
-
) -> mlrun.runtimes.ServingRuntime:
|
|
158
|
-
"""Adding stream source for the nuclio serving function. By default, the function has HTTP stream trigger along
|
|
159
|
-
with another supported stream source that can be either Kafka or V3IO, depends on the stream path schema that is
|
|
160
|
-
defined under mlrun.mlconf.model_endpoint_monitoring.store_prefixes. Note that if no valid stream path has been
|
|
161
|
-
provided then the function will have a single HTTP stream source.
|
|
162
|
-
|
|
163
|
-
:param project: Project name.
|
|
164
|
-
:param function: The serving function object that will be applied with the stream trigger.
|
|
165
|
-
:param model_monitoring_access_key: Access key to apply the model monitoring stream function when the stream is
|
|
166
|
-
schema is V3IO.
|
|
167
|
-
:param auth_info: The auth info of the request.
|
|
168
|
-
|
|
169
|
-
:return: ServingRuntime object with stream trigger.
|
|
170
47
|
"""
|
|
171
48
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
if stream_path.startswith("kafka://"):
|
|
177
|
-
|
|
178
|
-
topic, brokers = mlrun.datastore.utils.parse_kafka_url(url=stream_path)
|
|
179
|
-
# Generate Kafka stream source
|
|
180
|
-
stream_source = mlrun.datastore.sources.KafkaSource(
|
|
181
|
-
brokers=brokers,
|
|
182
|
-
topics=[topic],
|
|
49
|
+
return (
|
|
50
|
+
mlrun.get_secret_or_env(
|
|
51
|
+
key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ENDPOINT_STORE_CONNECTION,
|
|
52
|
+
secret_provider=secret_provider,
|
|
183
53
|
)
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
if not mlrun.mlconf.is_ce_mode():
|
|
187
|
-
function = _apply_access_key_and_mount_function(
|
|
188
|
-
project=project,
|
|
189
|
-
function=function,
|
|
190
|
-
model_monitoring_access_key=model_monitoring_access_key,
|
|
191
|
-
auth_info=auth_info,
|
|
192
|
-
)
|
|
193
|
-
if stream_path.startswith("v3io://"):
|
|
194
|
-
# Generate V3IO stream trigger
|
|
195
|
-
function.add_v3io_stream_trigger(
|
|
196
|
-
stream_path=stream_path, name="monitoring_stream_trigger"
|
|
197
|
-
)
|
|
198
|
-
# Add the default HTTP source
|
|
199
|
-
http_source = mlrun.datastore.sources.HttpSource()
|
|
200
|
-
function = http_source.add_nuclio_trigger(function)
|
|
201
|
-
|
|
202
|
-
return function
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
def _apply_access_key_and_mount_function(
|
|
206
|
-
project: str,
|
|
207
|
-
function: typing.Union[
|
|
208
|
-
mlrun.runtimes.KubejobRuntime, mlrun.runtimes.ServingRuntime
|
|
209
|
-
],
|
|
210
|
-
model_monitoring_access_key: str,
|
|
211
|
-
auth_info: mlrun.common.schemas.AuthInfo,
|
|
212
|
-
) -> typing.Union[mlrun.runtimes.KubejobRuntime, mlrun.runtimes.ServingRuntime]:
|
|
213
|
-
"""Applying model monitoring access key on the provided function when using V3IO path. In addition, this method
|
|
214
|
-
mount the V3IO path for the provided function to configure the access to the system files.
|
|
215
|
-
|
|
216
|
-
:param project: Project name.
|
|
217
|
-
:param function: Model monitoring function object that will be filled with the access key and
|
|
218
|
-
the access to the system files.
|
|
219
|
-
:param model_monitoring_access_key: Access key to apply the model monitoring stream function when the stream is
|
|
220
|
-
schema is V3IO.
|
|
221
|
-
:param auth_info: The auth info of the request.
|
|
222
|
-
|
|
223
|
-
:return: function runtime object with access key and access to system files.
|
|
224
|
-
"""
|
|
225
|
-
|
|
226
|
-
# Set model monitoring access key for managing permissions
|
|
227
|
-
function.set_env_from_secret(
|
|
228
|
-
model_monitoring_constants.ProjectSecretKeys.ACCESS_KEY,
|
|
229
|
-
mlrun.api.utils.singletons.k8s.get_k8s_helper().get_project_secret_name(
|
|
230
|
-
project
|
|
231
|
-
),
|
|
232
|
-
mlrun.api.crud.secrets.Secrets().generate_client_project_secret_key(
|
|
233
|
-
mlrun.api.crud.secrets.SecretsClientType.model_monitoring,
|
|
234
|
-
model_monitoring_constants.ProjectSecretKeys.ACCESS_KEY,
|
|
235
|
-
),
|
|
54
|
+
or mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection
|
|
236
55
|
)
|
|
237
|
-
function.metadata.credentials.access_key = model_monitoring_access_key
|
|
238
|
-
function.apply(mlrun.mount_v3io())
|
|
239
|
-
|
|
240
|
-
# Ensure that the auth env vars are set
|
|
241
|
-
mlrun.api.api.utils.ensure_function_has_auth_set(function, auth_info)
|
|
242
|
-
|
|
243
|
-
return function
|
|
@@ -27,16 +27,11 @@ import v3io
|
|
|
27
27
|
import v3io.dataplane
|
|
28
28
|
import v3io_frames
|
|
29
29
|
|
|
30
|
-
import mlrun
|
|
31
|
-
import mlrun.common.model_monitoring
|
|
32
|
-
import mlrun.common.schemas
|
|
30
|
+
import mlrun.common.helpers
|
|
31
|
+
import mlrun.common.model_monitoring.helpers
|
|
32
|
+
import mlrun.common.schemas.model_monitoring
|
|
33
33
|
import mlrun.data_types.infer
|
|
34
34
|
import mlrun.feature_store as fstore
|
|
35
|
-
import mlrun.model_monitoring
|
|
36
|
-
import mlrun.model_monitoring.stores
|
|
37
|
-
import mlrun.run
|
|
38
|
-
import mlrun.utils.helpers
|
|
39
|
-
import mlrun.utils.model_monitoring
|
|
40
35
|
import mlrun.utils.v3io_clients
|
|
41
36
|
from mlrun.utils import logger
|
|
42
37
|
|
|
@@ -497,7 +492,6 @@ class BatchProcessor:
|
|
|
497
492
|
context: mlrun.run.MLClientCtx,
|
|
498
493
|
project: str,
|
|
499
494
|
):
|
|
500
|
-
|
|
501
495
|
"""
|
|
502
496
|
Initialize Batch Processor object.
|
|
503
497
|
|
|
@@ -525,9 +519,7 @@ class BatchProcessor:
|
|
|
525
519
|
|
|
526
520
|
# Get a runtime database
|
|
527
521
|
|
|
528
|
-
self.db = mlrun.model_monitoring.
|
|
529
|
-
project=project
|
|
530
|
-
)
|
|
522
|
+
self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
|
|
531
523
|
|
|
532
524
|
if not mlrun.mlconf.is_ce_mode():
|
|
533
525
|
# TODO: Once there is a time series DB alternative in a non-CE deployment, we need to update this if
|
|
@@ -539,7 +531,7 @@ class BatchProcessor:
|
|
|
539
531
|
|
|
540
532
|
# Get the batch interval range
|
|
541
533
|
self.batch_dict = context.parameters[
|
|
542
|
-
mlrun.common.model_monitoring.EventFieldType.BATCH_INTERVALS_DICT
|
|
534
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.BATCH_INTERVALS_DICT
|
|
543
535
|
]
|
|
544
536
|
|
|
545
537
|
# TODO: This will be removed in 1.5.0 once the job params can be parsed with different types
|
|
@@ -556,23 +548,27 @@ class BatchProcessor:
|
|
|
556
548
|
# Define the required paths for the project objects
|
|
557
549
|
tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
558
550
|
project=self.project,
|
|
559
|
-
kind=mlrun.common.model_monitoring.FileTargetKind.EVENTS,
|
|
551
|
+
kind=mlrun.common.schemas.model_monitoring.FileTargetKind.EVENTS,
|
|
560
552
|
)
|
|
561
553
|
(
|
|
562
554
|
_,
|
|
563
555
|
self.tsdb_container,
|
|
564
556
|
self.tsdb_path,
|
|
565
|
-
) = mlrun.
|
|
557
|
+
) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
|
|
558
|
+
tsdb_path
|
|
559
|
+
)
|
|
566
560
|
# stream_path = template.format(project=self.project, kind="log_stream")
|
|
567
561
|
stream_path = mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
568
562
|
project=self.project,
|
|
569
|
-
kind=mlrun.common.model_monitoring.FileTargetKind.LOG_STREAM,
|
|
563
|
+
kind=mlrun.common.schemas.model_monitoring.FileTargetKind.LOG_STREAM,
|
|
570
564
|
)
|
|
571
565
|
(
|
|
572
566
|
_,
|
|
573
567
|
self.stream_container,
|
|
574
568
|
self.stream_path,
|
|
575
|
-
) = mlrun.
|
|
569
|
+
) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
|
|
570
|
+
stream_path
|
|
571
|
+
)
|
|
576
572
|
|
|
577
573
|
# Get the frames clients based on the v3io configuration
|
|
578
574
|
# it will be used later for writing the results into the tsdb
|
|
@@ -619,24 +615,24 @@ class BatchProcessor:
|
|
|
619
615
|
|
|
620
616
|
for endpoint in endpoints:
|
|
621
617
|
if (
|
|
622
|
-
endpoint[mlrun.common.model_monitoring.EventFieldType.ACTIVE]
|
|
618
|
+
endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.ACTIVE]
|
|
623
619
|
and endpoint[
|
|
624
|
-
mlrun.common.model_monitoring.EventFieldType.MONITORING_MODE
|
|
620
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.MONITORING_MODE
|
|
625
621
|
]
|
|
626
|
-
== mlrun.common.model_monitoring.ModelMonitoringMode.enabled.value
|
|
622
|
+
== mlrun.common.schemas.model_monitoring.ModelMonitoringMode.enabled.value
|
|
627
623
|
):
|
|
628
624
|
# Skip router endpoint:
|
|
629
625
|
if (
|
|
630
626
|
int(
|
|
631
627
|
endpoint[
|
|
632
|
-
mlrun.common.model_monitoring.EventFieldType.ENDPOINT_TYPE
|
|
628
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_TYPE
|
|
633
629
|
]
|
|
634
630
|
)
|
|
635
|
-
== mlrun.common.model_monitoring.EndpointType.ROUTER
|
|
631
|
+
== mlrun.common.schemas.model_monitoring.EndpointType.ROUTER
|
|
636
632
|
):
|
|
637
633
|
# Router endpoint has no feature stats
|
|
638
634
|
logger.info(
|
|
639
|
-
f"{endpoint[mlrun.common.model_monitoring.EventFieldType.UID]} is router skipping"
|
|
635
|
+
f"{endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID]} is router skipping"
|
|
640
636
|
)
|
|
641
637
|
continue
|
|
642
638
|
self.update_drift_metrics(endpoint=endpoint)
|
|
@@ -649,12 +645,14 @@ class BatchProcessor:
|
|
|
649
645
|
serving_function_name,
|
|
650
646
|
_,
|
|
651
647
|
_,
|
|
652
|
-
) = mlrun.
|
|
653
|
-
endpoint[
|
|
648
|
+
) = mlrun.common.helpers.parse_versioned_object_uri(
|
|
649
|
+
endpoint[
|
|
650
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.FUNCTION_URI
|
|
651
|
+
]
|
|
654
652
|
)
|
|
655
653
|
|
|
656
654
|
model_name = endpoint[
|
|
657
|
-
mlrun.common.model_monitoring.EventFieldType.MODEL
|
|
655
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.MODEL
|
|
658
656
|
].replace(":", "-")
|
|
659
657
|
|
|
660
658
|
m_fs = fstore.get_feature_set(
|
|
@@ -668,7 +666,7 @@ class BatchProcessor:
|
|
|
668
666
|
df = m_fs.to_dataframe(
|
|
669
667
|
start_time=start_time,
|
|
670
668
|
end_time=end_time,
|
|
671
|
-
time_column=mlrun.common.model_monitoring.EventFieldType.TIMESTAMP,
|
|
669
|
+
time_column=mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP,
|
|
672
670
|
)
|
|
673
671
|
|
|
674
672
|
if len(df) == 0:
|
|
@@ -676,7 +674,7 @@ class BatchProcessor:
|
|
|
676
674
|
"Not enough model events since the beginning of the batch interval",
|
|
677
675
|
parquet_target=m_fs.status.targets[0].path,
|
|
678
676
|
endpoint=endpoint[
|
|
679
|
-
mlrun.common.model_monitoring.EventFieldType.UID
|
|
677
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.UID
|
|
680
678
|
],
|
|
681
679
|
min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
|
|
682
680
|
start_time=str(
|
|
@@ -694,7 +692,9 @@ class BatchProcessor:
|
|
|
694
692
|
logger.warn(
|
|
695
693
|
"Parquet not found, probably due to not enough model events",
|
|
696
694
|
parquet_target=m_fs.status.targets[0].path,
|
|
697
|
-
endpoint=endpoint[
|
|
695
|
+
endpoint=endpoint[
|
|
696
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.UID
|
|
697
|
+
],
|
|
698
698
|
min_rqeuired_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
|
|
699
699
|
)
|
|
700
700
|
return
|
|
@@ -706,14 +706,16 @@ class BatchProcessor:
|
|
|
706
706
|
|
|
707
707
|
# Create DataFrame based on the input features
|
|
708
708
|
stats_columns = [
|
|
709
|
-
mlrun.common.model_monitoring.EventFieldType.TIMESTAMP,
|
|
709
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP,
|
|
710
710
|
*feature_names,
|
|
711
711
|
]
|
|
712
712
|
|
|
713
713
|
# Add label names if provided
|
|
714
|
-
if endpoint[
|
|
714
|
+
if endpoint[
|
|
715
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.LABEL_NAMES
|
|
716
|
+
]:
|
|
715
717
|
labels = endpoint[
|
|
716
|
-
mlrun.common.model_monitoring.EventFieldType.LABEL_NAMES
|
|
718
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.LABEL_NAMES
|
|
717
719
|
]
|
|
718
720
|
if isinstance(labels, str):
|
|
719
721
|
labels = json.loads(labels)
|
|
@@ -731,13 +733,15 @@ class BatchProcessor:
|
|
|
731
733
|
m_fs.save()
|
|
732
734
|
|
|
733
735
|
# Get the timestamp of the latest request:
|
|
734
|
-
timestamp = df[
|
|
735
|
-
|
|
736
|
-
]
|
|
736
|
+
timestamp = df[
|
|
737
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.TIMESTAMP
|
|
738
|
+
].iloc[-1]
|
|
737
739
|
|
|
738
740
|
# Get the feature stats from the model endpoint for reference data
|
|
739
741
|
feature_stats = json.loads(
|
|
740
|
-
endpoint[
|
|
742
|
+
endpoint[
|
|
743
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS
|
|
744
|
+
]
|
|
741
745
|
)
|
|
742
746
|
|
|
743
747
|
# Get the current stats:
|
|
@@ -758,7 +762,7 @@ class BatchProcessor:
|
|
|
758
762
|
monitor_configuration = (
|
|
759
763
|
json.loads(
|
|
760
764
|
endpoint[
|
|
761
|
-
mlrun.common.model_monitoring.EventFieldType.MONITOR_CONFIGURATION
|
|
765
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.MONITOR_CONFIGURATION
|
|
762
766
|
]
|
|
763
767
|
)
|
|
764
768
|
or {}
|
|
@@ -778,7 +782,9 @@ class BatchProcessor:
|
|
|
778
782
|
)
|
|
779
783
|
logger.info(
|
|
780
784
|
"Drift status",
|
|
781
|
-
endpoint_id=endpoint[
|
|
785
|
+
endpoint_id=endpoint[
|
|
786
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.UID
|
|
787
|
+
],
|
|
782
788
|
drift_status=drift_status.value,
|
|
783
789
|
drift_measure=drift_measure,
|
|
784
790
|
)
|
|
@@ -790,40 +796,54 @@ class BatchProcessor:
|
|
|
790
796
|
}
|
|
791
797
|
|
|
792
798
|
self.db.update_model_endpoint(
|
|
793
|
-
endpoint_id=endpoint[
|
|
799
|
+
endpoint_id=endpoint[
|
|
800
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.UID
|
|
801
|
+
],
|
|
794
802
|
attributes=attributes,
|
|
795
803
|
)
|
|
796
804
|
|
|
797
805
|
if not mlrun.mlconf.is_ce_mode():
|
|
798
806
|
# Update drift results in TSDB
|
|
799
|
-
self.
|
|
807
|
+
self._update_drift_in_v3io_tsdb(
|
|
800
808
|
endpoint_id=endpoint[
|
|
801
|
-
mlrun.common.model_monitoring.EventFieldType.UID
|
|
809
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.UID
|
|
802
810
|
],
|
|
803
811
|
drift_status=drift_status,
|
|
804
812
|
drift_measure=drift_measure,
|
|
805
813
|
drift_result=drift_result,
|
|
806
814
|
timestamp=timestamp,
|
|
807
815
|
)
|
|
808
|
-
|
|
809
|
-
|
|
816
|
+
|
|
817
|
+
else:
|
|
818
|
+
# Update drift results in Prometheus
|
|
819
|
+
self._update_drift_in_prometheus(
|
|
810
820
|
endpoint_id=endpoint[
|
|
811
|
-
mlrun.common.model_monitoring.EventFieldType.UID
|
|
821
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.UID
|
|
812
822
|
],
|
|
823
|
+
drift_status=drift_status,
|
|
824
|
+
drift_result=drift_result,
|
|
813
825
|
)
|
|
814
826
|
|
|
815
827
|
except Exception as e:
|
|
816
828
|
logger.error(
|
|
817
|
-
f"Exception for endpoint {endpoint[mlrun.common.model_monitoring.EventFieldType.UID]}"
|
|
829
|
+
f"Exception for endpoint {endpoint[mlrun.common.schemas.model_monitoring.EventFieldType.UID]}"
|
|
818
830
|
)
|
|
819
831
|
self.exception = e
|
|
832
|
+
logger.info(
|
|
833
|
+
"Done updating drift measures",
|
|
834
|
+
endpoint_id=endpoint[
|
|
835
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.UID
|
|
836
|
+
],
|
|
837
|
+
)
|
|
820
838
|
|
|
821
839
|
def _get_interval_range(self) -> Tuple[datetime.datetime, datetime.datetime]:
|
|
822
840
|
"""Getting batch interval time range"""
|
|
823
841
|
minutes, hours, days = (
|
|
824
|
-
self.batch_dict[
|
|
825
|
-
|
|
826
|
-
|
|
842
|
+
self.batch_dict[
|
|
843
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.MINUTES
|
|
844
|
+
],
|
|
845
|
+
self.batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.HOURS],
|
|
846
|
+
self.batch_dict[mlrun.common.schemas.model_monitoring.EventFieldType.DAYS],
|
|
827
847
|
)
|
|
828
848
|
start_time = datetime.datetime.now() - datetime.timedelta(
|
|
829
849
|
minutes=minutes, hours=hours, days=days
|
|
@@ -843,7 +863,7 @@ class BatchProcessor:
|
|
|
843
863
|
pair_list = pair.split(":")
|
|
844
864
|
self.batch_dict[pair_list[0]] = float(pair_list[1])
|
|
845
865
|
|
|
846
|
-
def
|
|
866
|
+
def _update_drift_in_v3io_tsdb(
|
|
847
867
|
self,
|
|
848
868
|
endpoint_id: str,
|
|
849
869
|
drift_status: DriftStatus,
|
|
@@ -888,7 +908,7 @@ class BatchProcessor:
|
|
|
888
908
|
"endpoint_id": endpoint_id,
|
|
889
909
|
"timestamp": pd.to_datetime(
|
|
890
910
|
timestamp,
|
|
891
|
-
format=mlrun.common.model_monitoring.EventFieldType.TIME_FORMAT,
|
|
911
|
+
format=mlrun.common.schemas.model_monitoring.EventFieldType.TIME_FORMAT,
|
|
892
912
|
),
|
|
893
913
|
"record_type": "drift_measures",
|
|
894
914
|
"tvd_mean": drift_result["tvd_mean"],
|
|
@@ -911,6 +931,63 @@ class BatchProcessor:
|
|
|
911
931
|
endpoint=endpoint_id,
|
|
912
932
|
)
|
|
913
933
|
|
|
934
|
+
def _update_drift_in_prometheus(
|
|
935
|
+
self,
|
|
936
|
+
endpoint_id: str,
|
|
937
|
+
drift_status: DriftStatus,
|
|
938
|
+
drift_result: Dict[str, Dict[str, Any]],
|
|
939
|
+
):
|
|
940
|
+
"""Push drift metrics to Prometheus registry. Please note that the metrics are being pushed through HTTP
|
|
941
|
+
to the monitoring stream pod that writes them into a local registry. Afterwards, Prometheus wil scrape these
|
|
942
|
+
metrics that will be available in the Grafana charts.
|
|
943
|
+
|
|
944
|
+
:param endpoint_id: The unique id of the model endpoint.
|
|
945
|
+
:param drift_status: Drift status result. Possible values can be found under DriftStatus enum class.
|
|
946
|
+
:param drift_result: A dictionary that includes the drift results for each feature.
|
|
947
|
+
|
|
948
|
+
|
|
949
|
+
"""
|
|
950
|
+
stream_http_path = (
|
|
951
|
+
mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
|
|
952
|
+
project=self.project
|
|
953
|
+
)
|
|
954
|
+
)
|
|
955
|
+
|
|
956
|
+
statistical_metrics = ["hellinger_mean", "tvd_mean", "kld_mean"]
|
|
957
|
+
metrics = []
|
|
958
|
+
for metric in statistical_metrics:
|
|
959
|
+
metrics.append(
|
|
960
|
+
{
|
|
961
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID: endpoint_id,
|
|
962
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.METRIC: metric,
|
|
963
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.VALUE: drift_result[
|
|
964
|
+
metric
|
|
965
|
+
],
|
|
966
|
+
}
|
|
967
|
+
)
|
|
968
|
+
|
|
969
|
+
http_session = mlrun.utils.HTTPSessionWithRetry(
|
|
970
|
+
retry_on_post=True,
|
|
971
|
+
verbose=True,
|
|
972
|
+
)
|
|
973
|
+
|
|
974
|
+
http_session.request(
|
|
975
|
+
method="POST",
|
|
976
|
+
url=stream_http_path + "/monitoring-batch-metrics",
|
|
977
|
+
data=json.dumps(metrics),
|
|
978
|
+
)
|
|
979
|
+
|
|
980
|
+
drift_status_dict = {
|
|
981
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID: endpoint_id,
|
|
982
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.DRIFT_STATUS: drift_status.value,
|
|
983
|
+
}
|
|
984
|
+
|
|
985
|
+
http_session.request(
|
|
986
|
+
method="POST",
|
|
987
|
+
url=stream_http_path + "/monitoring-drift-status",
|
|
988
|
+
data=json.dumps(drift_status_dict),
|
|
989
|
+
)
|
|
990
|
+
|
|
914
991
|
|
|
915
992
|
def handler(context: mlrun.run.MLClientCtx):
|
|
916
993
|
batch_processor = BatchProcessor(
|