mlrun 1.5.0rc1__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +2 -35
- mlrun/__main__.py +1 -40
- mlrun/api/api/api.py +6 -0
- mlrun/api/api/endpoints/feature_store.py +0 -4
- mlrun/api/api/endpoints/files.py +14 -2
- mlrun/api/api/endpoints/functions.py +6 -1
- mlrun/api/api/endpoints/logs.py +17 -3
- mlrun/api/api/endpoints/pipelines.py +1 -5
- mlrun/api/api/endpoints/projects.py +88 -0
- mlrun/api/api/endpoints/runs.py +48 -6
- mlrun/api/api/endpoints/workflows.py +355 -0
- mlrun/api/api/utils.py +1 -1
- mlrun/api/crud/__init__.py +1 -0
- mlrun/api/crud/client_spec.py +3 -0
- mlrun/api/crud/model_monitoring/deployment.py +36 -7
- mlrun/api/crud/model_monitoring/grafana.py +1 -1
- mlrun/api/crud/model_monitoring/helpers.py +32 -2
- mlrun/api/crud/model_monitoring/model_endpoints.py +27 -5
- mlrun/api/crud/notifications.py +9 -4
- mlrun/api/crud/pipelines.py +4 -9
- mlrun/api/crud/runtime_resources.py +4 -3
- mlrun/api/crud/secrets.py +21 -0
- mlrun/api/crud/workflows.py +352 -0
- mlrun/api/db/base.py +16 -1
- mlrun/api/db/sqldb/db.py +97 -16
- mlrun/api/launcher.py +26 -7
- mlrun/api/main.py +3 -4
- mlrun/{mlutils → api/rundb}/__init__.py +2 -6
- mlrun/{db → api/rundb}/sqldb.py +35 -83
- mlrun/api/runtime_handlers/__init__.py +56 -0
- mlrun/api/runtime_handlers/base.py +1247 -0
- mlrun/api/runtime_handlers/daskjob.py +209 -0
- mlrun/api/runtime_handlers/kubejob.py +37 -0
- mlrun/api/runtime_handlers/mpijob.py +147 -0
- mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
- mlrun/api/runtime_handlers/sparkjob.py +148 -0
- mlrun/api/utils/builder.py +1 -4
- mlrun/api/utils/clients/chief.py +14 -0
- mlrun/api/utils/scheduler.py +98 -15
- mlrun/api/utils/singletons/db.py +4 -0
- mlrun/artifacts/manager.py +1 -2
- mlrun/common/schemas/__init__.py +6 -0
- mlrun/common/schemas/auth.py +4 -1
- mlrun/common/schemas/client_spec.py +1 -1
- mlrun/common/schemas/model_monitoring/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/constants.py +11 -0
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/schemas/runs.py +1 -8
- mlrun/common/schemas/schedule.py +1 -8
- mlrun/common/schemas/workflow.py +54 -0
- mlrun/config.py +42 -40
- mlrun/datastore/sources.py +1 -1
- mlrun/db/__init__.py +4 -68
- mlrun/db/base.py +12 -0
- mlrun/db/factory.py +65 -0
- mlrun/db/httpdb.py +175 -19
- mlrun/db/nopdb.py +4 -2
- mlrun/execution.py +4 -2
- mlrun/feature_store/__init__.py +1 -0
- mlrun/feature_store/api.py +1 -2
- mlrun/feature_store/feature_set.py +0 -10
- mlrun/feature_store/feature_vector.py +340 -2
- mlrun/feature_store/ingestion.py +5 -10
- mlrun/feature_store/retrieval/base.py +118 -104
- mlrun/feature_store/retrieval/dask_merger.py +17 -10
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/local_merger.py +18 -18
- mlrun/feature_store/retrieval/spark_merger.py +21 -14
- mlrun/feature_store/retrieval/storey_merger.py +21 -15
- mlrun/kfpops.py +3 -9
- mlrun/launcher/base.py +3 -3
- mlrun/launcher/client.py +3 -2
- mlrun/launcher/factory.py +16 -13
- mlrun/lists.py +0 -11
- mlrun/model.py +9 -15
- mlrun/model_monitoring/helpers.py +15 -25
- mlrun/model_monitoring/model_monitoring_batch.py +72 -4
- mlrun/model_monitoring/prometheus.py +219 -0
- mlrun/model_monitoring/stores/__init__.py +15 -9
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +3 -1
- mlrun/model_monitoring/stream_processing.py +181 -29
- mlrun/package/packager.py +6 -8
- mlrun/package/packagers/default_packager.py +121 -10
- mlrun/platforms/__init__.py +0 -2
- mlrun/platforms/iguazio.py +0 -56
- mlrun/projects/pipelines.py +57 -158
- mlrun/projects/project.py +6 -32
- mlrun/render.py +1 -1
- mlrun/run.py +2 -124
- mlrun/runtimes/__init__.py +6 -42
- mlrun/runtimes/base.py +26 -1241
- mlrun/runtimes/daskjob.py +2 -198
- mlrun/runtimes/function.py +16 -5
- mlrun/runtimes/kubejob.py +5 -29
- mlrun/runtimes/mpijob/__init__.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +10 -1
- mlrun/runtimes/mpijob/v1.py +0 -76
- mlrun/runtimes/mpijob/v1alpha1.py +1 -74
- mlrun/runtimes/nuclio.py +3 -2
- mlrun/runtimes/pod.py +0 -10
- mlrun/runtimes/remotesparkjob.py +1 -15
- mlrun/runtimes/serving.py +1 -1
- mlrun/runtimes/sparkjob/__init__.py +0 -1
- mlrun/runtimes/sparkjob/abstract.py +4 -131
- mlrun/serving/states.py +1 -1
- mlrun/utils/db.py +0 -2
- mlrun/utils/helpers.py +19 -13
- mlrun/utils/notifications/notification_pusher.py +5 -25
- mlrun/utils/regex.py +7 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +24 -23
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +116 -107
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
- mlrun/mlutils/data.py +0 -160
- mlrun/mlutils/models.py +0 -78
- mlrun/mlutils/plots.py +0 -902
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
# Copyright 2023 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
#
|
|
15
|
+
import typing
|
|
16
|
+
|
|
17
|
+
import prometheus_client
|
|
18
|
+
|
|
19
|
+
from mlrun.common.schemas.model_monitoring import EventFieldType, PrometheusMetric
|
|
20
|
+
|
|
21
|
+
# Memory path for Prometheus registry file
|
|
22
|
+
_registry_path = "/tmp/prom-reg.txt"
|
|
23
|
+
|
|
24
|
+
# Initializing Prometheus metric collector registry
|
|
25
|
+
_registry: prometheus_client.CollectorRegistry = prometheus_client.CollectorRegistry()
|
|
26
|
+
|
|
27
|
+
# The following real-time metrics are being updated through the monitoring stream graph steps
|
|
28
|
+
_prediction_counter: prometheus_client.Counter = prometheus_client.Counter(
|
|
29
|
+
name=PrometheusMetric.PREDICTIONS_TOTAL,
|
|
30
|
+
documentation="Counter for total predictions",
|
|
31
|
+
registry=_registry,
|
|
32
|
+
labelnames=[
|
|
33
|
+
EventFieldType.PROJECT,
|
|
34
|
+
EventFieldType.ENDPOINT_ID,
|
|
35
|
+
EventFieldType.MODEL,
|
|
36
|
+
EventFieldType.ENDPOINT_TYPE,
|
|
37
|
+
],
|
|
38
|
+
)
|
|
39
|
+
_model_latency: prometheus_client.Summary = prometheus_client.Summary(
|
|
40
|
+
name=PrometheusMetric.MODEL_LATENCY_SECONDS,
|
|
41
|
+
documentation="Summary for for model latency",
|
|
42
|
+
registry=_registry,
|
|
43
|
+
labelnames=[
|
|
44
|
+
EventFieldType.PROJECT,
|
|
45
|
+
EventFieldType.ENDPOINT_ID,
|
|
46
|
+
EventFieldType.MODEL,
|
|
47
|
+
EventFieldType.ENDPOINT_TYPE,
|
|
48
|
+
],
|
|
49
|
+
)
|
|
50
|
+
_income_features: prometheus_client.Gauge = prometheus_client.Gauge(
|
|
51
|
+
name=PrometheusMetric.INCOME_FEATURES,
|
|
52
|
+
documentation="Samples of features and predictions",
|
|
53
|
+
registry=_registry,
|
|
54
|
+
labelnames=[
|
|
55
|
+
EventFieldType.PROJECT,
|
|
56
|
+
EventFieldType.ENDPOINT_ID,
|
|
57
|
+
EventFieldType.METRIC,
|
|
58
|
+
],
|
|
59
|
+
)
|
|
60
|
+
_error_counter: prometheus_client.Counter = prometheus_client.Counter(
|
|
61
|
+
name=PrometheusMetric.ERRORS_TOTAL,
|
|
62
|
+
documentation="Counter for total errors",
|
|
63
|
+
registry=_registry,
|
|
64
|
+
labelnames=[
|
|
65
|
+
EventFieldType.PROJECT,
|
|
66
|
+
EventFieldType.ENDPOINT_ID,
|
|
67
|
+
EventFieldType.MODEL,
|
|
68
|
+
],
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
# The following metrics are being updated through the model monitoring batch job
|
|
72
|
+
_batch_metrics: prometheus_client.Gauge = prometheus_client.Gauge(
|
|
73
|
+
name=PrometheusMetric.DRIFT_METRICS,
|
|
74
|
+
documentation="Results from the batch drift analysis",
|
|
75
|
+
registry=_registry,
|
|
76
|
+
labelnames=[
|
|
77
|
+
EventFieldType.PROJECT,
|
|
78
|
+
EventFieldType.ENDPOINT_ID,
|
|
79
|
+
EventFieldType.METRIC,
|
|
80
|
+
],
|
|
81
|
+
)
|
|
82
|
+
_drift_status: prometheus_client.Enum = prometheus_client.Enum(
|
|
83
|
+
name=PrometheusMetric.DRIFT_STATUS,
|
|
84
|
+
documentation="Drift status of the model endpoint",
|
|
85
|
+
registry=_registry,
|
|
86
|
+
states=["NO_DRIFT", "DRIFT_DETECTED", "POSSIBLE_DRIFT"],
|
|
87
|
+
labelnames=[EventFieldType.PROJECT, EventFieldType.ENDPOINT_ID],
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _write_registry(func):
|
|
92
|
+
def wrapper(*args, **kwargs):
|
|
93
|
+
global _registry
|
|
94
|
+
"""A wrapper function to update the registry file each time a metric has been updated"""
|
|
95
|
+
func(*args, **kwargs)
|
|
96
|
+
prometheus_client.write_to_textfile(path=_registry_path, registry=_registry)
|
|
97
|
+
|
|
98
|
+
return wrapper
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@_write_registry
|
|
102
|
+
def write_predictions_and_latency_metrics(
|
|
103
|
+
project: str, endpoint_id: str, latency: int, model_name: str, endpoint_type: int
|
|
104
|
+
):
|
|
105
|
+
"""
|
|
106
|
+
Update the prediction counter and the latency value of the provided model endpoint within Prometheus registry.
|
|
107
|
+
Please note that while the prediction counter is ALWAYS increasing by 1, the latency summary metric is being
|
|
108
|
+
increased by the event latency time. Grafana dashboard will query the average latency time by dividing the total
|
|
109
|
+
latency value by the total amount of predictions.
|
|
110
|
+
|
|
111
|
+
:param project: Project name.
|
|
112
|
+
:param endpoint_id: Model endpoint unique id.
|
|
113
|
+
:param latency: Latency time (microsecond) in which the event has been processed through the model server.
|
|
114
|
+
:param model_name: Model name which will be used by Grafana for displaying the results by model.
|
|
115
|
+
:param endpoint_type: Endpoint type that is represented by an int (possible values: 1,2,3) corresponding to the
|
|
116
|
+
Enum class :py:class:`~mlrun.common.schemas.model_monitoring.EndpointType`.
|
|
117
|
+
"""
|
|
118
|
+
|
|
119
|
+
# Increase the prediction counter by 1
|
|
120
|
+
_prediction_counter.labels(
|
|
121
|
+
project=project,
|
|
122
|
+
endpoint_id=endpoint_id,
|
|
123
|
+
model=model_name,
|
|
124
|
+
endpoint_type=endpoint_type,
|
|
125
|
+
).inc(1)
|
|
126
|
+
|
|
127
|
+
# Increase the latency value according to the provided latency of the current event
|
|
128
|
+
_model_latency.labels(
|
|
129
|
+
project=project,
|
|
130
|
+
endpoint_id=endpoint_id,
|
|
131
|
+
model=model_name,
|
|
132
|
+
endpoint_type=endpoint_type,
|
|
133
|
+
).observe(latency)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@_write_registry
|
|
137
|
+
def write_income_features(
|
|
138
|
+
project: str, endpoint_id: str, features: typing.Dict[str, float]
|
|
139
|
+
):
|
|
140
|
+
"""Update a sample of features.
|
|
141
|
+
|
|
142
|
+
:param project: Project name.
|
|
143
|
+
:param endpoint_id: Model endpoint unique id.
|
|
144
|
+
:param features: Dictionary in which the key is a feature name and the value is a float number.
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
"""
|
|
148
|
+
|
|
149
|
+
for metric in features:
|
|
150
|
+
_income_features.labels(
|
|
151
|
+
project=project, endpoint_id=endpoint_id, metric=metric
|
|
152
|
+
).set(value=features[metric])
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@_write_registry
|
|
156
|
+
def write_drift_metrics(project: str, endpoint_id: str, metric: str, value: float):
|
|
157
|
+
"""Update drift metrics that have been calculated through the monitoring batch job
|
|
158
|
+
|
|
159
|
+
:param project: Project name.
|
|
160
|
+
:param endpoint_id: Model endpoint unique id.
|
|
161
|
+
:param metric: Metric name (e.g. TVD, Hellinger).
|
|
162
|
+
:param value: Metric value as a float.
|
|
163
|
+
|
|
164
|
+
"""
|
|
165
|
+
|
|
166
|
+
_batch_metrics.labels(project=project, endpoint_id=endpoint_id, metric=metric).set(
|
|
167
|
+
value=value
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@_write_registry
|
|
172
|
+
def write_drift_status(project: str, endpoint_id: str, drift_status: str):
|
|
173
|
+
"""
|
|
174
|
+
Update the drift status enum for a specific model endpoint.
|
|
175
|
+
|
|
176
|
+
:param project: Project name.
|
|
177
|
+
:param endpoint_id: Model endpoint unique id.
|
|
178
|
+
:param drift_status: Drift status value, can be one of the following: 'NO_DRIFT', 'DRIFT_DETECTED', or
|
|
179
|
+
'POSSIBLE_DRIFT'.
|
|
180
|
+
"""
|
|
181
|
+
|
|
182
|
+
_drift_status.labels(project=project, endpoint_id=endpoint_id).state(drift_status)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
@_write_registry
|
|
186
|
+
def write_errors(project: str, endpoint_id: str, model_name: str):
|
|
187
|
+
"""
|
|
188
|
+
Update the error counter for a specific model endpoint.
|
|
189
|
+
|
|
190
|
+
:param project: Project name.
|
|
191
|
+
:param endpoint_id: Model endpoint unique id.
|
|
192
|
+
:param model_name: Model name. Will be used by Grafana to show the amount of errors per model by time.
|
|
193
|
+
"""
|
|
194
|
+
|
|
195
|
+
_error_counter.labels(
|
|
196
|
+
project=project, endpoint_id=endpoint_id, model=model_name
|
|
197
|
+
).inc(1)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def get_registry() -> str:
|
|
201
|
+
"""Returns the parsed registry file according to the exposition format of Prometheus."""
|
|
202
|
+
|
|
203
|
+
# Read the registry file (note that the text is stored in UTF-8 format)
|
|
204
|
+
f = open(_registry_path)
|
|
205
|
+
lines = f.read()
|
|
206
|
+
f.close()
|
|
207
|
+
|
|
208
|
+
# Reset part of the metrics to avoid a repeating scraping of the same value
|
|
209
|
+
clean_metrics()
|
|
210
|
+
|
|
211
|
+
return lines
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
@_write_registry
|
|
215
|
+
def clean_metrics():
|
|
216
|
+
"""Clean the income features values. As these results are relevant only for a certain timestamp, we will remove
|
|
217
|
+
them from the global registry after they have been scraped by Prometheus."""
|
|
218
|
+
|
|
219
|
+
_income_features.clear()
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
import enum
|
|
18
18
|
import typing
|
|
19
19
|
|
|
20
|
+
import mlrun.common.schemas.secret
|
|
20
21
|
import mlrun.errors
|
|
21
22
|
|
|
22
23
|
from .model_endpoint_store import ModelEndpointStore
|
|
@@ -33,6 +34,7 @@ class ModelEndpointStoreType(enum.Enum):
|
|
|
33
34
|
project: str,
|
|
34
35
|
access_key: str = None,
|
|
35
36
|
endpoint_store_connection: str = None,
|
|
37
|
+
secret_provider: typing.Callable = None,
|
|
36
38
|
) -> ModelEndpointStore:
|
|
37
39
|
"""
|
|
38
40
|
Return a ModelEndpointStore object based on the provided enum value.
|
|
@@ -46,6 +48,7 @@ class ModelEndpointStoreType(enum.Enum):
|
|
|
46
48
|
e.g. A root user with password 1234, tries to connect a schema called
|
|
47
49
|
mlrun within a local MySQL DB instance:
|
|
48
50
|
'mysql+pymysql://root:1234@localhost:3306/mlrun'.
|
|
51
|
+
:param secret_provider: An optional secret provider to get the connection string secret.
|
|
49
52
|
|
|
50
53
|
:return: `ModelEndpointStore` object.
|
|
51
54
|
|
|
@@ -61,15 +64,13 @@ class ModelEndpointStoreType(enum.Enum):
|
|
|
61
64
|
|
|
62
65
|
# Assuming SQL store target if store type is not KV.
|
|
63
66
|
# Update these lines once there are more than two store target types.
|
|
64
|
-
from mlrun.model_monitoring.helpers import get_connection_string
|
|
65
67
|
|
|
66
|
-
sql_connection_string = endpoint_store_connection or get_connection_string(
|
|
67
|
-
project=project
|
|
68
|
-
)
|
|
69
68
|
from .sql_model_endpoint_store import SQLModelEndpointStore
|
|
70
69
|
|
|
71
70
|
return SQLModelEndpointStore(
|
|
72
|
-
project=project,
|
|
71
|
+
project=project,
|
|
72
|
+
sql_connection_string=endpoint_store_connection,
|
|
73
|
+
secret_provider=secret_provider,
|
|
73
74
|
)
|
|
74
75
|
|
|
75
76
|
@classmethod
|
|
@@ -84,13 +85,16 @@ class ModelEndpointStoreType(enum.Enum):
|
|
|
84
85
|
|
|
85
86
|
|
|
86
87
|
def get_model_endpoint_store(
|
|
87
|
-
project: str,
|
|
88
|
+
project: str,
|
|
89
|
+
access_key: str = None,
|
|
90
|
+
secret_provider: typing.Callable = None,
|
|
88
91
|
) -> ModelEndpointStore:
|
|
89
92
|
"""
|
|
90
93
|
Getting the DB target type based on mlrun.config.model_endpoint_monitoring.store_type.
|
|
91
94
|
|
|
92
|
-
:param project:
|
|
93
|
-
:param access_key:
|
|
95
|
+
:param project: The name of the project.
|
|
96
|
+
:param access_key: Access key with permission to the DB table.
|
|
97
|
+
:param secret_provider: An optional secret provider to get the connection string secret.
|
|
94
98
|
|
|
95
99
|
:return: `ModelEndpointStore` object. Using this object, the user can apply different operations on the
|
|
96
100
|
model endpoint record such as write, update, get and delete.
|
|
@@ -102,4 +106,6 @@ def get_model_endpoint_store(
|
|
|
102
106
|
)
|
|
103
107
|
|
|
104
108
|
# Convert into model endpoint store target object
|
|
105
|
-
return model_endpoint_store_type.to_endpoint_store(
|
|
109
|
+
return model_endpoint_store_type.to_endpoint_store(
|
|
110
|
+
project=project, access_key=access_key, secret_provider=secret_provider
|
|
111
|
+
)
|
|
@@ -45,12 +45,14 @@ class SQLModelEndpointStore(ModelEndpointStore):
|
|
|
45
45
|
self,
|
|
46
46
|
project: str,
|
|
47
47
|
sql_connection_string: str = None,
|
|
48
|
+
secret_provider: typing.Callable = None,
|
|
48
49
|
):
|
|
49
50
|
"""
|
|
50
51
|
Initialize SQL store target object.
|
|
51
52
|
|
|
52
53
|
:param project: The name of the project.
|
|
53
54
|
:param sql_connection_string: Valid connection string or a path to SQL database with model endpoints table.
|
|
55
|
+
:param secret_provider: An optional secret provider to get the connection string secret.
|
|
54
56
|
"""
|
|
55
57
|
|
|
56
58
|
super().__init__(project=project)
|
|
@@ -58,7 +60,7 @@ class SQLModelEndpointStore(ModelEndpointStore):
|
|
|
58
60
|
self.sql_connection_string = (
|
|
59
61
|
sql_connection_string
|
|
60
62
|
or mlrun.model_monitoring.helpers.get_connection_string(
|
|
61
|
-
|
|
63
|
+
secret_provider=secret_provider
|
|
62
64
|
)
|
|
63
65
|
)
|
|
64
66
|
|
|
@@ -21,8 +21,13 @@ import typing
|
|
|
21
21
|
import pandas as pd
|
|
22
22
|
import storey
|
|
23
23
|
|
|
24
|
+
import mlrun
|
|
24
25
|
import mlrun.common.model_monitoring.helpers
|
|
26
|
+
import mlrun.config
|
|
27
|
+
import mlrun.datastore.targets
|
|
25
28
|
import mlrun.feature_store.steps
|
|
29
|
+
import mlrun.model_monitoring.prometheus
|
|
30
|
+
import mlrun.utils
|
|
26
31
|
import mlrun.utils.v3io_clients
|
|
27
32
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
28
33
|
EventFieldType,
|
|
@@ -41,9 +46,9 @@ class EventStreamProcessor:
|
|
|
41
46
|
self,
|
|
42
47
|
project: str,
|
|
43
48
|
parquet_batching_max_events: int,
|
|
49
|
+
parquet_batching_timeout_secs: int,
|
|
44
50
|
parquet_target: str,
|
|
45
51
|
sample_window: int = 10,
|
|
46
|
-
parquet_batching_timeout_secs: int = 30 * 60, # Default 30 minutes
|
|
47
52
|
aggregate_windows: typing.Optional[typing.List[str]] = None,
|
|
48
53
|
aggregate_period: str = "30s",
|
|
49
54
|
model_monitoring_access_key: str = None,
|
|
@@ -74,6 +79,8 @@ class EventStreamProcessor:
|
|
|
74
79
|
self._initialize_v3io_configurations(
|
|
75
80
|
model_monitoring_access_key=model_monitoring_access_key
|
|
76
81
|
)
|
|
82
|
+
elif self.parquet_path.startswith("s3://"):
|
|
83
|
+
self.storage_options = mlrun.mlconf.get_s3_storage_options()
|
|
77
84
|
|
|
78
85
|
def _initialize_v3io_configurations(
|
|
79
86
|
self,
|
|
@@ -132,7 +139,7 @@ class EventStreamProcessor:
|
|
|
132
139
|
of different operations that are executed on the events from the model server. Each event has
|
|
133
140
|
metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
|
|
134
141
|
Throughout the serving graph, the results are written to 3 different databases:
|
|
135
|
-
1. KV/SQL (steps
|
|
142
|
+
1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
|
|
136
143
|
time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
|
|
137
144
|
by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
|
|
138
145
|
from other processes, such as current_stats that is being calculated by the monitoring batch job
|
|
@@ -140,12 +147,14 @@ class EventStreamProcessor:
|
|
|
140
147
|
v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
|
|
141
148
|
is stored within the database that was defined in the provided connection string and can be found
|
|
142
149
|
under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
|
|
143
|
-
2. TSDB (steps
|
|
144
|
-
|
|
145
|
-
|
|
150
|
+
2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
|
|
151
|
+
This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
|
|
152
|
+
can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
|
|
153
|
+
3 different key metric dictionaries: base_metrics (average latency and predictions over time),
|
|
146
154
|
endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
|
|
147
|
-
|
|
148
|
-
|
|
155
|
+
If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
|
|
156
|
+
monitoring stream local memory.
|
|
157
|
+
3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
|
|
149
158
|
that run every hour by default. If defined, the parquet target path can be found under
|
|
150
159
|
mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
|
|
151
160
|
mlrun.mlconf.model_endpoint_monitoring.user_space.
|
|
@@ -155,17 +164,41 @@ class EventStreamProcessor:
|
|
|
155
164
|
|
|
156
165
|
graph = fn.set_topology("flow")
|
|
157
166
|
|
|
158
|
-
# Step 1 -
|
|
167
|
+
# Step 1 - Event routing based on the provided path
|
|
168
|
+
def apply_event_routing():
|
|
169
|
+
graph.add_step(
|
|
170
|
+
"EventRouting",
|
|
171
|
+
full_event=True,
|
|
172
|
+
project=self.project,
|
|
173
|
+
).respond()
|
|
174
|
+
|
|
175
|
+
apply_event_routing()
|
|
176
|
+
|
|
177
|
+
# Step 2 - Keep only events with no '-' in their path, which indicates that the event is supposed to be processed
|
|
178
|
+
# through the next steps of the stream graph
|
|
179
|
+
def apply_storey_filter_stream_events():
|
|
180
|
+
# Keep only events whose path does not contain '-'
|
|
181
|
+
graph.add_step(
|
|
182
|
+
"storey.Filter",
|
|
183
|
+
"filter_stream_event",
|
|
184
|
+
_fn="('-' not in event.path)",
|
|
185
|
+
full_event=True,
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
apply_storey_filter_stream_events()
|
|
189
|
+
|
|
190
|
+
# Step 3 - Process endpoint event: splitting into sub-events and validate event data
|
|
159
191
|
def apply_process_endpoint_event():
|
|
160
192
|
graph.add_step(
|
|
161
193
|
"ProcessEndpointEvent",
|
|
162
194
|
full_event=True,
|
|
163
195
|
project=self.project,
|
|
196
|
+
after="filter_stream_event",
|
|
164
197
|
)
|
|
165
198
|
|
|
166
199
|
apply_process_endpoint_event()
|
|
167
200
|
|
|
168
|
-
# Steps
|
|
201
|
+
# Steps 4,5 - Applying Storey operations of filtering and flatten
|
|
169
202
|
def apply_storey_filter_and_flatmap():
|
|
170
203
|
# Remove none values from each event
|
|
171
204
|
graph.add_step(
|
|
@@ -182,7 +215,7 @@ class EventStreamProcessor:
|
|
|
182
215
|
|
|
183
216
|
apply_storey_filter_and_flatmap()
|
|
184
217
|
|
|
185
|
-
# Step
|
|
218
|
+
# Step 6 - Validating feature names and map each feature to its value
|
|
186
219
|
def apply_map_feature_names():
|
|
187
220
|
graph.add_step(
|
|
188
221
|
"MapFeatureNames",
|
|
@@ -194,9 +227,9 @@ class EventStreamProcessor:
|
|
|
194
227
|
|
|
195
228
|
apply_map_feature_names()
|
|
196
229
|
|
|
197
|
-
# Step
|
|
230
|
+
# Step 7 - Calculate number of predictions and average latency
|
|
198
231
|
def apply_storey_aggregations():
|
|
199
|
-
# Step
|
|
232
|
+
# Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
|
|
200
233
|
graph.add_step(
|
|
201
234
|
class_name="storey.AggregateByKey",
|
|
202
235
|
aggregates=[
|
|
@@ -214,8 +247,7 @@ class EventStreamProcessor:
|
|
|
214
247
|
table=".",
|
|
215
248
|
key_field=EventFieldType.ENDPOINT_ID,
|
|
216
249
|
)
|
|
217
|
-
|
|
218
|
-
# Step 5.2 - Rename the latency counter field to prediction counter
|
|
250
|
+
# Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
|
|
219
251
|
graph.add_step(
|
|
220
252
|
class_name="storey.Rename",
|
|
221
253
|
mapping={
|
|
@@ -228,7 +260,7 @@ class EventStreamProcessor:
|
|
|
228
260
|
|
|
229
261
|
apply_storey_aggregations()
|
|
230
262
|
|
|
231
|
-
# Step
|
|
263
|
+
# Step 8 - Emits the event in window size of events based on sample_window size (10 by default)
|
|
232
264
|
def apply_storey_sample_window():
|
|
233
265
|
graph.add_step(
|
|
234
266
|
"storey.steps.SampleWindow",
|
|
@@ -240,8 +272,8 @@ class EventStreamProcessor:
|
|
|
240
272
|
|
|
241
273
|
apply_storey_sample_window()
|
|
242
274
|
|
|
243
|
-
# Steps
|
|
244
|
-
# Step
|
|
275
|
+
# Steps 9-11 - KV/SQL branch
|
|
276
|
+
# Step 9 - Filter relevant keys from the event before writing the data into the database table
|
|
245
277
|
def apply_process_before_endpoint_update():
|
|
246
278
|
graph.add_step(
|
|
247
279
|
"ProcessBeforeEndpointUpdate",
|
|
@@ -251,7 +283,7 @@ class EventStreamProcessor:
|
|
|
251
283
|
|
|
252
284
|
apply_process_before_endpoint_update()
|
|
253
285
|
|
|
254
|
-
# Step
|
|
286
|
+
# Step 10 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
|
|
255
287
|
# about average latency and the amount of predictions over time
|
|
256
288
|
def apply_update_endpoint():
|
|
257
289
|
graph.add_step(
|
|
@@ -264,7 +296,7 @@ class EventStreamProcessor:
|
|
|
264
296
|
|
|
265
297
|
apply_update_endpoint()
|
|
266
298
|
|
|
267
|
-
# Step
|
|
299
|
+
# Step 11 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
|
|
268
300
|
# which will be used by Grafana monitoring dashboards
|
|
269
301
|
def apply_infer_schema():
|
|
270
302
|
graph.add_step(
|
|
@@ -279,10 +311,12 @@ class EventStreamProcessor:
|
|
|
279
311
|
if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
|
|
280
312
|
apply_infer_schema()
|
|
281
313
|
|
|
282
|
-
# Steps
|
|
283
|
-
|
|
314
|
+
# Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
|
|
315
|
+
# Steps 20-21 - Prometheus branch
|
|
284
316
|
if not mlrun.mlconf.is_ce_mode():
|
|
285
|
-
#
|
|
317
|
+
# TSDB branch
|
|
318
|
+
|
|
319
|
+
# Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
|
|
286
320
|
# stats and details about the events
|
|
287
321
|
def apply_process_before_tsdb():
|
|
288
322
|
graph.add_step(
|
|
@@ -291,7 +325,7 @@ class EventStreamProcessor:
|
|
|
291
325
|
|
|
292
326
|
apply_process_before_tsdb()
|
|
293
327
|
|
|
294
|
-
# Steps
|
|
328
|
+
# Steps 13-19 - Unpack keys from each dictionary and write to TSDB target
|
|
295
329
|
def apply_filter_and_unpacked_keys(name, keys):
|
|
296
330
|
graph.add_step(
|
|
297
331
|
"FilterAndUnpackKeys",
|
|
@@ -322,21 +356,21 @@ class EventStreamProcessor:
|
|
|
322
356
|
key=EventFieldType.ENDPOINT_ID,
|
|
323
357
|
)
|
|
324
358
|
|
|
325
|
-
# Steps
|
|
359
|
+
# Steps 13-14 - unpacked base_metrics dictionary
|
|
326
360
|
apply_filter_and_unpacked_keys(
|
|
327
361
|
name="FilterAndUnpackKeys1",
|
|
328
362
|
keys=EventKeyMetrics.BASE_METRICS,
|
|
329
363
|
)
|
|
330
364
|
apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
|
|
331
365
|
|
|
332
|
-
# Steps
|
|
366
|
+
# Steps 15-16 - unpacked endpoint_features dictionary
|
|
333
367
|
apply_filter_and_unpacked_keys(
|
|
334
368
|
name="FilterAndUnpackKeys2",
|
|
335
369
|
keys=EventKeyMetrics.ENDPOINT_FEATURES,
|
|
336
370
|
)
|
|
337
371
|
apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
|
|
338
372
|
|
|
339
|
-
# Steps
|
|
373
|
+
# Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter to remove none values
|
|
340
374
|
apply_filter_and_unpacked_keys(
|
|
341
375
|
name="FilterAndUnpackKeys3",
|
|
342
376
|
keys=EventKeyMetrics.CUSTOM_METRICS,
|
|
@@ -352,9 +386,30 @@ class EventStreamProcessor:
|
|
|
352
386
|
|
|
353
387
|
apply_storey_filter()
|
|
354
388
|
apply_tsdb_target(name="tsdb3", after="FilterNotNone")
|
|
389
|
+
else:
|
|
390
|
+
# Prometheus branch
|
|
391
|
+
|
|
392
|
+
# Step 20 - Increase the prediction counter by 1 and update the latency value
|
|
393
|
+
graph.add_step(
|
|
394
|
+
"IncCounter",
|
|
395
|
+
name="IncCounter",
|
|
396
|
+
after="MapFeatureNames",
|
|
397
|
+
project=self.project,
|
|
398
|
+
)
|
|
355
399
|
|
|
356
|
-
|
|
357
|
-
|
|
400
|
+
# Step 21 - Record a sample of features and labels
|
|
401
|
+
def apply_record_features_to_prometheus():
|
|
402
|
+
graph.add_step(
|
|
403
|
+
"RecordFeatures",
|
|
404
|
+
name="RecordFeaturesToPrometheus",
|
|
405
|
+
after="sample",
|
|
406
|
+
project=self.project,
|
|
407
|
+
)
|
|
408
|
+
|
|
409
|
+
apply_record_features_to_prometheus()
|
|
410
|
+
|
|
411
|
+
# Steps 22-23 - Parquet branch
|
|
412
|
+
# Step 22 - Filter and validate different keys before writing the data to Parquet target
|
|
358
413
|
def apply_process_before_parquet():
|
|
359
414
|
graph.add_step(
|
|
360
415
|
"ProcessBeforeParquet",
|
|
@@ -365,7 +420,7 @@ class EventStreamProcessor:
|
|
|
365
420
|
|
|
366
421
|
apply_process_before_parquet()
|
|
367
422
|
|
|
368
|
-
# Step
|
|
423
|
+
# Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
|
|
369
424
|
def apply_parquet_target():
|
|
370
425
|
graph.add_step(
|
|
371
426
|
"storey.ParquetTarget",
|
|
@@ -615,6 +670,11 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
615
670
|
error = event.get("error")
|
|
616
671
|
if error:
|
|
617
672
|
self.error_count[endpoint_id] += 1
|
|
673
|
+
mlrun.model_monitoring.prometheus.write_errors(
|
|
674
|
+
project=self.project,
|
|
675
|
+
endpoint_id=event["endpoint_id"],
|
|
676
|
+
model_name=event["model"],
|
|
677
|
+
)
|
|
618
678
|
raise mlrun.errors.MLRunInvalidArgumentError(str(error))
|
|
619
679
|
|
|
620
680
|
# Validate event fields
|
|
@@ -1068,6 +1128,98 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
1068
1128
|
return event
|
|
1069
1129
|
|
|
1070
1130
|
|
|
1131
|
+
class EventRouting(mlrun.feature_store.steps.MapClass):
|
|
1132
|
+
"""
|
|
1133
|
+
Route the event according to the configured path under event.path. Please note that this step returns the result
|
|
1134
|
+
to the caller. At the moment there are several paths:
|
|
1135
|
+
|
|
1136
|
+
- /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
|
|
1137
|
+
to scrape the results from the monitoring stream memory.
|
|
1138
|
+
|
|
1139
|
+
- /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
|
|
1140
|
+
statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
|
|
1141
|
+
metrics.
|
|
1142
|
+
|
|
1143
|
+
- /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
|
|
1144
|
+
|
|
1145
|
+
"""
|
|
1146
|
+
|
|
1147
|
+
def __init__(
|
|
1148
|
+
self,
|
|
1149
|
+
project: str,
|
|
1150
|
+
**kwargs,
|
|
1151
|
+
):
|
|
1152
|
+
super().__init__(**kwargs)
|
|
1153
|
+
self.project: str = project
|
|
1154
|
+
|
|
1155
|
+
def do(self, event):
|
|
1156
|
+
if event.path == "/model-monitoring-metrics":
|
|
1157
|
+
# Return a parsed Prometheus registry file
|
|
1158
|
+
event.body = mlrun.model_monitoring.prometheus.get_registry()
|
|
1159
|
+
elif event.path == "/monitoring-batch-metrics":
|
|
1160
|
+
# Update statistical metrics
|
|
1161
|
+
for event_metric in event.body:
|
|
1162
|
+
mlrun.model_monitoring.prometheus.write_drift_metrics(
|
|
1163
|
+
project=self.project,
|
|
1164
|
+
endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
|
|
1165
|
+
metric=event_metric[EventFieldType.METRIC],
|
|
1166
|
+
value=event_metric[EventFieldType.VALUE],
|
|
1167
|
+
)
|
|
1168
|
+
elif event.path == "/monitoring-drift-status":
|
|
1169
|
+
# Update drift status
|
|
1170
|
+
mlrun.model_monitoring.prometheus.write_drift_status(
|
|
1171
|
+
project=self.project,
|
|
1172
|
+
endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
|
|
1173
|
+
drift_status=event.body[EventFieldType.DRIFT_STATUS],
|
|
1174
|
+
)
|
|
1175
|
+
|
|
1176
|
+
return event
|
|
1177
|
+
|
|
1178
|
+
|
|
1179
|
+
class IncCounter(mlrun.feature_store.steps.MapClass):
|
|
1180
|
+
"""Increase prediction counter by 1 and update the total latency value"""
|
|
1181
|
+
|
|
1182
|
+
def __init__(self, project: str, **kwargs):
|
|
1183
|
+
super().__init__(**kwargs)
|
|
1184
|
+
self.project: str = project
|
|
1185
|
+
|
|
1186
|
+
def do(self, event):
|
|
1187
|
+
# Update the prediction counter and the latency metric in the Prometheus registry
|
|
1188
|
+
|
|
1189
|
+
mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
|
|
1190
|
+
project=self.project,
|
|
1191
|
+
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1192
|
+
latency=event[EventFieldType.LATENCY],
|
|
1193
|
+
model_name=event[EventFieldType.MODEL],
|
|
1194
|
+
endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
|
|
1195
|
+
)
|
|
1196
|
+
|
|
1197
|
+
return event
|
|
1198
|
+
|
|
1199
|
+
|
|
1200
|
+
class RecordFeatures(mlrun.feature_store.steps.MapClass):
|
|
1201
|
+
"""Record a sample of features and labels in Prometheus registry"""
|
|
1202
|
+
|
|
1203
|
+
def __init__(self, project: str, **kwargs):
|
|
1204
|
+
super().__init__(**kwargs)
|
|
1205
|
+
self.project: str = project
|
|
1206
|
+
|
|
1207
|
+
def do(self, event):
|
|
1208
|
+
# Generate a dictionary of features and predictions
|
|
1209
|
+
features = {
|
|
1210
|
+
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
1211
|
+
**event[EventFieldType.NAMED_FEATURES],
|
|
1212
|
+
}
|
|
1213
|
+
|
|
1214
|
+
mlrun.model_monitoring.prometheus.write_income_features(
|
|
1215
|
+
project=self.project,
|
|
1216
|
+
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1217
|
+
features=features,
|
|
1218
|
+
)
|
|
1219
|
+
|
|
1220
|
+
return event
|
|
1221
|
+
|
|
1222
|
+
|
|
1071
1223
|
def update_endpoint_record(
|
|
1072
1224
|
project: str,
|
|
1073
1225
|
endpoint_id: str,
|