mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +2 -35
- mlrun/__main__.py +3 -41
- mlrun/api/api/api.py +6 -0
- mlrun/api/api/endpoints/feature_store.py +0 -4
- mlrun/api/api/endpoints/files.py +14 -2
- mlrun/api/api/endpoints/frontend_spec.py +2 -1
- mlrun/api/api/endpoints/functions.py +95 -59
- mlrun/api/api/endpoints/grafana_proxy.py +9 -9
- mlrun/api/api/endpoints/logs.py +17 -3
- mlrun/api/api/endpoints/model_endpoints.py +3 -2
- mlrun/api/api/endpoints/pipelines.py +1 -5
- mlrun/api/api/endpoints/projects.py +88 -0
- mlrun/api/api/endpoints/runs.py +48 -6
- mlrun/api/api/endpoints/submit.py +2 -1
- mlrun/api/api/endpoints/workflows.py +355 -0
- mlrun/api/api/utils.py +3 -4
- mlrun/api/crud/__init__.py +1 -0
- mlrun/api/crud/client_spec.py +6 -2
- mlrun/api/crud/feature_store.py +5 -0
- mlrun/api/crud/model_monitoring/__init__.py +1 -0
- mlrun/api/crud/model_monitoring/deployment.py +497 -0
- mlrun/api/crud/model_monitoring/grafana.py +96 -42
- mlrun/api/crud/model_monitoring/helpers.py +159 -0
- mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
- mlrun/api/crud/notifications.py +9 -4
- mlrun/api/crud/pipelines.py +6 -11
- mlrun/api/crud/projects.py +2 -2
- mlrun/api/crud/runtime_resources.py +4 -3
- mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
- mlrun/api/crud/secrets.py +21 -0
- mlrun/api/crud/workflows.py +352 -0
- mlrun/api/db/base.py +16 -1
- mlrun/api/db/init_db.py +2 -4
- mlrun/api/db/session.py +1 -1
- mlrun/api/db/sqldb/db.py +129 -31
- mlrun/api/db/sqldb/models/models_mysql.py +15 -1
- mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
- mlrun/api/launcher.py +38 -6
- mlrun/api/main.py +3 -2
- mlrun/api/rundb/__init__.py +13 -0
- mlrun/{db → api/rundb}/sqldb.py +36 -84
- mlrun/api/runtime_handlers/__init__.py +56 -0
- mlrun/api/runtime_handlers/base.py +1247 -0
- mlrun/api/runtime_handlers/daskjob.py +209 -0
- mlrun/api/runtime_handlers/kubejob.py +37 -0
- mlrun/api/runtime_handlers/mpijob.py +147 -0
- mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
- mlrun/api/runtime_handlers/sparkjob.py +148 -0
- mlrun/api/schemas/__init__.py +17 -6
- mlrun/api/utils/builder.py +1 -4
- mlrun/api/utils/clients/chief.py +14 -0
- mlrun/api/utils/clients/iguazio.py +33 -33
- mlrun/api/utils/clients/nuclio.py +2 -2
- mlrun/api/utils/periodic.py +9 -2
- mlrun/api/utils/projects/follower.py +14 -7
- mlrun/api/utils/projects/leader.py +2 -1
- mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
- mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
- mlrun/api/utils/runtimes/__init__.py +14 -0
- mlrun/api/utils/runtimes/nuclio.py +43 -0
- mlrun/api/utils/scheduler.py +98 -15
- mlrun/api/utils/singletons/db.py +5 -1
- mlrun/api/utils/singletons/project_member.py +4 -1
- mlrun/api/utils/singletons/scheduler.py +1 -1
- mlrun/artifacts/base.py +6 -6
- mlrun/artifacts/dataset.py +4 -4
- mlrun/artifacts/manager.py +2 -3
- mlrun/artifacts/model.py +2 -2
- mlrun/artifacts/plots.py +8 -8
- mlrun/common/db/__init__.py +14 -0
- mlrun/common/helpers.py +37 -0
- mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
- mlrun/common/model_monitoring/helpers.py +69 -0
- mlrun/common/schemas/__init__.py +13 -1
- mlrun/common/schemas/auth.py +4 -1
- mlrun/common/schemas/client_spec.py +1 -1
- mlrun/common/schemas/function.py +17 -0
- mlrun/common/schemas/model_monitoring/__init__.py +48 -0
- mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
- mlrun/common/schemas/model_monitoring/grafana.py +55 -0
- mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
- mlrun/common/schemas/notification.py +1 -0
- mlrun/common/schemas/object.py +4 -0
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/schemas/regex.py +1 -1
- mlrun/common/schemas/runs.py +1 -8
- mlrun/common/schemas/schedule.py +1 -8
- mlrun/common/schemas/workflow.py +54 -0
- mlrun/config.py +45 -42
- mlrun/datastore/__init__.py +21 -0
- mlrun/datastore/base.py +1 -1
- mlrun/datastore/datastore.py +9 -0
- mlrun/datastore/dbfs_store.py +168 -0
- mlrun/datastore/helpers.py +18 -0
- mlrun/datastore/sources.py +1 -0
- mlrun/datastore/store_resources.py +2 -5
- mlrun/datastore/v3io.py +1 -2
- mlrun/db/__init__.py +4 -68
- mlrun/db/base.py +12 -0
- mlrun/db/factory.py +65 -0
- mlrun/db/httpdb.py +175 -20
- mlrun/db/nopdb.py +4 -2
- mlrun/execution.py +4 -2
- mlrun/feature_store/__init__.py +1 -0
- mlrun/feature_store/api.py +1 -2
- mlrun/feature_store/common.py +2 -1
- mlrun/feature_store/feature_set.py +1 -11
- mlrun/feature_store/feature_vector.py +340 -2
- mlrun/feature_store/ingestion.py +5 -10
- mlrun/feature_store/retrieval/base.py +118 -104
- mlrun/feature_store/retrieval/dask_merger.py +17 -10
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/local_merger.py +18 -18
- mlrun/feature_store/retrieval/spark_merger.py +21 -14
- mlrun/feature_store/retrieval/storey_merger.py +22 -16
- mlrun/kfpops.py +3 -9
- mlrun/launcher/base.py +57 -53
- mlrun/launcher/client.py +5 -4
- mlrun/launcher/factory.py +24 -13
- mlrun/launcher/local.py +6 -6
- mlrun/launcher/remote.py +4 -4
- mlrun/lists.py +0 -11
- mlrun/model.py +11 -17
- mlrun/model_monitoring/__init__.py +2 -22
- mlrun/model_monitoring/features_drift_table.py +1 -1
- mlrun/model_monitoring/helpers.py +22 -210
- mlrun/model_monitoring/model_endpoint.py +1 -1
- mlrun/model_monitoring/model_monitoring_batch.py +127 -50
- mlrun/model_monitoring/prometheus.py +219 -0
- mlrun/model_monitoring/stores/__init__.py +16 -11
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
- mlrun/model_monitoring/stores/models/mysql.py +47 -29
- mlrun/model_monitoring/stores/models/sqlite.py +47 -29
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
- mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
- mlrun/model_monitoring/tracking_policy.py +104 -0
- mlrun/package/packager.py +6 -8
- mlrun/package/packagers/default_packager.py +121 -10
- mlrun/package/packagers/numpy_packagers.py +1 -1
- mlrun/platforms/__init__.py +0 -2
- mlrun/platforms/iguazio.py +0 -56
- mlrun/projects/pipelines.py +53 -159
- mlrun/projects/project.py +10 -37
- mlrun/render.py +1 -1
- mlrun/run.py +8 -124
- mlrun/runtimes/__init__.py +6 -42
- mlrun/runtimes/base.py +29 -1249
- mlrun/runtimes/daskjob.py +2 -198
- mlrun/runtimes/funcdoc.py +0 -9
- mlrun/runtimes/function.py +25 -29
- mlrun/runtimes/kubejob.py +5 -29
- mlrun/runtimes/local.py +1 -1
- mlrun/runtimes/mpijob/__init__.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +10 -1
- mlrun/runtimes/mpijob/v1.py +0 -76
- mlrun/runtimes/mpijob/v1alpha1.py +1 -74
- mlrun/runtimes/nuclio.py +3 -2
- mlrun/runtimes/pod.py +28 -18
- mlrun/runtimes/remotesparkjob.py +1 -15
- mlrun/runtimes/serving.py +14 -6
- mlrun/runtimes/sparkjob/__init__.py +0 -1
- mlrun/runtimes/sparkjob/abstract.py +4 -131
- mlrun/runtimes/utils.py +0 -26
- mlrun/serving/routers.py +7 -7
- mlrun/serving/server.py +11 -8
- mlrun/serving/states.py +7 -1
- mlrun/serving/v2_serving.py +6 -6
- mlrun/utils/helpers.py +23 -42
- mlrun/utils/notifications/notification/__init__.py +4 -0
- mlrun/utils/notifications/notification/webhook.py +61 -0
- mlrun/utils/notifications/notification_pusher.py +5 -25
- mlrun/utils/regex.py +7 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
- mlrun/mlutils/data.py +0 -160
- mlrun/mlutils/models.py +0 -78
- mlrun/mlutils/plots.py +0 -902
- mlrun/utils/model_monitoring.py +0 -249
- /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -22,14 +22,14 @@ import pandas as pd
|
|
|
22
22
|
import storey
|
|
23
23
|
|
|
24
24
|
import mlrun
|
|
25
|
-
import mlrun.common.model_monitoring
|
|
25
|
+
import mlrun.common.model_monitoring.helpers
|
|
26
26
|
import mlrun.config
|
|
27
27
|
import mlrun.datastore.targets
|
|
28
28
|
import mlrun.feature_store.steps
|
|
29
|
+
import mlrun.model_monitoring.prometheus
|
|
29
30
|
import mlrun.utils
|
|
30
|
-
import mlrun.utils.model_monitoring
|
|
31
31
|
import mlrun.utils.v3io_clients
|
|
32
|
-
from mlrun.common.model_monitoring import (
|
|
32
|
+
from mlrun.common.schemas.model_monitoring.constants import (
|
|
33
33
|
EventFieldType,
|
|
34
34
|
EventKeyMetrics,
|
|
35
35
|
EventLiveStats,
|
|
@@ -37,7 +37,6 @@ from mlrun.common.model_monitoring import (
|
|
|
37
37
|
ModelEndpointTarget,
|
|
38
38
|
ProjectSecretKeys,
|
|
39
39
|
)
|
|
40
|
-
from mlrun.model_monitoring.stores import get_model_endpoint_store
|
|
41
40
|
from mlrun.utils import logger
|
|
42
41
|
|
|
43
42
|
|
|
@@ -47,22 +46,18 @@ class EventStreamProcessor:
|
|
|
47
46
|
self,
|
|
48
47
|
project: str,
|
|
49
48
|
parquet_batching_max_events: int,
|
|
49
|
+
parquet_batching_timeout_secs: int,
|
|
50
50
|
parquet_target: str,
|
|
51
51
|
sample_window: int = 10,
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
aggregate_count_period: str = "30s",
|
|
55
|
-
aggregate_avg_windows: typing.Optional[typing.List[str]] = None,
|
|
56
|
-
aggregate_avg_period: str = "30s",
|
|
52
|
+
aggregate_windows: typing.Optional[typing.List[str]] = None,
|
|
53
|
+
aggregate_period: str = "30s",
|
|
57
54
|
model_monitoring_access_key: str = None,
|
|
58
55
|
):
|
|
59
56
|
# General configurations, mainly used for the storey steps in the future serving graph
|
|
60
57
|
self.project = project
|
|
61
58
|
self.sample_window = sample_window
|
|
62
|
-
self.
|
|
63
|
-
self.
|
|
64
|
-
self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
|
|
65
|
-
self.aggregate_avg_period = aggregate_avg_period
|
|
59
|
+
self.aggregate_windows = aggregate_windows or ["5m", "1h"]
|
|
60
|
+
self.aggregate_period = aggregate_period
|
|
66
61
|
|
|
67
62
|
# Parquet path and configurations
|
|
68
63
|
self.parquet_path = parquet_target
|
|
@@ -84,6 +79,8 @@ class EventStreamProcessor:
|
|
|
84
79
|
self._initialize_v3io_configurations(
|
|
85
80
|
model_monitoring_access_key=model_monitoring_access_key
|
|
86
81
|
)
|
|
82
|
+
elif self.parquet_path.startswith("s3://"):
|
|
83
|
+
self.storage_options = mlrun.mlconf.get_s3_storage_options()
|
|
87
84
|
|
|
88
85
|
def _initialize_v3io_configurations(
|
|
89
86
|
self,
|
|
@@ -116,7 +113,9 @@ class EventStreamProcessor:
|
|
|
116
113
|
_,
|
|
117
114
|
self.kv_container,
|
|
118
115
|
self.kv_path,
|
|
119
|
-
) = mlrun.
|
|
116
|
+
) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
|
|
117
|
+
kv_path
|
|
118
|
+
)
|
|
120
119
|
|
|
121
120
|
# TSDB path and configurations
|
|
122
121
|
tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
@@ -126,7 +125,9 @@ class EventStreamProcessor:
|
|
|
126
125
|
_,
|
|
127
126
|
self.tsdb_container,
|
|
128
127
|
self.tsdb_path,
|
|
129
|
-
) = mlrun.
|
|
128
|
+
) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
|
|
129
|
+
tsdb_path
|
|
130
|
+
)
|
|
130
131
|
|
|
131
132
|
self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
|
|
132
133
|
self.tsdb_batching_max_events = tsdb_batching_max_events
|
|
@@ -138,7 +139,7 @@ class EventStreamProcessor:
|
|
|
138
139
|
of different operations that are executed on the events from the model server. Each event has
|
|
139
140
|
metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
|
|
140
141
|
Throughout the serving graph, the results are written to 3 different databases:
|
|
141
|
-
1. KV/SQL (steps
|
|
142
|
+
1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
|
|
142
143
|
time per endpoint. For example, the amount of predictions of endpoint x in the last 5 min. This data is used
|
|
143
144
|
by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
|
|
144
145
|
from other processes, such as current_stats that is being calculated by the monitoring batch job
|
|
@@ -146,12 +147,14 @@ class EventStreamProcessor:
|
|
|
146
147
|
v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
|
|
147
148
|
is stored within the database that was defined in the provided connection string and can be found
|
|
148
149
|
under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
|
|
149
|
-
2. TSDB (steps
|
|
150
|
-
|
|
151
|
-
|
|
150
|
+
2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
|
|
151
|
+
This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
|
|
152
|
+
can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
|
|
153
|
+
3 different key metric dictionaries: base_metrics (average latency and predictions over time),
|
|
152
154
|
endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
|
|
153
|
-
|
|
154
|
-
|
|
155
|
+
If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
|
|
156
|
+
monitoring stream local memory.
|
|
157
|
+
3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
|
|
155
158
|
that runs every hour by default. If defined, the parquet target path can be found under
|
|
156
159
|
mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
|
|
157
160
|
mlrun.mlconf.model_endpoint_monitoring.user_space.
|
|
@@ -161,17 +164,41 @@ class EventStreamProcessor:
|
|
|
161
164
|
|
|
162
165
|
graph = fn.set_topology("flow")
|
|
163
166
|
|
|
164
|
-
# Step 1 -
|
|
167
|
+
# Step 1 - Event routing based on the provided path
|
|
168
|
+
def apply_event_routing():
|
|
169
|
+
graph.add_step(
|
|
170
|
+
"EventRouting",
|
|
171
|
+
full_event=True,
|
|
172
|
+
project=self.project,
|
|
173
|
+
).respond()
|
|
174
|
+
|
|
175
|
+
apply_event_routing()
|
|
176
|
+
|
|
177
|
+
# Step 2 - Keep only events with no '-' in path, which indicates that the event is supposed to be processed
|
|
178
|
+
# through the next steps of the stream graph
|
|
179
|
+
def apply_storey_filter_stream_events():
|
|
180
|
+
# Keep only events whose path does not contain '-'
|
|
181
|
+
graph.add_step(
|
|
182
|
+
"storey.Filter",
|
|
183
|
+
"filter_stream_event",
|
|
184
|
+
_fn="('-' not in event.path)",
|
|
185
|
+
full_event=True,
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
apply_storey_filter_stream_events()
|
|
189
|
+
|
|
190
|
+
# Step 3 - Process endpoint event: splitting into sub-events and validate event data
|
|
165
191
|
def apply_process_endpoint_event():
|
|
166
192
|
graph.add_step(
|
|
167
193
|
"ProcessEndpointEvent",
|
|
168
194
|
full_event=True,
|
|
169
195
|
project=self.project,
|
|
196
|
+
after="filter_stream_event",
|
|
170
197
|
)
|
|
171
198
|
|
|
172
199
|
apply_process_endpoint_event()
|
|
173
200
|
|
|
174
|
-
# Steps
|
|
201
|
+
# Steps 4,5 - Applying Storey operations of filtering and flatten
|
|
175
202
|
def apply_storey_filter_and_flatmap():
|
|
176
203
|
# Remove none values from each event
|
|
177
204
|
graph.add_step(
|
|
@@ -188,7 +215,7 @@ class EventStreamProcessor:
|
|
|
188
215
|
|
|
189
216
|
apply_storey_filter_and_flatmap()
|
|
190
217
|
|
|
191
|
-
# Step
|
|
218
|
+
# Step 6 - Validating feature names and map each feature to its value
|
|
192
219
|
def apply_map_feature_names():
|
|
193
220
|
graph.add_step(
|
|
194
221
|
"MapFeatureNames",
|
|
@@ -200,58 +227,53 @@ class EventStreamProcessor:
|
|
|
200
227
|
|
|
201
228
|
apply_map_feature_names()
|
|
202
229
|
|
|
203
|
-
# Step
|
|
230
|
+
# Step 7 - Calculate number of predictions and average latency
|
|
204
231
|
def apply_storey_aggregations():
|
|
205
|
-
# Step
|
|
232
|
+
# Step 7.1 - Calculate number of predictions and average latency for each window (5 min and 1 hour by default)
|
|
206
233
|
graph.add_step(
|
|
207
234
|
class_name="storey.AggregateByKey",
|
|
208
235
|
aggregates=[
|
|
209
236
|
{
|
|
210
|
-
"name": EventFieldType.
|
|
211
|
-
"column": EventFieldType.
|
|
212
|
-
"operations": ["count"],
|
|
213
|
-
"windows": self.
|
|
214
|
-
"period": self.
|
|
237
|
+
"name": EventFieldType.LATENCY,
|
|
238
|
+
"column": EventFieldType.LATENCY,
|
|
239
|
+
"operations": ["count", "avg"],
|
|
240
|
+
"windows": self.aggregate_windows,
|
|
241
|
+
"period": self.aggregate_period,
|
|
215
242
|
}
|
|
216
243
|
],
|
|
217
|
-
name=EventFieldType.
|
|
244
|
+
name=EventFieldType.LATENCY,
|
|
218
245
|
after="MapFeatureNames",
|
|
219
246
|
step_name="Aggregates",
|
|
220
247
|
table=".",
|
|
248
|
+
key_field=EventFieldType.ENDPOINT_ID,
|
|
221
249
|
)
|
|
222
|
-
# Step
|
|
250
|
+
# Step 7.2 - Rename the latency count columns to the predictions count names (5 min and 1 hour windows)
|
|
223
251
|
graph.add_step(
|
|
224
|
-
class_name="storey.
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
"period": self.aggregate_avg_period,
|
|
232
|
-
}
|
|
233
|
-
],
|
|
234
|
-
name=EventFieldType.LATENCY,
|
|
235
|
-
after=EventFieldType.PREDICTIONS,
|
|
236
|
-
table=".",
|
|
252
|
+
class_name="storey.Rename",
|
|
253
|
+
mapping={
|
|
254
|
+
"latency_count_5m": EventLiveStats.PREDICTIONS_COUNT_5M,
|
|
255
|
+
"latency_count_1h": EventLiveStats.PREDICTIONS_COUNT_1H,
|
|
256
|
+
},
|
|
257
|
+
name="Rename",
|
|
258
|
+
after=EventFieldType.LATENCY,
|
|
237
259
|
)
|
|
238
260
|
|
|
239
261
|
apply_storey_aggregations()
|
|
240
262
|
|
|
241
|
-
# Step
|
|
263
|
+
# Step 8 - Emits the event in window size of events based on sample_window size (10 by default)
|
|
242
264
|
def apply_storey_sample_window():
|
|
243
265
|
graph.add_step(
|
|
244
266
|
"storey.steps.SampleWindow",
|
|
245
267
|
name="sample",
|
|
246
|
-
after=
|
|
268
|
+
after="Rename",
|
|
247
269
|
window_size=self.sample_window,
|
|
248
270
|
key=EventFieldType.ENDPOINT_ID,
|
|
249
271
|
)
|
|
250
272
|
|
|
251
273
|
apply_storey_sample_window()
|
|
252
274
|
|
|
253
|
-
# Steps
|
|
254
|
-
# Step
|
|
275
|
+
# Steps 9-11 - KV/SQL branch
|
|
276
|
+
# Step 9 - Filter relevant keys from the event before writing the data into the database table
|
|
255
277
|
def apply_process_before_endpoint_update():
|
|
256
278
|
graph.add_step(
|
|
257
279
|
"ProcessBeforeEndpointUpdate",
|
|
@@ -261,7 +283,7 @@ class EventStreamProcessor:
|
|
|
261
283
|
|
|
262
284
|
apply_process_before_endpoint_update()
|
|
263
285
|
|
|
264
|
-
# Step
|
|
286
|
+
# Step 10 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
|
|
265
287
|
# about average latency and the amount of predictions over time
|
|
266
288
|
def apply_update_endpoint():
|
|
267
289
|
graph.add_step(
|
|
@@ -274,7 +296,7 @@ class EventStreamProcessor:
|
|
|
274
296
|
|
|
275
297
|
apply_update_endpoint()
|
|
276
298
|
|
|
277
|
-
# Step
|
|
299
|
+
# Step 11 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
|
|
278
300
|
# which will be used by Grafana monitoring dashboards
|
|
279
301
|
def apply_infer_schema():
|
|
280
302
|
graph.add_step(
|
|
@@ -289,10 +311,12 @@ class EventStreamProcessor:
|
|
|
289
311
|
if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
|
|
290
312
|
apply_infer_schema()
|
|
291
313
|
|
|
292
|
-
# Steps
|
|
293
|
-
|
|
314
|
+
# Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
|
|
315
|
+
# Steps 20-21 - Prometheus branch
|
|
294
316
|
if not mlrun.mlconf.is_ce_mode():
|
|
295
|
-
#
|
|
317
|
+
# TSDB branch
|
|
318
|
+
|
|
319
|
+
# Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
|
|
296
320
|
# stats and details about the events
|
|
297
321
|
def apply_process_before_tsdb():
|
|
298
322
|
graph.add_step(
|
|
@@ -301,7 +325,7 @@ class EventStreamProcessor:
|
|
|
301
325
|
|
|
302
326
|
apply_process_before_tsdb()
|
|
303
327
|
|
|
304
|
-
# Steps
|
|
328
|
+
# Steps 13-19 - Unpack keys from each dictionary and write to TSDB target
|
|
305
329
|
def apply_filter_and_unpacked_keys(name, keys):
|
|
306
330
|
graph.add_step(
|
|
307
331
|
"FilterAndUnpackKeys",
|
|
@@ -332,21 +356,21 @@ class EventStreamProcessor:
|
|
|
332
356
|
key=EventFieldType.ENDPOINT_ID,
|
|
333
357
|
)
|
|
334
358
|
|
|
335
|
-
# Steps
|
|
359
|
+
# Steps 13-14 - unpacked base_metrics dictionary
|
|
336
360
|
apply_filter_and_unpacked_keys(
|
|
337
361
|
name="FilterAndUnpackKeys1",
|
|
338
362
|
keys=EventKeyMetrics.BASE_METRICS,
|
|
339
363
|
)
|
|
340
364
|
apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
|
|
341
365
|
|
|
342
|
-
# Steps
|
|
366
|
+
# Steps 15-16 - unpacked endpoint_features dictionary
|
|
343
367
|
apply_filter_and_unpacked_keys(
|
|
344
368
|
name="FilterAndUnpackKeys2",
|
|
345
369
|
keys=EventKeyMetrics.ENDPOINT_FEATURES,
|
|
346
370
|
)
|
|
347
371
|
apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
|
|
348
372
|
|
|
349
|
-
# Steps
|
|
373
|
+
# Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter to remove None values
|
|
350
374
|
apply_filter_and_unpacked_keys(
|
|
351
375
|
name="FilterAndUnpackKeys3",
|
|
352
376
|
keys=EventKeyMetrics.CUSTOM_METRICS,
|
|
@@ -362,9 +386,30 @@ class EventStreamProcessor:
|
|
|
362
386
|
|
|
363
387
|
apply_storey_filter()
|
|
364
388
|
apply_tsdb_target(name="tsdb3", after="FilterNotNone")
|
|
389
|
+
else:
|
|
390
|
+
# Prometheus branch
|
|
391
|
+
|
|
392
|
+
# Step 20 - Increase the prediction counter by 1 and update the latency value
|
|
393
|
+
graph.add_step(
|
|
394
|
+
"IncCounter",
|
|
395
|
+
name="IncCounter",
|
|
396
|
+
after="MapFeatureNames",
|
|
397
|
+
project=self.project,
|
|
398
|
+
)
|
|
365
399
|
|
|
366
|
-
|
|
367
|
-
|
|
400
|
+
# Step 21 - Record a sample of features and labels
|
|
401
|
+
def apply_record_features_to_prometheus():
|
|
402
|
+
graph.add_step(
|
|
403
|
+
"RecordFeatures",
|
|
404
|
+
name="RecordFeaturesToPrometheus",
|
|
405
|
+
after="sample",
|
|
406
|
+
project=self.project,
|
|
407
|
+
)
|
|
408
|
+
|
|
409
|
+
apply_record_features_to_prometheus()
|
|
410
|
+
|
|
411
|
+
# Steps 22-23 - Parquet branch
|
|
412
|
+
# Step 22 - Filter and validate different keys before writing the data to Parquet target
|
|
368
413
|
def apply_process_before_parquet():
|
|
369
414
|
graph.add_step(
|
|
370
415
|
"ProcessBeforeParquet",
|
|
@@ -375,7 +420,7 @@ class EventStreamProcessor:
|
|
|
375
420
|
|
|
376
421
|
apply_process_before_parquet()
|
|
377
422
|
|
|
378
|
-
# Step
|
|
423
|
+
# Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
|
|
379
424
|
def apply_parquet_target():
|
|
380
425
|
graph.add_step(
|
|
381
426
|
"storey.ParquetTarget",
|
|
@@ -625,6 +670,11 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
625
670
|
error = event.get("error")
|
|
626
671
|
if error:
|
|
627
672
|
self.error_count[endpoint_id] += 1
|
|
673
|
+
mlrun.model_monitoring.prometheus.write_errors(
|
|
674
|
+
project=self.project,
|
|
675
|
+
endpoint_id=event["endpoint_id"],
|
|
676
|
+
model_name=event["model"],
|
|
677
|
+
)
|
|
628
678
|
raise mlrun.errors.MLRunInvalidArgumentError(str(error))
|
|
629
679
|
|
|
630
680
|
# Validate event fields
|
|
@@ -1078,12 +1128,104 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
1078
1128
|
return event
|
|
1079
1129
|
|
|
1080
1130
|
|
|
1131
|
+
class EventRouting(mlrun.feature_store.steps.MapClass):
|
|
1132
|
+
"""
|
|
1133
|
+
Route the event according to the configured path under event.path. Please note that this step returns the result
|
|
1134
|
+
to the caller. At the moment there are several paths:
|
|
1135
|
+
|
|
1136
|
+
- /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
|
|
1137
|
+
to scrape the results from the monitoring stream memory.
|
|
1138
|
+
|
|
1139
|
+
- /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
|
|
1140
|
+
statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
|
|
1141
|
+
metrics.
|
|
1142
|
+
|
|
1143
|
+
- /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
|
|
1144
|
+
|
|
1145
|
+
"""
|
|
1146
|
+
|
|
1147
|
+
def __init__(
|
|
1148
|
+
self,
|
|
1149
|
+
project: str,
|
|
1150
|
+
**kwargs,
|
|
1151
|
+
):
|
|
1152
|
+
super().__init__(**kwargs)
|
|
1153
|
+
self.project: str = project
|
|
1154
|
+
|
|
1155
|
+
def do(self, event):
|
|
1156
|
+
if event.path == "/model-monitoring-metrics":
|
|
1157
|
+
# Return a parsed Prometheus registry file
|
|
1158
|
+
event.body = mlrun.model_monitoring.prometheus.get_registry()
|
|
1159
|
+
elif event.path == "/monitoring-batch-metrics":
|
|
1160
|
+
# Update statistical metrics
|
|
1161
|
+
for event_metric in event.body:
|
|
1162
|
+
mlrun.model_monitoring.prometheus.write_drift_metrics(
|
|
1163
|
+
project=self.project,
|
|
1164
|
+
endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
|
|
1165
|
+
metric=event_metric[EventFieldType.METRIC],
|
|
1166
|
+
value=event_metric[EventFieldType.VALUE],
|
|
1167
|
+
)
|
|
1168
|
+
elif event.path == "/monitoring-drift-status":
|
|
1169
|
+
# Update drift status
|
|
1170
|
+
mlrun.model_monitoring.prometheus.write_drift_status(
|
|
1171
|
+
project=self.project,
|
|
1172
|
+
endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
|
|
1173
|
+
drift_status=event.body[EventFieldType.DRIFT_STATUS],
|
|
1174
|
+
)
|
|
1175
|
+
|
|
1176
|
+
return event
|
|
1177
|
+
|
|
1178
|
+
|
|
1179
|
+
class IncCounter(mlrun.feature_store.steps.MapClass):
|
|
1180
|
+
"""Increase prediction counter by 1 and update the total latency value"""
|
|
1181
|
+
|
|
1182
|
+
def __init__(self, project: str, **kwargs):
|
|
1183
|
+
super().__init__(**kwargs)
|
|
1184
|
+
self.project: str = project
|
|
1185
|
+
|
|
1186
|
+
def do(self, event):
|
|
1187
|
+
# Compute prediction per second
|
|
1188
|
+
|
|
1189
|
+
mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
|
|
1190
|
+
project=self.project,
|
|
1191
|
+
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1192
|
+
latency=event[EventFieldType.LATENCY],
|
|
1193
|
+
model_name=event[EventFieldType.MODEL],
|
|
1194
|
+
endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
|
|
1195
|
+
)
|
|
1196
|
+
|
|
1197
|
+
return event
|
|
1198
|
+
|
|
1199
|
+
|
|
1200
|
+
class RecordFeatures(mlrun.feature_store.steps.MapClass):
|
|
1201
|
+
"""Record a sample of features and labels in Prometheus registry"""
|
|
1202
|
+
|
|
1203
|
+
def __init__(self, project: str, **kwargs):
|
|
1204
|
+
super().__init__(**kwargs)
|
|
1205
|
+
self.project: str = project
|
|
1206
|
+
|
|
1207
|
+
def do(self, event):
|
|
1208
|
+
# Generate a dictionary of features and predictions
|
|
1209
|
+
features = {
|
|
1210
|
+
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
1211
|
+
**event[EventFieldType.NAMED_FEATURES],
|
|
1212
|
+
}
|
|
1213
|
+
|
|
1214
|
+
mlrun.model_monitoring.prometheus.write_income_features(
|
|
1215
|
+
project=self.project,
|
|
1216
|
+
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1217
|
+
features=features,
|
|
1218
|
+
)
|
|
1219
|
+
|
|
1220
|
+
return event
|
|
1221
|
+
|
|
1222
|
+
|
|
1081
1223
|
def update_endpoint_record(
|
|
1082
1224
|
project: str,
|
|
1083
1225
|
endpoint_id: str,
|
|
1084
1226
|
attributes: dict,
|
|
1085
1227
|
):
|
|
1086
|
-
model_endpoint_store = get_model_endpoint_store(
|
|
1228
|
+
model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
|
|
1087
1229
|
project=project,
|
|
1088
1230
|
)
|
|
1089
1231
|
|
|
@@ -1093,7 +1235,7 @@ def update_endpoint_record(
|
|
|
1093
1235
|
|
|
1094
1236
|
|
|
1095
1237
|
def get_endpoint_record(project: str, endpoint_id: str):
|
|
1096
|
-
model_endpoint_store = get_model_endpoint_store(
|
|
1238
|
+
model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
|
|
1097
1239
|
project=project,
|
|
1098
1240
|
)
|
|
1099
1241
|
return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Copyright 2023 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
#
|
|
15
|
+
|
|
16
|
+
from typing import Union
|
|
17
|
+
|
|
18
|
+
import mlrun.common.schemas.schedule
|
|
19
|
+
import mlrun.model
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TrackingPolicy(mlrun.model.ModelObj):
|
|
23
|
+
"""
|
|
24
|
+
Modified model monitoring configurations. By using TrackingPolicy, the user can apply his model monitoring
|
|
25
|
+
requirements, such as setting the scheduling policy of the model monitoring batch job or changing the image of the
|
|
26
|
+
model monitoring stream.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
_dict_fields = [
|
|
30
|
+
"default_batch_image",
|
|
31
|
+
"stream_image",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
def __init__(
|
|
35
|
+
self,
|
|
36
|
+
default_batch_intervals: Union[
|
|
37
|
+
mlrun.common.schemas.schedule.ScheduleCronTrigger, str
|
|
38
|
+
] = mlrun.common.schemas.schedule.ScheduleCronTrigger(minute="0", hour="*/1"),
|
|
39
|
+
default_batch_image: str = "mlrun/mlrun",
|
|
40
|
+
stream_image: str = "mlrun/mlrun",
|
|
41
|
+
):
|
|
42
|
+
"""
|
|
43
|
+
Initialize TrackingPolicy object.
|
|
44
|
+
:param default_batch_intervals: Model monitoring batch scheduling policy. By default, executed on the hour
|
|
45
|
+
every hour. Can be either a string or a ScheduleCronTrigger object. The
|
|
46
|
+
string time format is based on ScheduleCronTrigger expression:
|
|
47
|
+
minute, hour, day of month, month, day of week. It will be converted into
|
|
48
|
+
a ScheduleCronTrigger object.
|
|
49
|
+
:param default_batch_image: The default image of the model monitoring batch job. By default, the image
|
|
50
|
+
is mlrun/mlrun.
|
|
51
|
+
:param stream_image: The image of the model monitoring stream real-time function. By default,
|
|
52
|
+
the image is mlrun/mlrun.
|
|
53
|
+
"""
|
|
54
|
+
if isinstance(default_batch_intervals, str):
|
|
55
|
+
default_batch_intervals = (
|
|
56
|
+
mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
|
|
57
|
+
default_batch_intervals
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
self.default_batch_intervals = default_batch_intervals
|
|
61
|
+
self.default_batch_image = default_batch_image
|
|
62
|
+
self.stream_image = stream_image
|
|
63
|
+
|
|
64
|
+
@classmethod
|
|
65
|
+
def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
|
|
66
|
+
new_obj = super().from_dict(
|
|
67
|
+
struct, fields=cls._dict_fields, deprecated_fields=deprecated_fields
|
|
68
|
+
)
|
|
69
|
+
# Convert default batch interval into ScheduleCronTrigger object
|
|
70
|
+
if (
|
|
71
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
|
|
72
|
+
in struct
|
|
73
|
+
):
|
|
74
|
+
if isinstance(
|
|
75
|
+
struct[
|
|
76
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
|
|
77
|
+
],
|
|
78
|
+
str,
|
|
79
|
+
):
|
|
80
|
+
new_obj.default_batch_intervals = mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
|
|
81
|
+
struct[
|
|
82
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
|
|
83
|
+
]
|
|
84
|
+
)
|
|
85
|
+
else:
|
|
86
|
+
new_obj.default_batch_intervals = mlrun.common.schemas.schedule.ScheduleCronTrigger.parse_obj(
|
|
87
|
+
struct[
|
|
88
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
|
|
89
|
+
]
|
|
90
|
+
)
|
|
91
|
+
return new_obj
|
|
92
|
+
|
|
93
|
+
def to_dict(self, fields=None, exclude=None):
|
|
94
|
+
struct = super().to_dict(
|
|
95
|
+
fields,
|
|
96
|
+
exclude=[
|
|
97
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
|
|
98
|
+
],
|
|
99
|
+
)
|
|
100
|
+
if self.default_batch_intervals:
|
|
101
|
+
struct[
|
|
102
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
|
|
103
|
+
] = self.default_batch_intervals.dict()
|
|
104
|
+
return struct
|
mlrun/package/packager.py
CHANGED
|
@@ -107,8 +107,7 @@ class Packager(ABC, metaclass=_PackagerMeta):
|
|
|
107
107
|
|
|
108
108
|
Preferably, each packager should handle a single type of object.
|
|
109
109
|
|
|
110
|
-
Linking Artifacts (extra data)
|
|
111
|
-
------------------------------
|
|
110
|
+
**Linking Artifacts (extra data)**
|
|
112
111
|
|
|
113
112
|
In order to link between packages (using the extra data or metrics spec attributes of an artifact), you should use
|
|
114
113
|
the key as if it exists and as value ellipses (...). The manager will link all packages once it is done packing.
|
|
@@ -118,8 +117,7 @@ class Packager(ABC, metaclass=_PackagerMeta):
|
|
|
118
117
|
artifact = Artifact(key="my_artifact")
|
|
119
118
|
artifact.spec.extra_data = {key: ... for key in extra_data}
|
|
120
119
|
|
|
121
|
-
Clearing Outputs
|
|
122
|
-
----------------
|
|
120
|
+
**Clearing Outputs**
|
|
123
121
|
|
|
124
122
|
Some of the packagers may produce files and temporary directories that should be deleted once done with logging the
|
|
125
123
|
artifact. The packager can mark paths of files and directories to delete after logging using the class method
|
|
@@ -131,15 +129,15 @@ class Packager(ABC, metaclass=_PackagerMeta):
|
|
|
131
129
|
with open("./some_file.txt", "w") as file:
|
|
132
130
|
file.write("Pack me")
|
|
133
131
|
artifact = Artifact(key="my_artifact")
|
|
134
|
-
cls.
|
|
132
|
+
cls.add_future_clearing_path(path="./some_file.txt")
|
|
135
133
|
return artifact, None
|
|
136
134
|
"""
|
|
137
135
|
|
|
138
|
-
|
|
136
|
+
#: The type of object this packager can pack and unpack.
|
|
139
137
|
PACKABLE_OBJECT_TYPE: Type = ...
|
|
140
138
|
|
|
141
|
-
|
|
142
|
-
PRIORITY = ...
|
|
139
|
+
#: The priority of this packager in the packagers collection of the manager (lower is better).
|
|
140
|
+
PRIORITY: int = ...
|
|
143
141
|
|
|
144
142
|
# List of all paths to be deleted by the manager of this packager post logging the packages:
|
|
145
143
|
_CLEARING_PATH_LIST: List[str] = []
|