mlrun 1.7.0rc4__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +25 -111
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +38 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +41 -47
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +68 -0
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/{model_monitoring/stores/models/sqlite.py → common/formatters/artifact.py} +6 -8
- mlrun/common/formatters/base.py +78 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +25 -4
- mlrun/common/schemas/alert.py +203 -0
- mlrun/common/schemas/api_gateway.py +148 -0
- mlrun/common/schemas/artifact.py +15 -5
- mlrun/common/schemas/auth.py +8 -2
- mlrun/common/schemas/client_spec.py +2 -0
- mlrun/common/schemas/frontend_spec.py +1 -0
- mlrun/common/schemas/function.py +4 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +19 -3
- mlrun/common/schemas/model_monitoring/constants.py +96 -26
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +0 -9
- mlrun/common/schemas/project.py +22 -21
- mlrun/common/types.py +7 -1
- mlrun/config.py +87 -19
- mlrun/data_types/data_types.py +4 -0
- mlrun/data_types/to_pandas.py +9 -9
- mlrun/datastore/__init__.py +5 -8
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +4 -5
- mlrun/datastore/base.py +69 -30
- mlrun/datastore/datastore.py +10 -2
- mlrun/datastore/datastore_profile.py +90 -6
- mlrun/datastore/google_cloud_storage.py +1 -1
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +2 -2
- mlrun/datastore/s3.py +5 -0
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +172 -44
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +285 -41
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +27 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +149 -14
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +608 -178
- mlrun/db/nopdb.py +191 -7
- mlrun/errors.py +11 -0
- mlrun/execution.py +37 -20
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +21 -52
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +2 -1
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +9 -9
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +34 -24
- mlrun/feature_store/steps.py +30 -19
- mlrun/features.py +4 -13
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -1
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +10 -11
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +8 -6
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +9 -3
- mlrun/launcher/remote.py +9 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +58 -19
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +127 -301
- mlrun/model_monitoring/application.py +5 -296
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +224 -93
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +30 -36
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +58 -32
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +302 -155
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +100 -7
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +93 -228
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +152 -124
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +21 -202
- mlrun/projects/operations.py +30 -16
- mlrun/projects/pipelines.py +92 -99
- mlrun/projects/project.py +757 -268
- mlrun/render.py +15 -14
- mlrun/run.py +160 -162
- mlrun/runtimes/__init__.py +55 -3
- mlrun/runtimes/base.py +33 -19
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/kubejob.py +28 -122
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +709 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +523 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +98 -58
- mlrun/runtimes/nuclio/serving.py +36 -42
- mlrun/runtimes/pod.py +196 -45
- mlrun/runtimes/remotesparkjob.py +1 -1
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +6 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +7 -4
- mlrun/serving/server.py +7 -8
- mlrun/serving/states.py +73 -43
- mlrun/serving/v2_serving.py +8 -7
- mlrun/track/tracker.py +2 -1
- mlrun/utils/async_http.py +25 -5
- mlrun/utils/helpers.py +141 -75
- mlrun/utils/http.py +1 -1
- mlrun/utils/logger.py +39 -7
- mlrun/utils/notifications/notification/__init__.py +14 -9
- mlrun/utils/notifications/notification/base.py +12 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +101 -21
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/notifications/notification_pusher.py +147 -16
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +0 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +33 -18
- mlrun-1.7.0rc20.dist-info/RECORD +353 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +1 -1
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc4.dist-info/RECORD +0 -321
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
|
@@ -24,11 +24,12 @@ import mlrun
|
|
|
24
24
|
import mlrun.common.model_monitoring.helpers
|
|
25
25
|
import mlrun.config
|
|
26
26
|
import mlrun.datastore.targets
|
|
27
|
+
import mlrun.feature_store as fstore
|
|
27
28
|
import mlrun.feature_store.steps
|
|
29
|
+
import mlrun.model_monitoring.db
|
|
28
30
|
import mlrun.model_monitoring.prometheus
|
|
29
31
|
import mlrun.serving.states
|
|
30
32
|
import mlrun.utils
|
|
31
|
-
import mlrun.utils.v3io_clients
|
|
32
33
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
33
34
|
EventFieldType,
|
|
34
35
|
EventKeyMetrics,
|
|
@@ -36,6 +37,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
|
|
|
36
37
|
FileTargetKind,
|
|
37
38
|
ModelEndpointTarget,
|
|
38
39
|
ProjectSecretKeys,
|
|
40
|
+
PrometheusEndpoints,
|
|
39
41
|
)
|
|
40
42
|
from mlrun.utils import logger
|
|
41
43
|
|
|
@@ -75,6 +77,7 @@ class EventStreamProcessor:
|
|
|
75
77
|
)
|
|
76
78
|
|
|
77
79
|
self.storage_options = None
|
|
80
|
+
self.tsdb_configurations = {}
|
|
78
81
|
if not mlrun.mlconf.is_ce_mode():
|
|
79
82
|
self._initialize_v3io_configurations(
|
|
80
83
|
model_monitoring_access_key=model_monitoring_access_key
|
|
@@ -133,33 +136,38 @@ class EventStreamProcessor:
|
|
|
133
136
|
self.tsdb_batching_max_events = tsdb_batching_max_events
|
|
134
137
|
self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
|
|
135
138
|
|
|
136
|
-
def apply_monitoring_serving_graph(
|
|
139
|
+
def apply_monitoring_serving_graph(
|
|
140
|
+
self,
|
|
141
|
+
fn: mlrun.runtimes.ServingRuntime,
|
|
142
|
+
tsdb_service_provider: typing.Optional[typing.Callable] = None,
|
|
143
|
+
) -> None:
|
|
137
144
|
"""
|
|
138
|
-
Apply monitoring serving graph to a given serving function. The following serving graph includes about
|
|
139
|
-
of different operations that are executed on the events from
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
145
|
+
Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
|
|
146
|
+
parts, each of which includes several steps of different operations that are executed on the events from
|
|
147
|
+
the model server.
|
|
148
|
+
Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
|
|
149
|
+
metrics from the model server.
|
|
150
|
+
In the first part, the serving graph processes the event and splits it into sub-events. This part also includes
|
|
151
|
+
validation of the event data and adding important details to the event such as endpoint_id.
|
|
152
|
+
In the next parts, the serving graph stores data to 3 different targets:
|
|
153
|
+
1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
|
|
154
|
+
time per endpoint. For example, the amount of predictions of endpoint x in the last 5 min. The model
|
|
155
|
+
endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
|
|
156
|
+
represents sample statistics from the training data. If the target is of type KV, then the model endpoints
|
|
157
|
+
table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
|
|
158
|
+
SQL, then the table is stored within the database that was defined in the provided connection string.
|
|
159
|
+
2. TSDB: live data of different key metric dictionaries in tsdb target.
|
|
160
|
+
This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
|
|
152
161
|
can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
|
|
153
162
|
3 different key metric dictionaries: base_metrics (average latency and predictions over time),
|
|
154
163
|
endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
|
|
160
|
-
mlrun.mlconf.model_endpoint_monitoring.user_space.
|
|
164
|
+
3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
|
|
165
|
+
the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
|
|
166
|
+
the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
|
|
167
|
+
using CE, the parquet target path is based on the defined MLRun artifact path.
|
|
161
168
|
|
|
162
169
|
:param fn: A serving function.
|
|
170
|
+
:param tsdb_service_provider: An optional callable function that provides the TSDB connection string.
|
|
163
171
|
"""
|
|
164
172
|
|
|
165
173
|
graph = typing.cast(
|
|
@@ -167,7 +175,7 @@ class EventStreamProcessor:
|
|
|
167
175
|
fn.set_topology(mlrun.serving.states.StepKinds.flow),
|
|
168
176
|
)
|
|
169
177
|
|
|
170
|
-
#
|
|
178
|
+
# Event routing based on the provided path
|
|
171
179
|
def apply_event_routing():
|
|
172
180
|
typing.cast(
|
|
173
181
|
mlrun.serving.TaskStep,
|
|
@@ -180,20 +188,20 @@ class EventStreamProcessor:
|
|
|
180
188
|
|
|
181
189
|
apply_event_routing()
|
|
182
190
|
|
|
183
|
-
#
|
|
191
|
+
# Filter out events with '-' in the path basename from going forward
|
|
184
192
|
# through the next steps of the stream graph
|
|
185
193
|
def apply_storey_filter_stream_events():
|
|
186
|
-
#
|
|
194
|
+
# Filter events with Prometheus endpoints path
|
|
187
195
|
graph.add_step(
|
|
188
196
|
"storey.Filter",
|
|
189
197
|
"filter_stream_event",
|
|
190
|
-
_fn="(
|
|
198
|
+
_fn=f"(event.path not in {PrometheusEndpoints.list()})",
|
|
191
199
|
full_event=True,
|
|
192
200
|
)
|
|
193
201
|
|
|
194
202
|
apply_storey_filter_stream_events()
|
|
195
203
|
|
|
196
|
-
#
|
|
204
|
+
# Process endpoint event: splitting into sub-events and validate event data
|
|
197
205
|
def apply_process_endpoint_event():
|
|
198
206
|
graph.add_step(
|
|
199
207
|
"ProcessEndpointEvent",
|
|
@@ -204,7 +212,7 @@ class EventStreamProcessor:
|
|
|
204
212
|
|
|
205
213
|
apply_process_endpoint_event()
|
|
206
214
|
|
|
207
|
-
#
|
|
215
|
+
# Applying Storey operations of filtering and flatten
|
|
208
216
|
def apply_storey_filter_and_flatmap():
|
|
209
217
|
# Remove none values from each event
|
|
210
218
|
graph.add_step(
|
|
@@ -221,7 +229,7 @@ class EventStreamProcessor:
|
|
|
221
229
|
|
|
222
230
|
apply_storey_filter_and_flatmap()
|
|
223
231
|
|
|
224
|
-
#
|
|
232
|
+
# Validating feature names and map each feature to its value
|
|
225
233
|
def apply_map_feature_names():
|
|
226
234
|
graph.add_step(
|
|
227
235
|
"MapFeatureNames",
|
|
@@ -233,9 +241,9 @@ class EventStreamProcessor:
|
|
|
233
241
|
|
|
234
242
|
apply_map_feature_names()
|
|
235
243
|
|
|
236
|
-
#
|
|
244
|
+
# Calculate number of predictions and average latency
|
|
237
245
|
def apply_storey_aggregations():
|
|
238
|
-
#
|
|
246
|
+
# Calculate number of predictions for each window (5 min and 1 hour by default)
|
|
239
247
|
graph.add_step(
|
|
240
248
|
class_name="storey.AggregateByKey",
|
|
241
249
|
aggregates=[
|
|
@@ -253,7 +261,7 @@ class EventStreamProcessor:
|
|
|
253
261
|
table=".",
|
|
254
262
|
key_field=EventFieldType.ENDPOINT_ID,
|
|
255
263
|
)
|
|
256
|
-
#
|
|
264
|
+
# Calculate average latency time for each window (5 min and 1 hour by default)
|
|
257
265
|
graph.add_step(
|
|
258
266
|
class_name="storey.Rename",
|
|
259
267
|
mapping={
|
|
@@ -266,8 +274,8 @@ class EventStreamProcessor:
|
|
|
266
274
|
|
|
267
275
|
apply_storey_aggregations()
|
|
268
276
|
|
|
269
|
-
#
|
|
270
|
-
#
|
|
277
|
+
# KV/SQL branch
|
|
278
|
+
# Filter relevant keys from the event before writing the data into the database table
|
|
271
279
|
def apply_process_before_endpoint_update():
|
|
272
280
|
graph.add_step(
|
|
273
281
|
"ProcessBeforeEndpointUpdate",
|
|
@@ -277,7 +285,7 @@ class EventStreamProcessor:
|
|
|
277
285
|
|
|
278
286
|
apply_process_before_endpoint_update()
|
|
279
287
|
|
|
280
|
-
#
|
|
288
|
+
# Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
|
|
281
289
|
# about average latency and the amount of predictions over time
|
|
282
290
|
def apply_update_endpoint():
|
|
283
291
|
graph.add_step(
|
|
@@ -290,7 +298,7 @@ class EventStreamProcessor:
|
|
|
290
298
|
|
|
291
299
|
apply_update_endpoint()
|
|
292
300
|
|
|
293
|
-
#
|
|
301
|
+
# (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
|
|
294
302
|
# which will be used by Grafana monitoring dashboards
|
|
295
303
|
def apply_infer_schema():
|
|
296
304
|
graph.add_step(
|
|
@@ -305,7 +313,7 @@ class EventStreamProcessor:
|
|
|
305
313
|
if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
|
|
306
314
|
apply_infer_schema()
|
|
307
315
|
|
|
308
|
-
#
|
|
316
|
+
# Emits the event in window size of events based on sample_window size (10 by default)
|
|
309
317
|
def apply_storey_sample_window():
|
|
310
318
|
graph.add_step(
|
|
311
319
|
"storey.steps.SampleWindow",
|
|
@@ -317,85 +325,16 @@ class EventStreamProcessor:
|
|
|
317
325
|
|
|
318
326
|
apply_storey_sample_window()
|
|
319
327
|
|
|
320
|
-
#
|
|
321
|
-
# Steps 20-21 - Prometheus branch
|
|
328
|
+
# TSDB branch (skip to Prometheus if in CE env)
|
|
322
329
|
if not mlrun.mlconf.is_ce_mode():
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
# Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
|
|
326
|
-
# stats and details about the events
|
|
327
|
-
def apply_process_before_tsdb():
|
|
328
|
-
graph.add_step(
|
|
329
|
-
"ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
|
|
330
|
-
)
|
|
331
|
-
|
|
332
|
-
apply_process_before_tsdb()
|
|
333
|
-
|
|
334
|
-
# Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
|
|
335
|
-
def apply_filter_and_unpacked_keys(name, keys):
|
|
336
|
-
graph.add_step(
|
|
337
|
-
"FilterAndUnpackKeys",
|
|
338
|
-
name=name,
|
|
339
|
-
after="ProcessBeforeTSDB",
|
|
340
|
-
keys=[keys],
|
|
341
|
-
)
|
|
342
|
-
|
|
343
|
-
def apply_tsdb_target(name, after):
|
|
344
|
-
graph.add_step(
|
|
345
|
-
"storey.TSDBTarget",
|
|
346
|
-
name=name,
|
|
347
|
-
after=after,
|
|
348
|
-
path=self.tsdb_path,
|
|
349
|
-
rate="10/m",
|
|
350
|
-
time_col=EventFieldType.TIMESTAMP,
|
|
351
|
-
container=self.tsdb_container,
|
|
352
|
-
access_key=self.v3io_access_key,
|
|
353
|
-
v3io_frames=self.v3io_framesd,
|
|
354
|
-
infer_columns_from_data=True,
|
|
355
|
-
index_cols=[
|
|
356
|
-
EventFieldType.ENDPOINT_ID,
|
|
357
|
-
EventFieldType.RECORD_TYPE,
|
|
358
|
-
EventFieldType.ENDPOINT_TYPE,
|
|
359
|
-
],
|
|
360
|
-
max_events=self.tsdb_batching_max_events,
|
|
361
|
-
flush_after_seconds=self.tsdb_batching_timeout_secs,
|
|
362
|
-
key=EventFieldType.ENDPOINT_ID,
|
|
363
|
-
)
|
|
364
|
-
|
|
365
|
-
# Steps 13-14 - unpacked base_metrics dictionary
|
|
366
|
-
apply_filter_and_unpacked_keys(
|
|
367
|
-
name="FilterAndUnpackKeys1",
|
|
368
|
-
keys=EventKeyMetrics.BASE_METRICS,
|
|
369
|
-
)
|
|
370
|
-
apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
|
|
371
|
-
|
|
372
|
-
# Steps 15-16 - unpacked endpoint_features dictionary
|
|
373
|
-
apply_filter_and_unpacked_keys(
|
|
374
|
-
name="FilterAndUnpackKeys2",
|
|
375
|
-
keys=EventKeyMetrics.ENDPOINT_FEATURES,
|
|
376
|
-
)
|
|
377
|
-
apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
|
|
378
|
-
|
|
379
|
-
# Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
|
|
380
|
-
apply_filter_and_unpacked_keys(
|
|
381
|
-
name="FilterAndUnpackKeys3",
|
|
382
|
-
keys=EventKeyMetrics.CUSTOM_METRICS,
|
|
330
|
+
tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
331
|
+
project=self.project, secret_provider=tsdb_service_provider
|
|
383
332
|
)
|
|
333
|
+
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
384
334
|
|
|
385
|
-
def apply_storey_filter():
|
|
386
|
-
graph.add_step(
|
|
387
|
-
"storey.Filter",
|
|
388
|
-
"FilterNotNone",
|
|
389
|
-
after="FilterAndUnpackKeys3",
|
|
390
|
-
_fn="(event is not None)",
|
|
391
|
-
)
|
|
392
|
-
|
|
393
|
-
apply_storey_filter()
|
|
394
|
-
apply_tsdb_target(name="tsdb3", after="FilterNotNone")
|
|
395
335
|
else:
|
|
396
|
-
# Prometheus
|
|
397
|
-
|
|
398
|
-
# Step 20 - Increase the prediction counter by 1 and update the latency value
|
|
336
|
+
# Prometheus
|
|
337
|
+
# Increase the prediction counter by 1 and update the latency value
|
|
399
338
|
graph.add_step(
|
|
400
339
|
"IncCounter",
|
|
401
340
|
name="IncCounter",
|
|
@@ -403,7 +342,7 @@ class EventStreamProcessor:
|
|
|
403
342
|
project=self.project,
|
|
404
343
|
)
|
|
405
344
|
|
|
406
|
-
#
|
|
345
|
+
# Record a sample of features and labels
|
|
407
346
|
def apply_record_features_to_prometheus():
|
|
408
347
|
graph.add_step(
|
|
409
348
|
"RecordFeatures",
|
|
@@ -414,8 +353,8 @@ class EventStreamProcessor:
|
|
|
414
353
|
|
|
415
354
|
apply_record_features_to_prometheus()
|
|
416
355
|
|
|
417
|
-
#
|
|
418
|
-
#
|
|
356
|
+
# Parquet branch
|
|
357
|
+
# Filter and validate different keys before writing the data to Parquet target
|
|
419
358
|
def apply_process_before_parquet():
|
|
420
359
|
graph.add_step(
|
|
421
360
|
"ProcessBeforeParquet",
|
|
@@ -426,7 +365,7 @@ class EventStreamProcessor:
|
|
|
426
365
|
|
|
427
366
|
apply_process_before_parquet()
|
|
428
367
|
|
|
429
|
-
#
|
|
368
|
+
# Write the Parquet target file, partitioned by key (endpoint_id) and time.
|
|
430
369
|
def apply_parquet_target():
|
|
431
370
|
graph.add_step(
|
|
432
371
|
"storey.ParquetTarget",
|
|
@@ -500,76 +439,6 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
|
|
|
500
439
|
return e
|
|
501
440
|
|
|
502
441
|
|
|
503
|
-
class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
|
|
504
|
-
def __init__(self, **kwargs):
|
|
505
|
-
"""
|
|
506
|
-
Process the data before writing to TSDB. This step creates a dictionary that includes 3 different dictionaries
|
|
507
|
-
that each one of them contains important details and stats about the events:
|
|
508
|
-
1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
|
|
509
|
-
storey.AggregateByKey which was executed in step 5.
|
|
510
|
-
2. endpoint_features: feature names and values along with the prediction names and value.
|
|
511
|
-
3. custom_metric (opt): optional metrics provided by the user.
|
|
512
|
-
|
|
513
|
-
:returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
|
|
514
|
-
|
|
515
|
-
"""
|
|
516
|
-
super().__init__(**kwargs)
|
|
517
|
-
|
|
518
|
-
def do(self, event):
|
|
519
|
-
# Compute prediction per second
|
|
520
|
-
event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
|
|
521
|
-
float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
|
|
522
|
-
)
|
|
523
|
-
base_fields = [
|
|
524
|
-
EventFieldType.TIMESTAMP,
|
|
525
|
-
EventFieldType.ENDPOINT_ID,
|
|
526
|
-
EventFieldType.ENDPOINT_TYPE,
|
|
527
|
-
]
|
|
528
|
-
|
|
529
|
-
# Getting event timestamp and endpoint_id
|
|
530
|
-
base_event = {k: event[k] for k in base_fields}
|
|
531
|
-
|
|
532
|
-
# base_metrics includes the stats about the average latency and the amount of predictions over time
|
|
533
|
-
base_metrics = {
|
|
534
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
|
|
535
|
-
EventLiveStats.PREDICTIONS_PER_SECOND: event[
|
|
536
|
-
EventLiveStats.PREDICTIONS_PER_SECOND
|
|
537
|
-
],
|
|
538
|
-
EventLiveStats.PREDICTIONS_COUNT_5M: event[
|
|
539
|
-
EventLiveStats.PREDICTIONS_COUNT_5M
|
|
540
|
-
],
|
|
541
|
-
EventLiveStats.PREDICTIONS_COUNT_1H: event[
|
|
542
|
-
EventLiveStats.PREDICTIONS_COUNT_1H
|
|
543
|
-
],
|
|
544
|
-
EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
|
|
545
|
-
EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
|
|
546
|
-
**base_event,
|
|
547
|
-
}
|
|
548
|
-
|
|
549
|
-
# endpoint_features includes the event values of each feature and prediction
|
|
550
|
-
endpoint_features = {
|
|
551
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
|
|
552
|
-
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
553
|
-
**event[EventFieldType.NAMED_FEATURES],
|
|
554
|
-
**base_event,
|
|
555
|
-
}
|
|
556
|
-
# Create a dictionary that includes both base_metrics and endpoint_features
|
|
557
|
-
processed = {
|
|
558
|
-
EventKeyMetrics.BASE_METRICS: base_metrics,
|
|
559
|
-
EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
|
|
560
|
-
}
|
|
561
|
-
|
|
562
|
-
# If metrics provided, add another dictionary if custom_metrics values
|
|
563
|
-
if event[EventFieldType.METRICS]:
|
|
564
|
-
processed[EventKeyMetrics.CUSTOM_METRICS] = {
|
|
565
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
|
|
566
|
-
**event[EventFieldType.METRICS],
|
|
567
|
-
**base_event,
|
|
568
|
-
}
|
|
569
|
-
|
|
570
|
-
return processed
|
|
571
|
-
|
|
572
|
-
|
|
573
442
|
class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
|
|
574
443
|
def __init__(self, **kwargs):
|
|
575
444
|
"""
|
|
@@ -587,6 +456,8 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
|
|
|
587
456
|
for key in [
|
|
588
457
|
EventFieldType.FEATURES,
|
|
589
458
|
EventFieldType.NAMED_FEATURES,
|
|
459
|
+
EventFieldType.PREDICTION,
|
|
460
|
+
EventFieldType.NAMED_PREDICTIONS,
|
|
590
461
|
]:
|
|
591
462
|
event.pop(key, None)
|
|
592
463
|
|
|
@@ -802,7 +673,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
802
673
|
# left them
|
|
803
674
|
if endpoint_id not in self.endpoints:
|
|
804
675
|
logger.info("Trying to resume state", endpoint_id=endpoint_id)
|
|
805
|
-
endpoint_record = get_endpoint_record(
|
|
676
|
+
endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
|
|
806
677
|
project=self.project,
|
|
807
678
|
endpoint_id=endpoint_id,
|
|
808
679
|
)
|
|
@@ -848,36 +719,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
|
|
|
848
719
|
return False
|
|
849
720
|
|
|
850
721
|
|
|
851
|
-
class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
|
|
852
|
-
def __init__(self, keys, **kwargs):
|
|
853
|
-
"""
|
|
854
|
-
Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
|
|
855
|
-
or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
|
|
856
|
-
|
|
857
|
-
:param keys: list of key metrics.
|
|
858
|
-
|
|
859
|
-
:returns: An unpacked dictionary of event filtered by the provided key metrics.
|
|
860
|
-
"""
|
|
861
|
-
super().__init__(**kwargs)
|
|
862
|
-
self.keys = keys
|
|
863
|
-
|
|
864
|
-
def do(self, event):
|
|
865
|
-
# Keep only the relevant dictionary based on the provided keys
|
|
866
|
-
new_event = {}
|
|
867
|
-
for key in self.keys:
|
|
868
|
-
if key in event:
|
|
869
|
-
new_event[key] = event[key]
|
|
870
|
-
|
|
871
|
-
# Create unpacked dictionary
|
|
872
|
-
unpacked = {}
|
|
873
|
-
for key in new_event.keys():
|
|
874
|
-
if key in self.keys:
|
|
875
|
-
unpacked = {**unpacked, **new_event[key]}
|
|
876
|
-
else:
|
|
877
|
-
unpacked[key] = new_event[key]
|
|
878
|
-
return unpacked if unpacked else None
|
|
879
|
-
|
|
880
|
-
|
|
881
722
|
class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
882
723
|
def __init__(
|
|
883
724
|
self,
|
|
@@ -931,9 +772,11 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
931
772
|
def do(self, event: dict):
|
|
932
773
|
endpoint_id = event[EventFieldType.ENDPOINT_ID]
|
|
933
774
|
|
|
775
|
+
feature_values = event[EventFieldType.FEATURES]
|
|
776
|
+
label_values = event[EventFieldType.PREDICTION]
|
|
934
777
|
# Get feature names and label columns
|
|
935
778
|
if endpoint_id not in self.feature_names:
|
|
936
|
-
endpoint_record = get_endpoint_record(
|
|
779
|
+
endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
|
|
937
780
|
project=self.project,
|
|
938
781
|
endpoint_id=endpoint_id,
|
|
939
782
|
)
|
|
@@ -966,6 +809,12 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
966
809
|
},
|
|
967
810
|
)
|
|
968
811
|
|
|
812
|
+
update_monitoring_feature_set(
|
|
813
|
+
endpoint_record=endpoint_record,
|
|
814
|
+
feature_names=feature_names,
|
|
815
|
+
feature_values=feature_values,
|
|
816
|
+
)
|
|
817
|
+
|
|
969
818
|
# Similar process with label columns
|
|
970
819
|
if not label_columns and self._infer_columns_from_data:
|
|
971
820
|
label_columns = self._infer_label_columns_from_data(event)
|
|
@@ -984,6 +833,11 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
984
833
|
endpoint_id=endpoint_id,
|
|
985
834
|
attributes={EventFieldType.LABEL_NAMES: json.dumps(label_columns)},
|
|
986
835
|
)
|
|
836
|
+
update_monitoring_feature_set(
|
|
837
|
+
endpoint_record=endpoint_record,
|
|
838
|
+
feature_names=label_columns,
|
|
839
|
+
feature_values=label_values,
|
|
840
|
+
)
|
|
987
841
|
|
|
988
842
|
self.label_columns[endpoint_id] = label_columns
|
|
989
843
|
self.feature_names[endpoint_id] = feature_names
|
|
@@ -1001,7 +855,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
1001
855
|
|
|
1002
856
|
# Add feature_name:value pairs along with a mapping dictionary of all of these pairs
|
|
1003
857
|
feature_names = self.feature_names[endpoint_id]
|
|
1004
|
-
feature_values = event[EventFieldType.FEATURES]
|
|
1005
858
|
self._map_dictionary_values(
|
|
1006
859
|
event=event,
|
|
1007
860
|
named_iters=feature_names,
|
|
@@ -1011,7 +864,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
1011
864
|
|
|
1012
865
|
# Add label_name:value pairs along with a mapping dictionary of all of these pairs
|
|
1013
866
|
label_names = self.label_columns[endpoint_id]
|
|
1014
|
-
label_values = event[EventFieldType.PREDICTION]
|
|
1015
867
|
self._map_dictionary_values(
|
|
1016
868
|
event=event,
|
|
1017
869
|
named_iters=label_names,
|
|
@@ -1102,6 +954,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
1102
954
|
def do(self, event: dict):
|
|
1103
955
|
key_set = set(event.keys())
|
|
1104
956
|
if not key_set.issubset(self.keys):
|
|
957
|
+
import mlrun.utils.v3io_clients
|
|
958
|
+
|
|
1105
959
|
self.keys.update(key_set)
|
|
1106
960
|
# Apply infer_schema on the kv table for generating the schema file
|
|
1107
961
|
mlrun.utils.v3io_clients.get_frames_client(
|
|
@@ -1137,10 +991,10 @@ class EventRouting(mlrun.feature_store.steps.MapClass):
|
|
|
1137
991
|
self.project: str = project
|
|
1138
992
|
|
|
1139
993
|
def do(self, event):
|
|
1140
|
-
if event.path ==
|
|
994
|
+
if event.path == PrometheusEndpoints.MODEL_MONITORING_METRICS:
|
|
1141
995
|
# Return a parsed Prometheus registry file
|
|
1142
996
|
event.body = mlrun.model_monitoring.prometheus.get_registry()
|
|
1143
|
-
elif event.path ==
|
|
997
|
+
elif event.path == PrometheusEndpoints.MONITORING_BATCH_METRICS:
|
|
1144
998
|
# Update statistical metrics
|
|
1145
999
|
for event_metric in event.body:
|
|
1146
1000
|
mlrun.model_monitoring.prometheus.write_drift_metrics(
|
|
@@ -1149,7 +1003,7 @@ class EventRouting(mlrun.feature_store.steps.MapClass):
|
|
|
1149
1003
|
metric=event_metric[EventFieldType.METRIC],
|
|
1150
1004
|
value=event_metric[EventFieldType.VALUE],
|
|
1151
1005
|
)
|
|
1152
|
-
elif event.path ==
|
|
1006
|
+
elif event.path == PrometheusEndpoints.MONITORING_DRIFT_STATUS:
|
|
1153
1007
|
# Update drift status
|
|
1154
1008
|
mlrun.model_monitoring.prometheus.write_drift_status(
|
|
1155
1009
|
project=self.project,
|
|
@@ -1209,7 +1063,7 @@ def update_endpoint_record(
|
|
|
1209
1063
|
endpoint_id: str,
|
|
1210
1064
|
attributes: dict,
|
|
1211
1065
|
):
|
|
1212
|
-
model_endpoint_store = mlrun.model_monitoring.
|
|
1066
|
+
model_endpoint_store = mlrun.model_monitoring.get_store_object(
|
|
1213
1067
|
project=project,
|
|
1214
1068
|
)
|
|
1215
1069
|
|
|
@@ -1218,8 +1072,19 @@ def update_endpoint_record(
|
|
|
1218
1072
|
)
|
|
1219
1073
|
|
|
1220
1074
|
|
|
1221
|
-
def
|
|
1222
|
-
|
|
1223
|
-
|
|
1075
|
+
def update_monitoring_feature_set(
|
|
1076
|
+
endpoint_record: dict[str, typing.Any],
|
|
1077
|
+
feature_names: list[str],
|
|
1078
|
+
feature_values: list[typing.Any],
|
|
1079
|
+
):
|
|
1080
|
+
monitoring_feature_set = fstore.get_feature_set(
|
|
1081
|
+
endpoint_record[
|
|
1082
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_SET_URI
|
|
1083
|
+
]
|
|
1224
1084
|
)
|
|
1225
|
-
|
|
1085
|
+
for name, val in zip(feature_names, feature_values):
|
|
1086
|
+
monitoring_feature_set.add_feature(
|
|
1087
|
+
fstore.Feature(name=name, value_type=type(val))
|
|
1088
|
+
)
|
|
1089
|
+
|
|
1090
|
+
monitoring_feature_set.save()
|
|
@@ -11,8 +11,8 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
#
|
|
15
14
|
|
|
15
|
+
import warnings
|
|
16
16
|
from typing import Union
|
|
17
17
|
|
|
18
18
|
import mlrun.common.schemas.schedule
|
|
@@ -55,6 +55,12 @@ class TrackingPolicy(mlrun.model.ModelObj):
|
|
|
55
55
|
writer function, which is a real time nuclio function, will be deployed
|
|
56
56
|
with the same image. By default, the image is mlrun/mlrun.
|
|
57
57
|
"""
|
|
58
|
+
warnings.warn(
|
|
59
|
+
"The `TrackingPolicy` class is deprecated from version 1.7.0 and is not "
|
|
60
|
+
"used anymore. It will be removed in 1.9.0.",
|
|
61
|
+
FutureWarning,
|
|
62
|
+
)
|
|
63
|
+
|
|
58
64
|
if isinstance(default_batch_intervals, str):
|
|
59
65
|
default_batch_intervals = (
|
|
60
66
|
mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
|