mlrun 1.7.0rc15__py3-none-any.whl → 1.7.0rc17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +10 -1
- mlrun/__main__.py +18 -4
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +144 -0
- mlrun/artifacts/__init__.py +7 -1
- mlrun/artifacts/base.py +28 -3
- mlrun/artifacts/dataset.py +8 -0
- mlrun/artifacts/manager.py +18 -0
- mlrun/artifacts/model.py +8 -1
- mlrun/artifacts/plots.py +13 -0
- mlrun/common/schemas/__init__.py +10 -2
- mlrun/common/schemas/alert.py +64 -5
- mlrun/common/schemas/api_gateway.py +4 -0
- mlrun/common/schemas/artifact.py +15 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -1
- mlrun/common/schemas/model_monitoring/constants.py +17 -1
- mlrun/common/schemas/model_monitoring/model_endpoints.py +60 -1
- mlrun/common/schemas/project.py +5 -1
- mlrun/config.py +11 -4
- mlrun/datastore/datastore_profile.py +10 -7
- mlrun/db/base.py +24 -4
- mlrun/db/httpdb.py +97 -43
- mlrun/db/nopdb.py +25 -4
- mlrun/errors.py +5 -0
- mlrun/launcher/base.py +3 -2
- mlrun/lists.py +4 -0
- mlrun/model.py +15 -8
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/applications/_application_steps.py +1 -2
- mlrun/model_monitoring/applications/context.py +1 -1
- mlrun/model_monitoring/applications/histogram_data_drift.py +64 -38
- mlrun/model_monitoring/db/__init__.py +2 -0
- mlrun/model_monitoring/db/stores/base/store.py +9 -36
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +63 -110
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +56 -202
- mlrun/model_monitoring/db/tsdb/__init__.py +71 -0
- mlrun/model_monitoring/db/tsdb/base.py +135 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +442 -0
- mlrun/model_monitoring/db/v3io_tsdb_reader.py +134 -0
- mlrun/model_monitoring/stream_processing.py +46 -210
- mlrun/model_monitoring/writer.py +50 -100
- mlrun/platforms/__init__.py +10 -9
- mlrun/platforms/iguazio.py +19 -200
- mlrun/projects/operations.py +11 -7
- mlrun/projects/pipelines.py +13 -76
- mlrun/projects/project.py +62 -17
- mlrun/render.py +9 -3
- mlrun/run.py +5 -38
- mlrun/runtimes/__init__.py +1 -0
- mlrun/runtimes/base.py +3 -3
- mlrun/runtimes/kubejob.py +2 -1
- mlrun/runtimes/nuclio/api_gateway.py +163 -77
- mlrun/runtimes/nuclio/application/application.py +160 -7
- mlrun/runtimes/nuclio/function.py +25 -45
- mlrun/runtimes/pod.py +16 -36
- mlrun/runtimes/remotesparkjob.py +1 -1
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +0 -38
- mlrun/track/tracker.py +2 -1
- mlrun/utils/helpers.py +51 -31
- mlrun/utils/logger.py +11 -6
- mlrun/utils/notifications/notification/base.py +1 -1
- mlrun/utils/notifications/notification/slack.py +9 -4
- mlrun/utils/notifications/notification/webhook.py +1 -1
- mlrun/utils/notifications/notification_pusher.py +21 -14
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/METADATA +4 -3
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/RECORD +75 -69
- mlrun/kfpops.py +0 -860
- mlrun/platforms/other.py +0 -305
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/top_level.txt +0 -0
|
@@ -30,7 +30,6 @@ import mlrun.model_monitoring.db
|
|
|
30
30
|
import mlrun.model_monitoring.prometheus
|
|
31
31
|
import mlrun.serving.states
|
|
32
32
|
import mlrun.utils
|
|
33
|
-
import mlrun.utils.v3io_clients
|
|
34
33
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
35
34
|
EventFieldType,
|
|
36
35
|
EventKeyMetrics,
|
|
@@ -78,6 +77,7 @@ class EventStreamProcessor:
|
|
|
78
77
|
)
|
|
79
78
|
|
|
80
79
|
self.storage_options = None
|
|
80
|
+
self.tsdb_configurations = {}
|
|
81
81
|
if not mlrun.mlconf.is_ce_mode():
|
|
82
82
|
self._initialize_v3io_configurations(
|
|
83
83
|
model_monitoring_access_key=model_monitoring_access_key
|
|
@@ -138,29 +138,29 @@ class EventStreamProcessor:
|
|
|
138
138
|
|
|
139
139
|
def apply_monitoring_serving_graph(self, fn: mlrun.runtimes.ServingRuntime) -> None:
|
|
140
140
|
"""
|
|
141
|
-
Apply monitoring serving graph to a given serving function. The following serving graph includes about
|
|
142
|
-
of different operations that are executed on the events from
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
141
|
+
Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
|
|
142
|
+
parts, each of which includes several steps of different operations that are executed on the events from
|
|
143
|
+
the model server.
|
|
144
|
+
Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
|
|
145
|
+
metrics from the model server.
|
|
146
|
+
In the first part, the serving graph processes the event and splits it into sub-events. This part also includes
|
|
147
|
+
validation of the event data and adding important details to the event such as endpoint_id.
|
|
148
|
+
In the next parts, the serving graph stores data to 3 different targets:
|
|
149
|
+
1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
|
|
150
|
+
time per endpoint. For example, the amount of predictions of endpoint x in the last 5 min. The model
|
|
151
|
+
endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
|
|
152
|
+
represents sample statistics from the training data. If the target is of type KV, then the model endpoints
|
|
153
|
+
table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
|
|
154
|
+
SQL, then the table is stored within the database that was defined in the provided connection string.
|
|
155
|
+
2. TSDB: live data of different key metric dictionaries in tsdb target.
|
|
156
|
+
This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
|
|
155
157
|
can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
|
|
156
158
|
3 different key metric dictionaries: base_metrics (average latency and predictions over time),
|
|
157
159
|
endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
|
|
163
|
-
mlrun.mlconf.model_endpoint_monitoring.user_space.
|
|
160
|
+
3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
|
|
161
|
+
the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
|
|
162
|
+
the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
|
|
163
|
+
using CE, the parquet target path is based on the defined MLRun artifact path.
|
|
164
164
|
|
|
165
165
|
:param fn: A serving function.
|
|
166
166
|
"""
|
|
@@ -170,7 +170,7 @@ class EventStreamProcessor:
|
|
|
170
170
|
fn.set_topology(mlrun.serving.states.StepKinds.flow),
|
|
171
171
|
)
|
|
172
172
|
|
|
173
|
-
#
|
|
173
|
+
# Event routing based on the provided path
|
|
174
174
|
def apply_event_routing():
|
|
175
175
|
typing.cast(
|
|
176
176
|
mlrun.serving.TaskStep,
|
|
@@ -183,7 +183,7 @@ class EventStreamProcessor:
|
|
|
183
183
|
|
|
184
184
|
apply_event_routing()
|
|
185
185
|
|
|
186
|
-
#
|
|
186
|
+
# Filter out events with '-' in the path basename from going forward
|
|
187
187
|
# through the next steps of the stream graph
|
|
188
188
|
def apply_storey_filter_stream_events():
|
|
189
189
|
# Filter events with Prometheus endpoints path
|
|
@@ -196,7 +196,7 @@ class EventStreamProcessor:
|
|
|
196
196
|
|
|
197
197
|
apply_storey_filter_stream_events()
|
|
198
198
|
|
|
199
|
-
#
|
|
199
|
+
# Process endpoint event: splitting into sub-events and validate event data
|
|
200
200
|
def apply_process_endpoint_event():
|
|
201
201
|
graph.add_step(
|
|
202
202
|
"ProcessEndpointEvent",
|
|
@@ -207,7 +207,7 @@ class EventStreamProcessor:
|
|
|
207
207
|
|
|
208
208
|
apply_process_endpoint_event()
|
|
209
209
|
|
|
210
|
-
#
|
|
210
|
+
# Applying Storey operations of filtering and flatten
|
|
211
211
|
def apply_storey_filter_and_flatmap():
|
|
212
212
|
# Remove none values from each event
|
|
213
213
|
graph.add_step(
|
|
@@ -224,7 +224,7 @@ class EventStreamProcessor:
|
|
|
224
224
|
|
|
225
225
|
apply_storey_filter_and_flatmap()
|
|
226
226
|
|
|
227
|
-
#
|
|
227
|
+
# Validating feature names and map each feature to its value
|
|
228
228
|
def apply_map_feature_names():
|
|
229
229
|
graph.add_step(
|
|
230
230
|
"MapFeatureNames",
|
|
@@ -236,9 +236,9 @@ class EventStreamProcessor:
|
|
|
236
236
|
|
|
237
237
|
apply_map_feature_names()
|
|
238
238
|
|
|
239
|
-
#
|
|
239
|
+
# Calculate number of predictions and average latency
|
|
240
240
|
def apply_storey_aggregations():
|
|
241
|
-
#
|
|
241
|
+
# Calculate number of predictions for each window (5 min and 1 hour by default)
|
|
242
242
|
graph.add_step(
|
|
243
243
|
class_name="storey.AggregateByKey",
|
|
244
244
|
aggregates=[
|
|
@@ -256,7 +256,7 @@ class EventStreamProcessor:
|
|
|
256
256
|
table=".",
|
|
257
257
|
key_field=EventFieldType.ENDPOINT_ID,
|
|
258
258
|
)
|
|
259
|
-
#
|
|
259
|
+
# Calculate average latency time for each window (5 min and 1 hour by default)
|
|
260
260
|
graph.add_step(
|
|
261
261
|
class_name="storey.Rename",
|
|
262
262
|
mapping={
|
|
@@ -269,8 +269,8 @@ class EventStreamProcessor:
|
|
|
269
269
|
|
|
270
270
|
apply_storey_aggregations()
|
|
271
271
|
|
|
272
|
-
#
|
|
273
|
-
#
|
|
272
|
+
# KV/SQL branch
|
|
273
|
+
# Filter relevant keys from the event before writing the data into the database table
|
|
274
274
|
def apply_process_before_endpoint_update():
|
|
275
275
|
graph.add_step(
|
|
276
276
|
"ProcessBeforeEndpointUpdate",
|
|
@@ -280,7 +280,7 @@ class EventStreamProcessor:
|
|
|
280
280
|
|
|
281
281
|
apply_process_before_endpoint_update()
|
|
282
282
|
|
|
283
|
-
#
|
|
283
|
+
# Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
|
|
284
284
|
# about average latency and the amount of predictions over time
|
|
285
285
|
def apply_update_endpoint():
|
|
286
286
|
graph.add_step(
|
|
@@ -293,7 +293,7 @@ class EventStreamProcessor:
|
|
|
293
293
|
|
|
294
294
|
apply_update_endpoint()
|
|
295
295
|
|
|
296
|
-
#
|
|
296
|
+
# (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
|
|
297
297
|
# which will be used by Grafana monitoring dashboards
|
|
298
298
|
def apply_infer_schema():
|
|
299
299
|
graph.add_step(
|
|
@@ -308,7 +308,7 @@ class EventStreamProcessor:
|
|
|
308
308
|
if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
|
|
309
309
|
apply_infer_schema()
|
|
310
310
|
|
|
311
|
-
#
|
|
311
|
+
# Emits the event in window size of events based on sample_window size (10 by default)
|
|
312
312
|
def apply_storey_sample_window():
|
|
313
313
|
graph.add_step(
|
|
314
314
|
"storey.steps.SampleWindow",
|
|
@@ -320,84 +320,18 @@ class EventStreamProcessor:
|
|
|
320
320
|
|
|
321
321
|
apply_storey_sample_window()
|
|
322
322
|
|
|
323
|
-
#
|
|
324
|
-
# Steps 20-21 - Prometheus branch
|
|
323
|
+
# TSDB branch (skip to Prometheus if in CE env)
|
|
325
324
|
if not mlrun.mlconf.is_ce_mode():
|
|
326
325
|
# TSDB branch
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
# stats and details about the events
|
|
330
|
-
def apply_process_before_tsdb():
|
|
331
|
-
graph.add_step(
|
|
332
|
-
"ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
|
|
333
|
-
)
|
|
334
|
-
|
|
335
|
-
apply_process_before_tsdb()
|
|
336
|
-
|
|
337
|
-
# Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
|
|
338
|
-
def apply_filter_and_unpacked_keys(name, keys):
|
|
339
|
-
graph.add_step(
|
|
340
|
-
"FilterAndUnpackKeys",
|
|
341
|
-
name=name,
|
|
342
|
-
after="ProcessBeforeTSDB",
|
|
343
|
-
keys=[keys],
|
|
344
|
-
)
|
|
345
|
-
|
|
346
|
-
def apply_tsdb_target(name, after):
|
|
347
|
-
graph.add_step(
|
|
348
|
-
"storey.TSDBTarget",
|
|
349
|
-
name=name,
|
|
350
|
-
after=after,
|
|
351
|
-
path=self.tsdb_path,
|
|
352
|
-
rate="10/m",
|
|
353
|
-
time_col=EventFieldType.TIMESTAMP,
|
|
354
|
-
container=self.tsdb_container,
|
|
355
|
-
v3io_frames=self.v3io_framesd,
|
|
356
|
-
infer_columns_from_data=True,
|
|
357
|
-
index_cols=[
|
|
358
|
-
EventFieldType.ENDPOINT_ID,
|
|
359
|
-
EventFieldType.RECORD_TYPE,
|
|
360
|
-
EventFieldType.ENDPOINT_TYPE,
|
|
361
|
-
],
|
|
362
|
-
max_events=self.tsdb_batching_max_events,
|
|
363
|
-
flush_after_seconds=self.tsdb_batching_timeout_secs,
|
|
364
|
-
key=EventFieldType.ENDPOINT_ID,
|
|
365
|
-
)
|
|
366
|
-
|
|
367
|
-
# Steps 13-14 - unpacked base_metrics dictionary
|
|
368
|
-
apply_filter_and_unpacked_keys(
|
|
369
|
-
name="FilterAndUnpackKeys1",
|
|
370
|
-
keys=EventKeyMetrics.BASE_METRICS,
|
|
371
|
-
)
|
|
372
|
-
apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
|
|
373
|
-
|
|
374
|
-
# Steps 15-16 - unpacked endpoint_features dictionary
|
|
375
|
-
apply_filter_and_unpacked_keys(
|
|
376
|
-
name="FilterAndUnpackKeys2",
|
|
377
|
-
keys=EventKeyMetrics.ENDPOINT_FEATURES,
|
|
378
|
-
)
|
|
379
|
-
apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
|
|
380
|
-
|
|
381
|
-
# Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
|
|
382
|
-
apply_filter_and_unpacked_keys(
|
|
383
|
-
name="FilterAndUnpackKeys3",
|
|
384
|
-
keys=EventKeyMetrics.CUSTOM_METRICS,
|
|
326
|
+
tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
327
|
+
project=self.project,
|
|
385
328
|
)
|
|
329
|
+
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
386
330
|
|
|
387
|
-
def apply_storey_filter():
|
|
388
|
-
graph.add_step(
|
|
389
|
-
"storey.Filter",
|
|
390
|
-
"FilterNotNone",
|
|
391
|
-
after="FilterAndUnpackKeys3",
|
|
392
|
-
_fn="(event is not None)",
|
|
393
|
-
)
|
|
394
|
-
|
|
395
|
-
apply_storey_filter()
|
|
396
|
-
apply_tsdb_target(name="tsdb3", after="FilterNotNone")
|
|
397
331
|
else:
|
|
398
|
-
# Prometheus
|
|
332
|
+
# Prometheus
|
|
399
333
|
|
|
400
|
-
#
|
|
334
|
+
# Increase the prediction counter by 1 and update the latency value
|
|
401
335
|
graph.add_step(
|
|
402
336
|
"IncCounter",
|
|
403
337
|
name="IncCounter",
|
|
@@ -405,7 +339,7 @@ class EventStreamProcessor:
|
|
|
405
339
|
project=self.project,
|
|
406
340
|
)
|
|
407
341
|
|
|
408
|
-
#
|
|
342
|
+
# Record a sample of features and labels
|
|
409
343
|
def apply_record_features_to_prometheus():
|
|
410
344
|
graph.add_step(
|
|
411
345
|
"RecordFeatures",
|
|
@@ -416,8 +350,8 @@ class EventStreamProcessor:
|
|
|
416
350
|
|
|
417
351
|
apply_record_features_to_prometheus()
|
|
418
352
|
|
|
419
|
-
#
|
|
420
|
-
#
|
|
353
|
+
# Parquet branch
|
|
354
|
+
# Filter and validate different keys before writing the data to Parquet target
|
|
421
355
|
def apply_process_before_parquet():
|
|
422
356
|
graph.add_step(
|
|
423
357
|
"ProcessBeforeParquet",
|
|
@@ -428,7 +362,7 @@ class EventStreamProcessor:
|
|
|
428
362
|
|
|
429
363
|
apply_process_before_parquet()
|
|
430
364
|
|
|
431
|
-
#
|
|
365
|
+
# Write the Parquet target file, partitioned by key (endpoint_id) and time.
|
|
432
366
|
def apply_parquet_target():
|
|
433
367
|
graph.add_step(
|
|
434
368
|
"storey.ParquetTarget",
|
|
@@ -502,76 +436,6 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
|
|
|
502
436
|
return e
|
|
503
437
|
|
|
504
438
|
|
|
505
|
-
class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
|
|
506
|
-
def __init__(self, **kwargs):
|
|
507
|
-
"""
|
|
508
|
-
Process the data before writing to TSDB. This step creates a dictionary that includes 3 different dictionaries
|
|
509
|
-
that each one of them contains important details and stats about the events:
|
|
510
|
-
1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
|
|
511
|
-
storey.AggregateByKey which was executed in step 5.
|
|
512
|
-
2. endpoint_features: feature names and values along with the prediction names and value.
|
|
513
|
-
3. custom_metric (opt): optional metrics provided by the user.
|
|
514
|
-
|
|
515
|
-
:returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
|
|
516
|
-
|
|
517
|
-
"""
|
|
518
|
-
super().__init__(**kwargs)
|
|
519
|
-
|
|
520
|
-
def do(self, event):
|
|
521
|
-
# Compute prediction per second
|
|
522
|
-
event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
|
|
523
|
-
float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
|
|
524
|
-
)
|
|
525
|
-
base_fields = [
|
|
526
|
-
EventFieldType.TIMESTAMP,
|
|
527
|
-
EventFieldType.ENDPOINT_ID,
|
|
528
|
-
EventFieldType.ENDPOINT_TYPE,
|
|
529
|
-
]
|
|
530
|
-
|
|
531
|
-
# Getting event timestamp and endpoint_id
|
|
532
|
-
base_event = {k: event[k] for k in base_fields}
|
|
533
|
-
|
|
534
|
-
# base_metrics includes the stats about the average latency and the amount of predictions over time
|
|
535
|
-
base_metrics = {
|
|
536
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
|
|
537
|
-
EventLiveStats.PREDICTIONS_PER_SECOND: event[
|
|
538
|
-
EventLiveStats.PREDICTIONS_PER_SECOND
|
|
539
|
-
],
|
|
540
|
-
EventLiveStats.PREDICTIONS_COUNT_5M: event[
|
|
541
|
-
EventLiveStats.PREDICTIONS_COUNT_5M
|
|
542
|
-
],
|
|
543
|
-
EventLiveStats.PREDICTIONS_COUNT_1H: event[
|
|
544
|
-
EventLiveStats.PREDICTIONS_COUNT_1H
|
|
545
|
-
],
|
|
546
|
-
EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
|
|
547
|
-
EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
|
|
548
|
-
**base_event,
|
|
549
|
-
}
|
|
550
|
-
|
|
551
|
-
# endpoint_features includes the event values of each feature and prediction
|
|
552
|
-
endpoint_features = {
|
|
553
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
|
|
554
|
-
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
555
|
-
**event[EventFieldType.NAMED_FEATURES],
|
|
556
|
-
**base_event,
|
|
557
|
-
}
|
|
558
|
-
# Create a dictionary that includes both base_metrics and endpoint_features
|
|
559
|
-
processed = {
|
|
560
|
-
EventKeyMetrics.BASE_METRICS: base_metrics,
|
|
561
|
-
EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
# If metrics provided, add another dictionary if custom_metrics values
|
|
565
|
-
if event[EventFieldType.METRICS]:
|
|
566
|
-
processed[EventKeyMetrics.CUSTOM_METRICS] = {
|
|
567
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
|
|
568
|
-
**event[EventFieldType.METRICS],
|
|
569
|
-
**base_event,
|
|
570
|
-
}
|
|
571
|
-
|
|
572
|
-
return processed
|
|
573
|
-
|
|
574
|
-
|
|
575
439
|
class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
|
|
576
440
|
def __init__(self, **kwargs):
|
|
577
441
|
"""
|
|
@@ -852,36 +716,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
|
|
|
852
716
|
return False
|
|
853
717
|
|
|
854
718
|
|
|
855
|
-
class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
|
|
856
|
-
def __init__(self, keys, **kwargs):
|
|
857
|
-
"""
|
|
858
|
-
Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
|
|
859
|
-
or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
|
|
860
|
-
|
|
861
|
-
:param keys: list of key metrics.
|
|
862
|
-
|
|
863
|
-
:returns: An unpacked dictionary of event filtered by the provided key metrics.
|
|
864
|
-
"""
|
|
865
|
-
super().__init__(**kwargs)
|
|
866
|
-
self.keys = keys
|
|
867
|
-
|
|
868
|
-
def do(self, event):
|
|
869
|
-
# Keep only the relevant dictionary based on the provided keys
|
|
870
|
-
new_event = {}
|
|
871
|
-
for key in self.keys:
|
|
872
|
-
if key in event:
|
|
873
|
-
new_event[key] = event[key]
|
|
874
|
-
|
|
875
|
-
# Create unpacked dictionary
|
|
876
|
-
unpacked = {}
|
|
877
|
-
for key in new_event.keys():
|
|
878
|
-
if key in self.keys:
|
|
879
|
-
unpacked = {**unpacked, **new_event[key]}
|
|
880
|
-
else:
|
|
881
|
-
unpacked[key] = new_event[key]
|
|
882
|
-
return unpacked if unpacked else None
|
|
883
|
-
|
|
884
|
-
|
|
885
719
|
class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
886
720
|
def __init__(
|
|
887
721
|
self,
|
|
@@ -1117,6 +951,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
1117
951
|
def do(self, event: dict):
|
|
1118
952
|
key_set = set(event.keys())
|
|
1119
953
|
if not key_set.issubset(self.keys):
|
|
954
|
+
import mlrun.utils.v3io_clients
|
|
955
|
+
|
|
1120
956
|
self.keys.update(key_set)
|
|
1121
957
|
# Apply infer_schema on the kv table for generating the schema file
|
|
1122
958
|
mlrun.utils.v3io_clients.get_frames_client(
|
mlrun/model_monitoring/writer.py
CHANGED
|
@@ -12,24 +12,16 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
import datetime
|
|
16
15
|
import json
|
|
17
16
|
from typing import Any, NewType
|
|
18
17
|
|
|
19
|
-
import pandas as pd
|
|
20
|
-
from v3io.dataplane import Client as V3IOClient
|
|
21
|
-
from v3io_frames.client import ClientBase as V3IOFramesClient
|
|
22
|
-
from v3io_frames.errors import Error as V3IOFramesError
|
|
23
|
-
from v3io_frames.frames_pb2 import IGNORE
|
|
24
|
-
|
|
25
18
|
import mlrun.common.model_monitoring
|
|
26
19
|
import mlrun.common.schemas
|
|
27
|
-
import mlrun.common.schemas.alert as
|
|
20
|
+
import mlrun.common.schemas.alert as alert_objects
|
|
28
21
|
import mlrun.model_monitoring
|
|
29
|
-
import mlrun.model_monitoring.db.stores
|
|
30
|
-
import mlrun.utils.v3io_clients
|
|
31
22
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
32
23
|
EventFieldType,
|
|
24
|
+
HistogramDataDriftApplicationConstants,
|
|
33
25
|
MetricData,
|
|
34
26
|
ResultData,
|
|
35
27
|
ResultStatusApp,
|
|
@@ -42,9 +34,6 @@ from mlrun.serving.utils import StepToDict
|
|
|
42
34
|
from mlrun.utils import logger
|
|
43
35
|
from mlrun.utils.notifications.notification_pusher import CustomNotificationPusher
|
|
44
36
|
|
|
45
|
-
_TSDB_BE = "tsdb"
|
|
46
|
-
_TSDB_RATE = "1/s"
|
|
47
|
-
_TSDB_TABLE = "app-results"
|
|
48
37
|
_RawEvent = dict[str, Any]
|
|
49
38
|
_AppResultEvent = NewType("_AppResultEvent", _RawEvent)
|
|
50
39
|
|
|
@@ -107,7 +96,7 @@ Extra data: `{self._event[ResultData.RESULT_EXTRA_DATA]}`\
|
|
|
107
96
|
|
|
108
97
|
class ModelMonitoringWriter(StepToDict):
|
|
109
98
|
"""
|
|
110
|
-
Write monitoring
|
|
99
|
+
Write monitoring application results to the target databases
|
|
111
100
|
"""
|
|
112
101
|
|
|
113
102
|
kind = "monitoring_application_stream_pusher"
|
|
@@ -115,102 +104,38 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
115
104
|
def __init__(self, project: str) -> None:
|
|
116
105
|
self.project = project
|
|
117
106
|
self.name = project # required for the deployment process
|
|
118
|
-
|
|
119
|
-
self._tsdb_client = self._get_v3io_frames_client(self._v3io_container)
|
|
107
|
+
|
|
120
108
|
self._custom_notifier = CustomNotificationPusher(
|
|
121
109
|
notification_types=[NotificationKind.slack]
|
|
122
110
|
)
|
|
123
|
-
self._create_tsdb_table()
|
|
124
|
-
self._endpoints_records = {}
|
|
125
|
-
|
|
126
|
-
@staticmethod
|
|
127
|
-
def get_v3io_container(project_name: str) -> str:
|
|
128
|
-
return f"users/pipelines/{project_name}/monitoring-apps"
|
|
129
|
-
|
|
130
|
-
@staticmethod
|
|
131
|
-
def _get_v3io_client() -> V3IOClient:
|
|
132
|
-
return mlrun.utils.v3io_clients.get_v3io_client(
|
|
133
|
-
endpoint=mlrun.mlconf.v3io_api,
|
|
134
|
-
)
|
|
135
|
-
|
|
136
|
-
@staticmethod
|
|
137
|
-
def _get_v3io_frames_client(v3io_container: str) -> V3IOFramesClient:
|
|
138
|
-
return mlrun.utils.v3io_clients.get_frames_client(
|
|
139
|
-
address=mlrun.mlconf.v3io_framesd,
|
|
140
|
-
container=v3io_container,
|
|
141
|
-
)
|
|
142
111
|
|
|
143
|
-
|
|
144
|
-
self._tsdb_client.create(
|
|
145
|
-
backend=_TSDB_BE,
|
|
146
|
-
table=_TSDB_TABLE,
|
|
147
|
-
if_exists=IGNORE,
|
|
148
|
-
rate=_TSDB_RATE,
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
def _update_kv_db(self, event: _AppResultEvent, kind: str = "result") -> None:
|
|
152
|
-
if kind == "metric":
|
|
153
|
-
# TODO : Implement the logic for writing metrics to KV
|
|
154
|
-
return
|
|
155
|
-
event = _AppResultEvent(event.copy())
|
|
156
|
-
application_result_store = mlrun.model_monitoring.get_store_object(
|
|
112
|
+
self._app_result_store = mlrun.model_monitoring.get_store_object(
|
|
157
113
|
project=self.project
|
|
158
114
|
)
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def _update_tsdb(self, event: _AppResultEvent, kind: str = "result") -> None:
|
|
162
|
-
if kind == "metric":
|
|
163
|
-
# TODO : Implement the logic for writing metrics to TSDB
|
|
164
|
-
return
|
|
165
|
-
event = _AppResultEvent(event.copy())
|
|
166
|
-
event[WriterEvent.END_INFER_TIME] = datetime.datetime.fromisoformat(
|
|
167
|
-
event[WriterEvent.END_INFER_TIME]
|
|
115
|
+
self._tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
116
|
+
project=self.project,
|
|
168
117
|
)
|
|
169
|
-
|
|
170
|
-
try:
|
|
171
|
-
self._tsdb_client.write(
|
|
172
|
-
backend=_TSDB_BE,
|
|
173
|
-
table=_TSDB_TABLE,
|
|
174
|
-
dfs=pd.DataFrame.from_records([event]),
|
|
175
|
-
index_cols=[
|
|
176
|
-
WriterEvent.END_INFER_TIME,
|
|
177
|
-
WriterEvent.ENDPOINT_ID,
|
|
178
|
-
WriterEvent.APPLICATION_NAME,
|
|
179
|
-
ResultData.RESULT_NAME,
|
|
180
|
-
],
|
|
181
|
-
)
|
|
182
|
-
logger.info("Updated V3IO TSDB successfully", table=_TSDB_TABLE)
|
|
183
|
-
except V3IOFramesError as err:
|
|
184
|
-
logger.warn(
|
|
185
|
-
"Could not write drift measures to TSDB",
|
|
186
|
-
err=err,
|
|
187
|
-
table=_TSDB_TABLE,
|
|
188
|
-
event=event,
|
|
189
|
-
)
|
|
118
|
+
self._endpoints_records = {}
|
|
190
119
|
|
|
191
120
|
@staticmethod
|
|
192
121
|
def _generate_event_on_drift(
|
|
193
122
|
model_endpoint: str, drift_status: str, event_value: dict, project_name: str
|
|
194
123
|
) -> None:
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
event_data = mlrun.common.schemas.Event(
|
|
211
|
-
kind=event_kind, entity=entity, value_dict=event_value
|
|
212
|
-
)
|
|
213
|
-
mlrun.get_run_db().generate_event(event_kind, event_data)
|
|
124
|
+
logger.info("Sending an alert")
|
|
125
|
+
entity = mlrun.common.schemas.alert.EventEntities(
|
|
126
|
+
kind=alert_objects.EventEntityKind.MODEL,
|
|
127
|
+
project=project_name,
|
|
128
|
+
ids=[model_endpoint],
|
|
129
|
+
)
|
|
130
|
+
event_kind = (
|
|
131
|
+
alert_objects.EventKind.DRIFT_DETECTED
|
|
132
|
+
if drift_status == ResultStatusApp.detected.value
|
|
133
|
+
else alert_objects.EventKind.DRIFT_SUSPECTED
|
|
134
|
+
)
|
|
135
|
+
event_data = mlrun.common.schemas.Event(
|
|
136
|
+
kind=event_kind, entity=entity, value_dict=event_value
|
|
137
|
+
)
|
|
138
|
+
mlrun.get_run_db().generate_event(event_kind, event_data)
|
|
214
139
|
|
|
215
140
|
@staticmethod
|
|
216
141
|
def _reconstruct_event(event: _RawEvent) -> tuple[_AppResultEvent, str]:
|
|
@@ -255,14 +180,20 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
255
180
|
event, kind = self._reconstruct_event(event)
|
|
256
181
|
logger.info("Starting to write event", event=event)
|
|
257
182
|
|
|
258
|
-
self.
|
|
259
|
-
self.
|
|
183
|
+
self._tsdb_connector.write_application_event(event=event.copy(), kind=kind)
|
|
184
|
+
self._app_result_store.write_application_event(event=event.copy(), kind=kind)
|
|
260
185
|
logger.info("Completed event DB writes")
|
|
186
|
+
|
|
261
187
|
_Notifier(event=event, notification_pusher=self._custom_notifier).notify()
|
|
262
188
|
|
|
263
189
|
if (
|
|
264
190
|
mlrun.mlconf.alerts.mode == mlrun.common.schemas.alert.AlertsModes.enabled
|
|
265
191
|
and kind == WriterEventKind.RESULT
|
|
192
|
+
and (
|
|
193
|
+
event[ResultData.RESULT_STATUS] == ResultStatusApp.detected.value
|
|
194
|
+
or event[ResultData.RESULT_STATUS]
|
|
195
|
+
== ResultStatusApp.potential_detection.value
|
|
196
|
+
)
|
|
266
197
|
):
|
|
267
198
|
endpoint_id = event[WriterEvent.ENDPOINT_ID]
|
|
268
199
|
endpoint_record = self._endpoints_records.setdefault(
|
|
@@ -282,3 +213,22 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
282
213
|
event_value,
|
|
283
214
|
self.project,
|
|
284
215
|
)
|
|
216
|
+
|
|
217
|
+
if (
|
|
218
|
+
kind == WriterEventKind.RESULT
|
|
219
|
+
and event[WriterEvent.APPLICATION_NAME]
|
|
220
|
+
== HistogramDataDriftApplicationConstants.NAME
|
|
221
|
+
and event[ResultData.RESULT_NAME]
|
|
222
|
+
== HistogramDataDriftApplicationConstants.GENERAL_RESULT_NAME
|
|
223
|
+
):
|
|
224
|
+
endpoint_id = event[WriterEvent.ENDPOINT_ID]
|
|
225
|
+
logger.info(
|
|
226
|
+
"Updating the model endpoint with metadata specific to the histogram "
|
|
227
|
+
"data drift app",
|
|
228
|
+
endpoint_id=endpoint_id,
|
|
229
|
+
)
|
|
230
|
+
store = mlrun.model_monitoring.get_store_object(project=self.project)
|
|
231
|
+
store.update_model_endpoint(
|
|
232
|
+
endpoint_id=endpoint_id,
|
|
233
|
+
attributes=json.loads(event[ResultData.RESULT_EXTRA_DATA]),
|
|
234
|
+
)
|
mlrun/platforms/__init__.py
CHANGED
|
@@ -17,22 +17,23 @@ import json
|
|
|
17
17
|
from pprint import pprint
|
|
18
18
|
from time import sleep
|
|
19
19
|
|
|
20
|
-
from .
|
|
21
|
-
|
|
22
|
-
VolumeMount,
|
|
23
|
-
add_or_refresh_credentials,
|
|
24
|
-
is_iguazio_session_cookie,
|
|
25
|
-
mount_v3io,
|
|
26
|
-
v3io_cred,
|
|
27
|
-
)
|
|
28
|
-
from .other import (
|
|
20
|
+
from mlrun_pipelines.common.mounts import VolumeMount
|
|
21
|
+
from mlrun_pipelines.mounts import (
|
|
29
22
|
auto_mount,
|
|
30
23
|
mount_configmap,
|
|
31
24
|
mount_hostpath,
|
|
32
25
|
mount_pvc,
|
|
33
26
|
mount_s3,
|
|
34
27
|
mount_secret,
|
|
28
|
+
mount_v3io,
|
|
35
29
|
set_env_variables,
|
|
30
|
+
v3io_cred,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
from .iguazio import (
|
|
34
|
+
V3ioStreamClient,
|
|
35
|
+
add_or_refresh_credentials,
|
|
36
|
+
is_iguazio_session_cookie,
|
|
36
37
|
)
|
|
37
38
|
|
|
38
39
|
|