mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__main__.py +4 -2
- mlrun/alerts/alert.py +75 -8
- mlrun/artifacts/base.py +1 -0
- mlrun/artifacts/manager.py +9 -2
- mlrun/common/constants.py +4 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
- mlrun/common/formatters/run.py +3 -0
- mlrun/common/helpers.py +0 -1
- mlrun/common/schemas/__init__.py +3 -1
- mlrun/common/schemas/alert.py +15 -12
- mlrun/common/schemas/api_gateway.py +6 -6
- mlrun/common/schemas/auth.py +5 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/frontend_spec.py +7 -0
- mlrun/common/schemas/function.py +7 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +41 -26
- mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
- mlrun/common/schemas/notification.py +69 -12
- mlrun/common/schemas/project.py +45 -12
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +1 -0
- mlrun/config.py +91 -35
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +57 -25
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/alibaba_oss.py +3 -2
- mlrun/datastore/azure_blob.py +125 -37
- mlrun/datastore/base.py +42 -21
- mlrun/datastore/datastore.py +4 -2
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +85 -29
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +1 -0
- mlrun/datastore/s3.py +25 -12
- mlrun/datastore/sources.py +76 -4
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +102 -131
- mlrun/datastore/v3io.py +1 -0
- mlrun/db/base.py +15 -6
- mlrun/db/httpdb.py +57 -28
- mlrun/db/nopdb.py +29 -5
- mlrun/errors.py +20 -3
- mlrun/execution.py +46 -5
- mlrun/feature_store/api.py +25 -1
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_vector.py +3 -1
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/spark_merger.py +10 -39
- mlrun/feature_store/steps.py +8 -0
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -3
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/k8s_utils.py +48 -2
- mlrun/launcher/client.py +6 -6
- mlrun/launcher/local.py +2 -2
- mlrun/model.py +215 -34
- mlrun/model_monitoring/api.py +38 -24
- mlrun/model_monitoring/applications/__init__.py +1 -2
- mlrun/model_monitoring/applications/_application_steps.py +60 -29
- mlrun/model_monitoring/applications/base.py +2 -174
- mlrun/model_monitoring/applications/context.py +197 -70
- mlrun/model_monitoring/applications/evidently_base.py +11 -85
- mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
- mlrun/model_monitoring/applications/results.py +4 -4
- mlrun/model_monitoring/controller.py +110 -282
- mlrun/model_monitoring/db/stores/__init__.py +8 -3
- mlrun/model_monitoring/db/stores/base/store.py +3 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
- mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
- mlrun/model_monitoring/db/tsdb/base.py +147 -15
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
- mlrun/model_monitoring/helpers.py +70 -50
- mlrun/model_monitoring/stream_processing.py +96 -195
- mlrun/model_monitoring/writer.py +13 -5
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/projects/operations.py +16 -8
- mlrun/projects/pipelines.py +126 -115
- mlrun/projects/project.py +286 -129
- mlrun/render.py +3 -3
- mlrun/run.py +38 -19
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/daskjob.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -1
- mlrun/runtimes/kubejob.py +6 -6
- mlrun/runtimes/local.py +12 -5
- mlrun/runtimes/nuclio/api_gateway.py +68 -8
- mlrun/runtimes/nuclio/application/application.py +307 -70
- mlrun/runtimes/nuclio/function.py +63 -14
- mlrun/runtimes/nuclio/serving.py +10 -10
- mlrun/runtimes/pod.py +25 -19
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +16 -17
- mlrun/runtimes/utils.py +34 -0
- mlrun/serving/routers.py +2 -5
- mlrun/serving/server.py +37 -19
- mlrun/serving/states.py +30 -3
- mlrun/serving/v2_serving.py +44 -35
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +150 -36
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/__init__.py +0 -1
- mlrun/utils/notifications/notification/webhook.py +8 -1
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/v3io_clients.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/evidently_application.py +0 -20
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
|
@@ -27,7 +27,6 @@ import mlrun.datastore.targets
|
|
|
27
27
|
import mlrun.feature_store as fstore
|
|
28
28
|
import mlrun.feature_store.steps
|
|
29
29
|
import mlrun.model_monitoring.db
|
|
30
|
-
import mlrun.model_monitoring.prometheus
|
|
31
30
|
import mlrun.serving.states
|
|
32
31
|
import mlrun.utils
|
|
33
32
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
@@ -37,8 +36,8 @@ from mlrun.common.schemas.model_monitoring.constants import (
|
|
|
37
36
|
FileTargetKind,
|
|
38
37
|
ModelEndpointTarget,
|
|
39
38
|
ProjectSecretKeys,
|
|
40
|
-
PrometheusEndpoints,
|
|
41
39
|
)
|
|
40
|
+
from mlrun.model_monitoring.db import StoreBase, TSDBConnector
|
|
42
41
|
from mlrun.utils import logger
|
|
43
42
|
|
|
44
43
|
|
|
@@ -50,14 +49,12 @@ class EventStreamProcessor:
|
|
|
50
49
|
parquet_batching_max_events: int,
|
|
51
50
|
parquet_batching_timeout_secs: int,
|
|
52
51
|
parquet_target: str,
|
|
53
|
-
sample_window: int = 10,
|
|
54
52
|
aggregate_windows: typing.Optional[list[str]] = None,
|
|
55
|
-
aggregate_period: str = "
|
|
53
|
+
aggregate_period: str = "5m",
|
|
56
54
|
model_monitoring_access_key: str = None,
|
|
57
55
|
):
|
|
58
56
|
# General configurations, mainly used for the storey steps in the future serving graph
|
|
59
57
|
self.project = project
|
|
60
|
-
self.sample_window = sample_window
|
|
61
58
|
self.aggregate_windows = aggregate_windows or ["5m", "1h"]
|
|
62
59
|
self.aggregate_period = aggregate_period
|
|
63
60
|
|
|
@@ -135,7 +132,8 @@ class EventStreamProcessor:
|
|
|
135
132
|
def apply_monitoring_serving_graph(
|
|
136
133
|
self,
|
|
137
134
|
fn: mlrun.runtimes.ServingRuntime,
|
|
138
|
-
|
|
135
|
+
tsdb_connector: TSDBConnector,
|
|
136
|
+
endpoint_store: StoreBase,
|
|
139
137
|
) -> None:
|
|
140
138
|
"""
|
|
141
139
|
Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
|
|
@@ -163,48 +161,46 @@ class EventStreamProcessor:
|
|
|
163
161
|
using CE, the parquet target path is based on the defined MLRun artifact path.
|
|
164
162
|
|
|
165
163
|
:param fn: A serving function.
|
|
166
|
-
:param
|
|
167
|
-
|
|
164
|
+
:param tsdb_connector: Time series database connector.
|
|
165
|
+
:param endpoint_store: KV/SQL store used for endpoint data.
|
|
168
166
|
"""
|
|
169
167
|
|
|
170
168
|
graph = typing.cast(
|
|
171
169
|
mlrun.serving.states.RootFlowStep,
|
|
172
170
|
fn.set_topology(mlrun.serving.states.StepKinds.flow),
|
|
173
171
|
)
|
|
172
|
+
graph.add_step(
|
|
173
|
+
"ExtractEndpointID",
|
|
174
|
+
"extract_endpoint",
|
|
175
|
+
full_event=True,
|
|
176
|
+
)
|
|
174
177
|
|
|
175
|
-
#
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
project=self.project,
|
|
183
|
-
),
|
|
184
|
-
).respond()
|
|
185
|
-
|
|
186
|
-
apply_event_routing()
|
|
178
|
+
# split the graph between event with error vs valid event
|
|
179
|
+
graph.add_step(
|
|
180
|
+
"storey.Filter",
|
|
181
|
+
"FilterError",
|
|
182
|
+
after="extract_endpoint",
|
|
183
|
+
_fn="(event.get('error') is None)",
|
|
184
|
+
)
|
|
187
185
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
"filter_stream_event",
|
|
195
|
-
_fn=f"(event.path not in {PrometheusEndpoints.list()})",
|
|
196
|
-
full_event=True,
|
|
197
|
-
)
|
|
186
|
+
graph.add_step(
|
|
187
|
+
"storey.Filter",
|
|
188
|
+
"ForwardError",
|
|
189
|
+
after="extract_endpoint",
|
|
190
|
+
_fn="(event.get('error') is not None)",
|
|
191
|
+
)
|
|
198
192
|
|
|
199
|
-
|
|
193
|
+
tsdb_connector.handle_model_error(
|
|
194
|
+
graph,
|
|
195
|
+
)
|
|
200
196
|
|
|
201
197
|
# Process endpoint event: splitting into sub-events and validate event data
|
|
202
198
|
def apply_process_endpoint_event():
|
|
203
199
|
graph.add_step(
|
|
204
200
|
"ProcessEndpointEvent",
|
|
201
|
+
after="extract_endpoint", # TODO: change this to FilterError in ML-7456
|
|
205
202
|
full_event=True,
|
|
206
203
|
project=self.project,
|
|
207
|
-
after="filter_stream_event",
|
|
208
204
|
)
|
|
209
205
|
|
|
210
206
|
apply_process_endpoint_event()
|
|
@@ -306,51 +302,10 @@ class EventStreamProcessor:
|
|
|
306
302
|
table=self.kv_path,
|
|
307
303
|
)
|
|
308
304
|
|
|
309
|
-
|
|
310
|
-
project=self.project, secret_provider=secret_provider
|
|
311
|
-
)
|
|
312
|
-
if store_object.type == ModelEndpointTarget.V3IO_NOSQL:
|
|
305
|
+
if endpoint_store.type == ModelEndpointTarget.V3IO_NOSQL:
|
|
313
306
|
apply_infer_schema()
|
|
314
307
|
|
|
315
|
-
|
|
316
|
-
def apply_storey_sample_window():
|
|
317
|
-
graph.add_step(
|
|
318
|
-
"storey.steps.SampleWindow",
|
|
319
|
-
name="sample",
|
|
320
|
-
after="Rename",
|
|
321
|
-
window_size=self.sample_window,
|
|
322
|
-
key=EventFieldType.ENDPOINT_ID,
|
|
323
|
-
)
|
|
324
|
-
|
|
325
|
-
apply_storey_sample_window()
|
|
326
|
-
|
|
327
|
-
# TSDB branch (skip to Prometheus if in CE env)
|
|
328
|
-
if not mlrun.mlconf.is_ce_mode():
|
|
329
|
-
tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
330
|
-
project=self.project, secret_provider=secret_provider
|
|
331
|
-
)
|
|
332
|
-
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
333
|
-
|
|
334
|
-
else:
|
|
335
|
-
# Prometheus
|
|
336
|
-
# Increase the prediction counter by 1 and update the latency value
|
|
337
|
-
graph.add_step(
|
|
338
|
-
"IncCounter",
|
|
339
|
-
name="IncCounter",
|
|
340
|
-
after="MapFeatureNames",
|
|
341
|
-
project=self.project,
|
|
342
|
-
)
|
|
343
|
-
|
|
344
|
-
# Record a sample of features and labels
|
|
345
|
-
def apply_record_features_to_prometheus():
|
|
346
|
-
graph.add_step(
|
|
347
|
-
"RecordFeatures",
|
|
348
|
-
name="RecordFeaturesToPrometheus",
|
|
349
|
-
after="sample",
|
|
350
|
-
project=self.project,
|
|
351
|
-
)
|
|
352
|
-
|
|
353
|
-
apply_record_features_to_prometheus()
|
|
308
|
+
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
354
309
|
|
|
355
310
|
# Parquet branch
|
|
356
311
|
# Filter and validate different keys before writing the data to Parquet target
|
|
@@ -379,6 +334,7 @@ class EventStreamProcessor:
|
|
|
379
334
|
index_cols=[EventFieldType.ENDPOINT_ID],
|
|
380
335
|
key_bucketing_number=0,
|
|
381
336
|
time_partitioning_granularity="hour",
|
|
337
|
+
time_field=EventFieldType.TIMESTAMP,
|
|
382
338
|
partition_cols=["$key", "$year", "$month", "$day", "$hour"],
|
|
383
339
|
)
|
|
384
340
|
|
|
@@ -438,6 +394,38 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
|
|
|
438
394
|
return e
|
|
439
395
|
|
|
440
396
|
|
|
397
|
+
class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
|
|
398
|
+
def __init__(self, **kwargs) -> None:
|
|
399
|
+
"""
|
|
400
|
+
Generate the model endpoint ID based on the event parameters and attach it to the event.
|
|
401
|
+
"""
|
|
402
|
+
super().__init__(**kwargs)
|
|
403
|
+
|
|
404
|
+
def do(self, full_event) -> typing.Union[storey.Event, None]:
|
|
405
|
+
# Getting model version and function uri from event
|
|
406
|
+
# and use them for retrieving the endpoint_id
|
|
407
|
+
function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
|
|
408
|
+
if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
|
|
409
|
+
return None
|
|
410
|
+
|
|
411
|
+
model = full_event.body.get(EventFieldType.MODEL)
|
|
412
|
+
if not is_not_none(model, [EventFieldType.MODEL]):
|
|
413
|
+
return None
|
|
414
|
+
|
|
415
|
+
version = full_event.body.get(EventFieldType.VERSION)
|
|
416
|
+
versioned_model = f"{model}:{version}" if version else f"{model}:latest"
|
|
417
|
+
|
|
418
|
+
endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
|
|
419
|
+
function_uri=function_uri,
|
|
420
|
+
versioned_model=versioned_model,
|
|
421
|
+
)
|
|
422
|
+
|
|
423
|
+
endpoint_id = str(endpoint_id)
|
|
424
|
+
full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
|
|
425
|
+
full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
|
|
426
|
+
return full_event
|
|
427
|
+
|
|
428
|
+
|
|
441
429
|
class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
|
|
442
430
|
def __init__(self, **kwargs):
|
|
443
431
|
"""
|
|
@@ -511,28 +499,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
511
499
|
def do(self, full_event):
|
|
512
500
|
event = full_event.body
|
|
513
501
|
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
function_uri = event
|
|
517
|
-
if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
|
|
518
|
-
return None
|
|
519
|
-
|
|
520
|
-
model = event.get(EventFieldType.MODEL)
|
|
521
|
-
if not is_not_none(model, [EventFieldType.MODEL]):
|
|
522
|
-
return None
|
|
523
|
-
|
|
524
|
-
version = event.get(EventFieldType.VERSION)
|
|
525
|
-
versioned_model = f"{model}:{version}" if version else f"{model}:latest"
|
|
526
|
-
|
|
527
|
-
endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
|
|
528
|
-
function_uri=function_uri,
|
|
529
|
-
versioned_model=versioned_model,
|
|
530
|
-
)
|
|
531
|
-
|
|
532
|
-
endpoint_id = str(endpoint_id)
|
|
533
|
-
|
|
534
|
-
event[EventFieldType.VERSIONED_MODEL] = versioned_model
|
|
535
|
-
event[EventFieldType.ENDPOINT_ID] = endpoint_id
|
|
502
|
+
versioned_model = event[EventFieldType.VERSIONED_MODEL]
|
|
503
|
+
endpoint_id = event[EventFieldType.ENDPOINT_ID]
|
|
504
|
+
function_uri = event[EventFieldType.FUNCTION_URI]
|
|
536
505
|
|
|
537
506
|
# In case this process fails, resume state from existing record
|
|
538
507
|
self.resume_state(endpoint_id)
|
|
@@ -540,13 +509,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
540
509
|
# If error key has been found in the current event,
|
|
541
510
|
# increase the error counter by 1 and raise the error description
|
|
542
511
|
error = event.get("error")
|
|
543
|
-
if error:
|
|
512
|
+
if error: # TODO: delete this in ML-7456
|
|
544
513
|
self.error_count[endpoint_id] += 1
|
|
545
|
-
mlrun.model_monitoring.prometheus.write_errors(
|
|
546
|
-
project=self.project,
|
|
547
|
-
endpoint_id=event["endpoint_id"],
|
|
548
|
-
model_name=event["model"],
|
|
549
|
-
)
|
|
550
514
|
raise mlrun.errors.MLRunInvalidArgumentError(str(error))
|
|
551
515
|
|
|
552
516
|
# Validate event fields
|
|
@@ -613,6 +577,26 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
613
577
|
|
|
614
578
|
# Separate each model invocation into sub events that will be stored as dictionary
|
|
615
579
|
# in list of events. This list will be used as the body for the storey event.
|
|
580
|
+
if not isinstance(features, list):
|
|
581
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
582
|
+
"Model's inputs must be a list"
|
|
583
|
+
)
|
|
584
|
+
features = (
|
|
585
|
+
features
|
|
586
|
+
if not any(not isinstance(feat, list) for feat in features)
|
|
587
|
+
else [features]
|
|
588
|
+
)
|
|
589
|
+
if not isinstance(predictions, list):
|
|
590
|
+
predictions = [[predictions]]
|
|
591
|
+
elif isinstance(predictions, list) and len(predictions) == len(features):
|
|
592
|
+
pass # predictions are already in the right format
|
|
593
|
+
else:
|
|
594
|
+
predictions = (
|
|
595
|
+
predictions
|
|
596
|
+
if not any(not isinstance(pred, list) for pred in predictions)
|
|
597
|
+
else [predictions]
|
|
598
|
+
)
|
|
599
|
+
|
|
616
600
|
events = []
|
|
617
601
|
for i, (feature, prediction) in enumerate(zip(features, predictions)):
|
|
618
602
|
if not isinstance(prediction, list):
|
|
@@ -634,6 +618,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
634
618
|
EventFieldType.PREDICTION: prediction,
|
|
635
619
|
EventFieldType.FIRST_REQUEST: self.first_request[endpoint_id],
|
|
636
620
|
EventFieldType.LAST_REQUEST: self.last_request[endpoint_id],
|
|
621
|
+
EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
|
|
622
|
+
self.last_request[endpoint_id]
|
|
623
|
+
).timestamp(),
|
|
637
624
|
EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
|
|
638
625
|
EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
|
|
639
626
|
EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
|
|
@@ -773,6 +760,12 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
773
760
|
|
|
774
761
|
feature_values = event[EventFieldType.FEATURES]
|
|
775
762
|
label_values = event[EventFieldType.PREDICTION]
|
|
763
|
+
|
|
764
|
+
for index in range(len(feature_values)):
|
|
765
|
+
feature_value = feature_values[index]
|
|
766
|
+
if isinstance(feature_value, int):
|
|
767
|
+
feature_values[index] = float(feature_value)
|
|
768
|
+
|
|
776
769
|
# Get feature names and label columns
|
|
777
770
|
if endpoint_id not in self.feature_names:
|
|
778
771
|
endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
|
|
@@ -967,98 +960,6 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
967
960
|
return event
|
|
968
961
|
|
|
969
962
|
|
|
970
|
-
class EventRouting(mlrun.feature_store.steps.MapClass):
|
|
971
|
-
"""
|
|
972
|
-
Router the event according to the configured path under event.path. Please note that this step returns the result
|
|
973
|
-
to the caller. At the moment there are several paths:
|
|
974
|
-
|
|
975
|
-
- /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
|
|
976
|
-
to scrape the results from the monitoring stream memory.
|
|
977
|
-
|
|
978
|
-
- /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
|
|
979
|
-
statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
|
|
980
|
-
metrics.
|
|
981
|
-
|
|
982
|
-
- /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
|
|
983
|
-
|
|
984
|
-
"""
|
|
985
|
-
|
|
986
|
-
def __init__(
|
|
987
|
-
self,
|
|
988
|
-
project: str,
|
|
989
|
-
**kwargs,
|
|
990
|
-
):
|
|
991
|
-
super().__init__(**kwargs)
|
|
992
|
-
self.project: str = project
|
|
993
|
-
|
|
994
|
-
def do(self, event):
|
|
995
|
-
if event.path == PrometheusEndpoints.MODEL_MONITORING_METRICS:
|
|
996
|
-
# Return a parsed Prometheus registry file
|
|
997
|
-
event.body = mlrun.model_monitoring.prometheus.get_registry()
|
|
998
|
-
elif event.path == PrometheusEndpoints.MONITORING_BATCH_METRICS:
|
|
999
|
-
# Update statistical metrics
|
|
1000
|
-
for event_metric in event.body:
|
|
1001
|
-
mlrun.model_monitoring.prometheus.write_drift_metrics(
|
|
1002
|
-
project=self.project,
|
|
1003
|
-
endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
|
|
1004
|
-
metric=event_metric[EventFieldType.METRIC],
|
|
1005
|
-
value=event_metric[EventFieldType.VALUE],
|
|
1006
|
-
)
|
|
1007
|
-
elif event.path == PrometheusEndpoints.MONITORING_DRIFT_STATUS:
|
|
1008
|
-
# Update drift status
|
|
1009
|
-
mlrun.model_monitoring.prometheus.write_drift_status(
|
|
1010
|
-
project=self.project,
|
|
1011
|
-
endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
|
|
1012
|
-
drift_status=event.body[EventFieldType.DRIFT_STATUS],
|
|
1013
|
-
)
|
|
1014
|
-
|
|
1015
|
-
return event
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
class IncCounter(mlrun.feature_store.steps.MapClass):
|
|
1019
|
-
"""Increase prediction counter by 1 and update the total latency value"""
|
|
1020
|
-
|
|
1021
|
-
def __init__(self, project: str, **kwargs):
|
|
1022
|
-
super().__init__(**kwargs)
|
|
1023
|
-
self.project: str = project
|
|
1024
|
-
|
|
1025
|
-
def do(self, event):
|
|
1026
|
-
# Compute prediction per second
|
|
1027
|
-
|
|
1028
|
-
mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
|
|
1029
|
-
project=self.project,
|
|
1030
|
-
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1031
|
-
latency=event[EventFieldType.LATENCY],
|
|
1032
|
-
model_name=event[EventFieldType.MODEL],
|
|
1033
|
-
endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
|
|
1034
|
-
)
|
|
1035
|
-
|
|
1036
|
-
return event
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
class RecordFeatures(mlrun.feature_store.steps.MapClass):
|
|
1040
|
-
"""Record a sample of features and labels in Prometheus registry"""
|
|
1041
|
-
|
|
1042
|
-
def __init__(self, project: str, **kwargs):
|
|
1043
|
-
super().__init__(**kwargs)
|
|
1044
|
-
self.project: str = project
|
|
1045
|
-
|
|
1046
|
-
def do(self, event):
|
|
1047
|
-
# Generate a dictionary of features and predictions
|
|
1048
|
-
features = {
|
|
1049
|
-
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
1050
|
-
**event[EventFieldType.NAMED_FEATURES],
|
|
1051
|
-
}
|
|
1052
|
-
|
|
1053
|
-
mlrun.model_monitoring.prometheus.write_income_features(
|
|
1054
|
-
project=self.project,
|
|
1055
|
-
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1056
|
-
features=features,
|
|
1057
|
-
)
|
|
1058
|
-
|
|
1059
|
-
return event
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
963
|
def update_endpoint_record(
|
|
1063
964
|
project: str,
|
|
1064
965
|
endpoint_id: str,
|
mlrun/model_monitoring/writer.py
CHANGED
|
@@ -130,7 +130,6 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
130
130
|
project_name: str,
|
|
131
131
|
result_kind: int,
|
|
132
132
|
) -> None:
|
|
133
|
-
logger.info("Sending an event")
|
|
134
133
|
entity = mlrun.common.schemas.alert.EventEntities(
|
|
135
134
|
kind=alert_objects.EventEntityKind.MODEL_ENDPOINT_RESULT,
|
|
136
135
|
project=project_name,
|
|
@@ -146,7 +145,9 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
146
145
|
entity=entity,
|
|
147
146
|
value_dict=event_value,
|
|
148
147
|
)
|
|
148
|
+
logger.info("Sending a drift event")
|
|
149
149
|
mlrun.get_run_db().generate_event(event_kind, event_data)
|
|
150
|
+
logger.info("Drift event sent successfully")
|
|
150
151
|
|
|
151
152
|
@staticmethod
|
|
152
153
|
def _generate_alert_event_kind(
|
|
@@ -159,7 +160,9 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
159
160
|
event_kind = f"{event_kind}_detected"
|
|
160
161
|
else:
|
|
161
162
|
event_kind = f"{event_kind}_suspected"
|
|
162
|
-
return alert_objects.EventKind(
|
|
163
|
+
return alert_objects.EventKind(
|
|
164
|
+
value=mlrun.utils.helpers.normalize_name(event_kind)
|
|
165
|
+
)
|
|
163
166
|
|
|
164
167
|
@staticmethod
|
|
165
168
|
def _reconstruct_event(event: _RawEvent) -> tuple[_AppResultEvent, WriterEventKind]:
|
|
@@ -257,8 +260,13 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
257
260
|
"data drift app",
|
|
258
261
|
endpoint_id=endpoint_id,
|
|
259
262
|
)
|
|
260
|
-
|
|
261
|
-
|
|
263
|
+
attributes = json.loads(event[ResultData.RESULT_EXTRA_DATA])
|
|
264
|
+
attributes[EventFieldType.DRIFT_STATUS] = str(
|
|
265
|
+
attributes[EventFieldType.DRIFT_STATUS]
|
|
266
|
+
)
|
|
267
|
+
self._app_result_store.update_model_endpoint(
|
|
262
268
|
endpoint_id=endpoint_id,
|
|
263
|
-
attributes=
|
|
269
|
+
attributes=attributes,
|
|
264
270
|
)
|
|
271
|
+
|
|
272
|
+
logger.info("Model monitoring writer finished handling event")
|
|
@@ -34,7 +34,7 @@ class _DefaultPackagerMeta(ABCMeta):
|
|
|
34
34
|
dynamically generated docstring that will include a summary of the packager.
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
|
-
def __new__(
|
|
37
|
+
def __new__(cls, name: str, bases: tuple, namespace: dict, **kwargs):
|
|
38
38
|
"""
|
|
39
39
|
Create a new DefaultPackager metaclass that saves the original packager docstring to another attribute named
|
|
40
40
|
`_packager_doc`.
|
|
@@ -48,7 +48,7 @@ class _DefaultPackagerMeta(ABCMeta):
|
|
|
48
48
|
namespace["_packager_doc"] = namespace.get("__doc__", "")
|
|
49
49
|
|
|
50
50
|
# Continue creating the metaclass:
|
|
51
|
-
return super().__new__(
|
|
51
|
+
return super().__new__(cls, name, bases, namespace, **kwargs)
|
|
52
52
|
|
|
53
53
|
@property
|
|
54
54
|
def __doc__(cls: type["DefaultPackager"]) -> str:
|
mlrun/projects/operations.py
CHANGED
|
@@ -15,10 +15,13 @@
|
|
|
15
15
|
import warnings
|
|
16
16
|
from typing import Optional, Union
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
import mlrun_pipelines.common.models
|
|
19
|
+
import mlrun_pipelines.models
|
|
19
20
|
|
|
20
21
|
import mlrun
|
|
21
22
|
import mlrun.common.constants as mlrun_constants
|
|
23
|
+
import mlrun.common.schemas.function
|
|
24
|
+
import mlrun.common.schemas.workflow
|
|
22
25
|
from mlrun.utils import hub_prefix
|
|
23
26
|
|
|
24
27
|
from .pipelines import enrich_function_object, pipeline_context
|
|
@@ -49,7 +52,7 @@ def _get_engine_and_function(function, project=None):
|
|
|
49
52
|
function = enrich_function_object(project, function, copy_function=False)
|
|
50
53
|
|
|
51
54
|
if not pipeline_context.workflow:
|
|
52
|
-
return
|
|
55
|
+
return mlrun.common.schemas.workflow.EngineType.LOCAL, function
|
|
53
56
|
|
|
54
57
|
return pipeline_context.workflow.engine, function
|
|
55
58
|
|
|
@@ -78,7 +81,7 @@ def run_function(
|
|
|
78
81
|
returns: Optional[list[Union[str, dict[str, str]]]] = None,
|
|
79
82
|
builder_env: Optional[list] = None,
|
|
80
83
|
reset_on_run: Optional[bool] = None,
|
|
81
|
-
) -> Union[mlrun.model.RunObject, PipelineNodeWrapper]:
|
|
84
|
+
) -> Union[mlrun.model.RunObject, mlrun_pipelines.models.PipelineNodeWrapper]:
|
|
82
85
|
"""Run a local or remote task as part of a local/kubeflow pipeline
|
|
83
86
|
|
|
84
87
|
run_function() allow you to execute a function locally, on a remote cluster, or as part of an automated workflow
|
|
@@ -186,7 +189,11 @@ def run_function(
|
|
|
186
189
|
)
|
|
187
190
|
task.spec.verbose = task.spec.verbose or verbose
|
|
188
191
|
|
|
189
|
-
if engine ==
|
|
192
|
+
if engine == mlrun.common.schemas.workflow.EngineType.KFP:
|
|
193
|
+
if schedule:
|
|
194
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
195
|
+
"Scheduling jobs is not supported when running a workflow with the kfp engine."
|
|
196
|
+
)
|
|
190
197
|
return function.as_step(
|
|
191
198
|
name=name, runspec=task, workdir=workdir, outputs=outputs, labels=labels
|
|
192
199
|
)
|
|
@@ -262,7 +269,7 @@ def build_function(
|
|
|
262
269
|
overwrite_build_params: bool = False,
|
|
263
270
|
extra_args: str = None,
|
|
264
271
|
force_build: bool = False,
|
|
265
|
-
) -> Union[BuildStatus, PipelineNodeWrapper]:
|
|
272
|
+
) -> Union[BuildStatus, mlrun_pipelines.models.PipelineNodeWrapper]:
|
|
266
273
|
"""deploy ML function, build container with its dependencies
|
|
267
274
|
|
|
268
275
|
:param function: Name of the function (in the project) or function object
|
|
@@ -298,7 +305,7 @@ def build_function(
|
|
|
298
305
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
299
306
|
"Cannot build use deploy_function()"
|
|
300
307
|
)
|
|
301
|
-
if engine ==
|
|
308
|
+
if engine == mlrun.common.schemas.workflow.EngineType.KFP:
|
|
302
309
|
if overwrite_build_params:
|
|
303
310
|
function.spec.build.commands = None
|
|
304
311
|
if requirements or requirements_file:
|
|
@@ -330,6 +337,7 @@ def build_function(
|
|
|
330
337
|
commands=commands,
|
|
331
338
|
secret=secret_name,
|
|
332
339
|
requirements=requirements,
|
|
340
|
+
requirements_file=requirements_file,
|
|
333
341
|
overwrite=overwrite_build_params,
|
|
334
342
|
extra_args=extra_args,
|
|
335
343
|
)
|
|
@@ -370,7 +378,7 @@ def deploy_function(
|
|
|
370
378
|
builder_env: dict = None,
|
|
371
379
|
project_object=None,
|
|
372
380
|
mock: bool = None,
|
|
373
|
-
) -> Union[DeployStatus, PipelineNodeWrapper]:
|
|
381
|
+
) -> Union[DeployStatus, mlrun_pipelines.models.PipelineNodeWrapper]:
|
|
374
382
|
"""deploy real-time (nuclio based) functions
|
|
375
383
|
|
|
376
384
|
:param function: name of the function (in the project) or function object
|
|
@@ -387,7 +395,7 @@ def deploy_function(
|
|
|
387
395
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
388
396
|
"deploy is used with real-time functions, for other kinds use build_function()"
|
|
389
397
|
)
|
|
390
|
-
if engine ==
|
|
398
|
+
if engine == mlrun.common.schemas.workflow.EngineType.KFP:
|
|
391
399
|
return function.deploy_step(models=models, env=env, tag=tag, verbose=verbose)
|
|
392
400
|
else:
|
|
393
401
|
if env:
|